1 /*
2 * jdsample-neon.c - upsampling (Arm Neon)
3 *
4 * Copyright (C) 2020, Arm Limited. All Rights Reserved.
5 * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
6 *
7 * This software is provided 'as-is', without any express or implied
8 * warranty. In no event will the authors be held liable for any damages
9 * arising from the use of this software.
10 *
11 * Permission is granted to anyone to use this software for any purpose,
12 * including commercial applications, and to alter it and redistribute it
13 * freely, subject to the following restrictions:
14 *
15 * 1. The origin of this software must not be misrepresented; you must not
16 * claim that you wrote the original software. If you use this software
17 * in a product, an acknowledgment in the product documentation would be
18 * appreciated but is not required.
19 * 2. Altered source versions must be plainly marked as such, and must not be
20 * misrepresented as being the original software.
21 * 3. This notice may not be removed or altered from any source distribution.
22 */
23
24 #define JPEG_INTERNALS
25 #include "../../jinclude.h"
26 #include "../../jpeglib.h"
27 #include "../../jsimd.h"
28 #include "../../jdct.h"
29 #include "../../jsimddct.h"
30 #include "../jsimd.h"
31
32 #include <arm_neon.h>
33
34
35 /* The diagram below shows a row of samples produced by h2v1 downsampling.
36 *
37 * s0 s1 s2
38 * +---------+---------+---------+
39 * | | | |
40 * | p0 p1 | p2 p3 | p4 p5 |
41 * | | | |
42 * +---------+---------+---------+
43 *
44 * Samples s0-s2 were created by averaging the original pixel component values
45 * centered at positions p0-p5 above. To approximate those original pixel
46 * component values, we proportionally blend the adjacent samples in each row.
47 *
48 * An upsampled pixel component value is computed by blending the sample
49 * containing the pixel center with the nearest neighboring sample, in the
50 * ratio 3:1. For example:
51 * p1(upsampled) = 3/4 * s0 + 1/4 * s1
52 * p2(upsampled) = 3/4 * s1 + 1/4 * s0
53 * When computing the first and last pixel component values in the row, there
54 * is no adjacent sample to blend, so:
55 * p0(upsampled) = s0
56 * p5(upsampled) = s2
57 */
58
/* Upsample a row of samples horizontally by a factor of 2 using linear
 * ("fancy") interpolation, per the scheme described above.
 *
 * max_v_samp_factor  number of rows to process
 * downsampled_width  width of each input row, in samples
 * input_data         input sample rows
 * output_data_ptr    pointer to the output sample rows (each row holds
 *                    2 * downsampled_width samples)
 */

void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
                                    JDIMENSION downsampled_width,
                                    JSAMPARRAY input_data,
                                    JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr, outptr;
  int inrow;
  unsigned colctr;
  /* Set up constants. */
  const uint16x8_t one_u16 = vdupq_n_u16(1);
  const uint8x8_t three_u8 = vdup_n_u8(3);

  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
    inptr = input_data[inrow];
    outptr = output_data[inrow];
    /* First pixel component value in this row of the original image */
    *outptr = (JSAMPLE)GETJSAMPLE(*inptr);

    /*  3/4 * containing sample + 1/4 * nearest neighboring sample
     * For p1: containing sample = s0, nearest neighboring sample = s1
     * For p2: containing sample = s1, nearest neighboring sample = s0
     */
    uint8x16_t s0 = vld1q_u8(inptr);
    uint8x16_t s1 = vld1q_u8(inptr + 1);
    /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
     * denote low half and high half respectively.
     */
    uint16x8_t s1_add_3s0_l =
      vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
    uint16x8_t s1_add_3s0_h =
      vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
    uint16x8_t s0_add_3s1_l =
      vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
    uint16x8_t s0_add_3s1_h =
      vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
    /* Add ordered dithering bias to odd pixel values.  (The even pixels get
     * their bias from the rounding right shift below.)
     */
    s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
    s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);

    /* The offset is initially 1, because the first pixel component has already
     * been stored.  However, in subsequent iterations of the SIMD loop, this
     * offset is (2 * colctr - 1) to stay within the bounds of the sample
     * buffers without having to resort to a slow scalar tail case for the last
     * (downsampled_width % 16) samples.  See "Creation of 2-D sample arrays"
     * in jmemmgr.c for more details.
     */
    unsigned outptr_offset = 1;
    uint8x16x2_t output_pixels;

    /* We use software pipelining to maximise performance.  The code indented
     * an extra two spaces begins the next iteration of the loop.
     */
    for (colctr = 16; colctr < downsampled_width; colctr += 16) {

        s0 = vld1q_u8(inptr + colctr - 1);
        s1 = vld1q_u8(inptr + colctr);

      /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
      output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
                                         vrshrn_n_u16(s1_add_3s0_h, 2));
      output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
                                         vshrn_n_u16(s0_add_3s1_h, 2));

        /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
         * denote low half and high half respectively.
         */
        s1_add_3s0_l =
          vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
        s1_add_3s0_h =
          vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
        s0_add_3s1_l =
          vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
        s0_add_3s1_h =
          vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
        /* Add ordered dithering bias to odd pixel values. */
        s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
        s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);

      /* Store pixel component values to memory. */
      vst2q_u8(outptr + outptr_offset, output_pixels);
      outptr_offset = 2 * colctr - 1;
    }

    /* Complete the last iteration of the loop. */

    /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
    output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
                                       vrshrn_n_u16(s1_add_3s0_h, 2));
    output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
                                       vshrn_n_u16(s0_add_3s1_h, 2));
    /* Store pixel component values to memory. */
    vst2q_u8(outptr + outptr_offset, output_pixels);

    /* Last pixel component value in this row of the original image */
    outptr[2 * downsampled_width - 1] =
      GETJSAMPLE(inptr[downsampled_width - 1]);
  }
}
158
159
160 /* The diagram below shows an array of samples produced by h2v2 downsampling.
161 *
162 * s0 s1 s2
163 * +---------+---------+---------+
164 * | p0 p1 | p2 p3 | p4 p5 |
165 * sA | | | |
166 * | p6 p7 | p8 p9 | p10 p11|
167 * +---------+---------+---------+
168 * | p12 p13| p14 p15| p16 p17|
169 * sB | | | |
170 * | p18 p19| p20 p21| p22 p23|
171 * +---------+---------+---------+
172 * | p24 p25| p26 p27| p28 p29|
173 * sC | | | |
174 * | p30 p31| p32 p33| p34 p35|
175 * +---------+---------+---------+
176 *
177 * Samples s0A-s2C were created by averaging the original pixel component
178 * values centered at positions p0-p35 above. To approximate one of those
179 * original pixel component values, we proportionally blend the sample
180 * containing the pixel center with the nearest neighboring samples in each
181 * row, column, and diagonal.
182 *
183 * An upsampled pixel component value is computed by first blending the sample
184 * containing the pixel center with the nearest neighboring samples in the
185 * same column, in the ratio 3:1, and then blending each column sum with the
186 * nearest neighboring column sum, in the ratio 3:1. For example:
187 * p14(upsampled) = 3/4 * (3/4 * s1B + 1/4 * s1A) +
188 * 1/4 * (3/4 * s0B + 1/4 * s0A)
189 * = 9/16 * s1B + 3/16 * s1A + 3/16 * s0B + 1/16 * s0A
190 * When computing the first and last pixel component values in the row, there
191 * is no horizontally adjacent sample to blend, so:
192 * p12(upsampled) = 3/4 * s0B + 1/4 * s0A
193 * p23(upsampled) = 3/4 * s2B + 1/4 * s2C
194 * When computing the first and last pixel component values in the column,
195 * there is no vertically adjacent sample to blend, so:
196 * p2(upsampled) = 3/4 * s1A + 1/4 * s0A
197 * p33(upsampled) = 3/4 * s1C + 1/4 * s2C
198 * When computing the corner pixel component values, there is no adjacent
199 * sample to blend, so:
200 * p0(upsampled) = s0A
201 * p35(upsampled) = s2C
202 */
203
/* Upsample a grid of samples both horizontally and vertically by a factor of
 * 2, using linear ("fancy") interpolation: each output pixel is a
 * 9:3:3:1-weighted blend of the containing sample and its vertical,
 * horizontal, and diagonal neighbors, per the scheme described above.
 *
 * max_v_samp_factor  number of output rows to produce
 * downsampled_width  width of each input row, in samples
 * input_data         input sample rows; rows at (inrow - 1) and (inrow + 1)
 *                    are read for vertical context (the caller is expected to
 *                    make those accesses valid at the image edges)
 * output_data_ptr    pointer to the output sample rows (each row holds
 *                    2 * downsampled_width samples; two output rows are
 *                    produced per input row)
 */

void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
                                    JDIMENSION downsampled_width,
                                    JSAMPARRAY input_data,
                                    JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
  int inrow, outrow;
  unsigned colctr;
  /* Set up constants. */
  const uint16x8_t seven_u16 = vdupq_n_u16(7);
  const uint8x8_t three_u8 = vdup_n_u8(3);
  const uint16x8_t three_u16 = vdupq_n_u16(3);

  inrow = outrow = 0;
  while (outrow < max_v_samp_factor) {
    inptr0 = input_data[inrow - 1];
    inptr1 = input_data[inrow];
    inptr2 = input_data[inrow + 1];
    /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
     * respectively.
     */
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];

    /* First pixel component value in this row of the original image */
    int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
    *outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
    int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
    *outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);

    /* Step 1: Blend samples vertically in columns s0 and s1.
     * Leave the divide by 4 until the end, when it can be done for both
     * dimensions at once, right-shifting by 4.
     */

    /* Load and compute s0colsum0 and s0colsum1. */
    uint8x16_t s0A = vld1q_u8(inptr0);
    uint8x16_t s0B = vld1q_u8(inptr1);
    uint8x16_t s0C = vld1q_u8(inptr2);
    /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
     * denote low half and high half respectively.
     */
    uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)),
                                      vget_low_u8(s0B), three_u8);
    uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)),
                                      vget_high_u8(s0B), three_u8);
    uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)),
                                      vget_low_u8(s0B), three_u8);
    uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)),
                                      vget_high_u8(s0B), three_u8);
    /* Load and compute s1colsum0 and s1colsum1. */
    uint8x16_t s1A = vld1q_u8(inptr0 + 1);
    uint8x16_t s1B = vld1q_u8(inptr1 + 1);
    uint8x16_t s1C = vld1q_u8(inptr2 + 1);
    uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)),
                                      vget_low_u8(s1B), three_u8);
    uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)),
                                      vget_high_u8(s1B), three_u8);
    uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)),
                                      vget_low_u8(s1B), three_u8);
    uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)),
                                      vget_high_u8(s1B), three_u8);

    /* Step 2: Blend the already-blended columns. */

    uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
    uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
    uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
    uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
    uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
    uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
    uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
    uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
    /* Add ordered dithering bias to odd pixel values.  (The even pixels get
     * their bias from the rounding right shift below.)
     */
    output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
    output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
    output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
    output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
    /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
    uint8x16x2_t output_pixels0 = { {
      vcombine_u8(vshrn_n_u16(output0_p1_l, 4), vshrn_n_u16(output0_p1_h, 4)),
      vcombine_u8(vrshrn_n_u16(output0_p2_l, 4), vrshrn_n_u16(output0_p2_h, 4))
    } };
    uint8x16x2_t output_pixels1 = { {
      vcombine_u8(vshrn_n_u16(output1_p1_l, 4), vshrn_n_u16(output1_p1_h, 4)),
      vcombine_u8(vrshrn_n_u16(output1_p2_l, 4), vrshrn_n_u16(output1_p2_h, 4))
    } };

    /* Store pixel component values to memory.
     * The minimum size of the output buffer for each row is 64 bytes => no
     * need to worry about buffer overflow here.  See "Creation of 2-D sample
     * arrays" in jmemmgr.c for more details.
     */
    vst2q_u8(outptr0 + 1, output_pixels0);
    vst2q_u8(outptr1 + 1, output_pixels1);

    /* The first pixel of the image shifted our loads and stores by one byte.
     * We have to re-align on a 32-byte boundary at some point before the end
     * of the row (we do it now on the 32/33 pixel boundary) to stay within the
     * bounds of the sample buffers without having to resort to a slow scalar
     * tail case for the last (downsampled_width % 16) samples.  See "Creation
     * of 2-D sample arrays" in jmemmgr.c for more details.
     */
    for (colctr = 16; colctr < downsampled_width; colctr += 16) {
      /* Step 1: Blend samples vertically in columns s0 and s1. */

      /* Load and compute s0colsum0 and s0colsum1. */
      s0A = vld1q_u8(inptr0 + colctr - 1);
      s0B = vld1q_u8(inptr1 + colctr - 1);
      s0C = vld1q_u8(inptr2 + colctr - 1);
      s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)), vget_low_u8(s0B),
                             three_u8);
      s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)), vget_high_u8(s0B),
                             three_u8);
      s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)), vget_low_u8(s0B),
                             three_u8);
      s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)), vget_high_u8(s0B),
                             three_u8);
      /* Load and compute s1colsum0 and s1colsum1. */
      s1A = vld1q_u8(inptr0 + colctr);
      s1B = vld1q_u8(inptr1 + colctr);
      s1C = vld1q_u8(inptr2 + colctr);
      s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)), vget_low_u8(s1B),
                             three_u8);
      s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)), vget_high_u8(s1B),
                             three_u8);
      s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)), vget_low_u8(s1B),
                             three_u8);
      s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)), vget_high_u8(s1B),
                             three_u8);

      /* Step 2: Blend the already-blended columns. */

      output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
      output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
      output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
      output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
      output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
      output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
      output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
      output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
      /* Add ordered dithering bias to odd pixel values. */
      output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
      output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
      output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
      output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
      /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
      output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
                                          vshrn_n_u16(output0_p1_h, 4));
      output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
                                          vrshrn_n_u16(output0_p2_h, 4));
      output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
                                          vshrn_n_u16(output1_p1_h, 4));
      output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
                                          vrshrn_n_u16(output1_p2_h, 4));
      /* Store pixel component values to memory. */
      vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0);
      vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1);
    }

    /* Last pixel component value in this row of the original image */
    int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
                    GETJSAMPLE(inptr0[downsampled_width - 1]);
    outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
    int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
                    GETJSAMPLE(inptr2[downsampled_width - 1]);
    outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
    inrow++;
  }
}
375
376
377 /* The diagram below shows a column of samples produced by h1v2 downsampling
378 * (or by losslessly rotating or transposing an h2v1-downsampled image.)
379 *
380 * +---------+
381 * | p0 |
382 * sA | |
383 * | p1 |
384 * +---------+
385 * | p2 |
386 * sB | |
387 * | p3 |
388 * +---------+
389 * | p4 |
390 * sC | |
391 * | p5 |
392 * +---------+
393 *
394 * Samples sA-sC were created by averaging the original pixel component values
395 * centered at positions p0-p5 above. To approximate those original pixel
396 * component values, we proportionally blend the adjacent samples in each
397 * column.
398 *
399 * An upsampled pixel component value is computed by blending the sample
400 * containing the pixel center with the nearest neighboring sample, in the
401 * ratio 3:1. For example:
402 * p1(upsampled) = 3/4 * sA + 1/4 * sB
403 * p2(upsampled) = 3/4 * sB + 1/4 * sA
404 * When computing the first and last pixel component values in the column,
405 * there is no adjacent sample to blend, so:
406 * p0(upsampled) = sA
407 * p5(upsampled) = sC
408 */
409
/* Upsample a column of samples vertically by a factor of 2 using linear
 * ("fancy") interpolation, per the scheme described above.  The row width is
 * unchanged; two output rows are produced per input row.
 *
 * max_v_samp_factor  number of output rows to produce
 * downsampled_width  width of each row, in samples
 * input_data         input sample rows; rows at (inrow - 1) and (inrow + 1)
 *                    are read for vertical context (the caller is expected to
 *                    make those accesses valid at the image edges)
 * output_data_ptr    pointer to the output sample rows
 */

void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
                                    JDIMENSION downsampled_width,
                                    JSAMPARRAY input_data,
                                    JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
  int inrow, outrow;
  unsigned colctr;
  /* Set up constants. */
  const uint16x8_t one_u16 = vdupq_n_u16(1);
  const uint8x8_t three_u8 = vdup_n_u8(3);

  inrow = outrow = 0;
  while (outrow < max_v_samp_factor) {
    inptr0 = input_data[inrow - 1];
    inptr1 = input_data[inrow];
    inptr2 = input_data[inrow + 1];
    /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
     * respectively.
     */
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];
    inrow++;

    /* The size of the input and output buffers is always a multiple of 32
     * bytes => no need to worry about buffer overflow when reading/writing
     * memory.  See "Creation of 2-D sample arrays" in jmemmgr.c for more
     * details.
     */
    for (colctr = 0; colctr < downsampled_width; colctr += 16) {
      /* Load samples. */
      uint8x16_t sA = vld1q_u8(inptr0 + colctr);
      uint8x16_t sB = vld1q_u8(inptr1 + colctr);
      uint8x16_t sC = vld1q_u8(inptr2 + colctr);
      /* Blend samples vertically. */
      uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(sA)),
                                      vget_low_u8(sB), three_u8);
      uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(sA)),
                                      vget_high_u8(sB), three_u8);
      uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(sC)),
                                      vget_low_u8(sB), three_u8);
      uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(sC)),
                                      vget_high_u8(sB), three_u8);
      /* Add ordered dithering bias to pixel values in even output rows.
       * (Odd output rows get their bias from the rounding right shift below.)
       */
      colsum0_l = vaddq_u16(colsum0_l, one_u16);
      colsum0_h = vaddq_u16(colsum0_h, one_u16);
      /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
      uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2),
                                              vshrn_n_u16(colsum0_h, 2));
      uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2),
                                              vrshrn_n_u16(colsum1_h, 2));
      /* Store pixel component values to memory. */
      vst1q_u8(outptr0 + colctr, output_pixels0);
      vst1q_u8(outptr1 + colctr, output_pixels1);
    }
  }
}
468
469
470 /* The diagram below shows a row of samples produced by h2v1 downsampling.
471 *
472 * s0 s1
473 * +---------+---------+
474 * | | |
475 * | p0 p1 | p2 p3 |
476 * | | |
477 * +---------+---------+
478 *
479 * Samples s0 and s1 were created by averaging the original pixel component
480 * values centered at positions p0-p3 above. To approximate those original
481 * pixel component values, we duplicate the samples horizontally:
482 * p0(upsampled) = p1(upsampled) = s0
483 * p2(upsampled) = p3(upsampled) = s1
484 */
485
/* Upsample a row of samples horizontally by a factor of 2 by simple
 * duplication, per the scheme described above.
 *
 * max_v_samp_factor  number of rows to process
 * output_width       width of each output row, in samples
 * input_data         input sample rows
 * output_data_ptr    pointer to the output sample rows
 */

void jsimd_h2v1_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
                              JSAMPARRAY input_data,
                              JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr, outptr;
  int inrow;
  unsigned colctr;

  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
    inptr = input_data[inrow];
    outptr = output_data[inrow];
    for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
      uint8x16_t samples = vld1q_u8(inptr + colctr);
      /* Duplicate the samples.  The store operation below interleaves them so
       * that adjacent pixel component values take on the same sample value,
       * per above.
       */
      uint8x16x2_t output_pixels = { { samples, samples } };
      /* Store pixel component values to memory.
       * Due to the way sample buffers are allocated, we don't need to worry
       * about tail cases when output_width is not a multiple of 32.  See
       * "Creation of 2-D sample arrays" in jmemmgr.c for details.
       */
      vst2q_u8(outptr + 2 * colctr, output_pixels);
    }
  }
}
514
515
516 /* The diagram below shows an array of samples produced by h2v2 downsampling.
517 *
518 * s0 s1
519 * +---------+---------+
520 * | p0 p1 | p2 p3 |
521 * sA | | |
522 * | p4 p5 | p6 p7 |
523 * +---------+---------+
524 * | p8 p9 | p10 p11|
525 * sB | | |
526 * | p12 p13| p14 p15|
527 * +---------+---------+
528 *
529 * Samples s0A-s1B were created by averaging the original pixel component
530 * values centered at positions p0-p15 above. To approximate those original
531 * pixel component values, we duplicate the samples both horizontally and
532 * vertically:
533 * p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A
534 * p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A
535 * p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B
536 * p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B
537 */
538
/* Upsample a grid of samples both horizontally and vertically by a factor of
 * 2 by simple duplication, per the scheme described above.  Two identical
 * output rows are produced per input row.
 *
 * max_v_samp_factor  number of output rows to produce
 * output_width       width of each output row, in samples
 * input_data         input sample rows
 * output_data_ptr    pointer to the output sample rows
 */

void jsimd_h2v2_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
                              JSAMPARRAY input_data,
                              JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr, outptr0, outptr1;
  int inrow, outrow;
  unsigned colctr;

  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
    inptr = input_data[inrow];
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];

    for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
      uint8x16_t samples = vld1q_u8(inptr + colctr);
      /* Duplicate the samples.  The store operation below interleaves them so
       * that adjacent pixel component values take on the same sample value,
       * per above.
       */
      uint8x16x2_t output_pixels = { { samples, samples } };
      /* Store pixel component values for both output rows to memory.
       * Due to the way sample buffers are allocated, we don't need to worry
       * about tail cases when output_width is not a multiple of 32.  See
       * "Creation of 2-D sample arrays" in jmemmgr.c for details.
       */
      vst2q_u8(outptr0 + 2 * colctr, output_pixels);
      vst2q_u8(outptr1 + 2 * colctr, output_pixels);
    }
  }
}
570