xref: /aosp_15_r20/external/libyuv/source/rotate_win.cc (revision 4e366538070a3a6c5c163c31b791eab742e1657a)
/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10*4e366538SXin Li 
11*4e366538SXin Li #include "libyuv/rotate_row.h"
12*4e366538SXin Li #include "libyuv/row.h"
13*4e366538SXin Li 
14*4e366538SXin Li #ifdef __cplusplus
15*4e366538SXin Li namespace libyuv {
16*4e366538SXin Li extern "C" {
17*4e366538SXin Li #endif
18*4e366538SXin Li 
19*4e366538SXin Li // This module is for 32 bit Visual C x86
20*4e366538SXin Li #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
21*4e366538SXin Li     !defined(__clang__) && defined(_M_IX86)
22*4e366538SXin Li 
TransposeWx8_SSSE3(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int width)23*4e366538SXin Li __declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src,
24*4e366538SXin Li                                           int src_stride,
25*4e366538SXin Li                                           uint8_t* dst,
26*4e366538SXin Li                                           int dst_stride,
27*4e366538SXin Li                                           int width) {
28*4e366538SXin Li   __asm {
29*4e366538SXin Li     push      edi
30*4e366538SXin Li     push      esi
31*4e366538SXin Li     push      ebp
32*4e366538SXin Li     mov       eax, [esp + 12 + 4]  // src
33*4e366538SXin Li     mov       edi, [esp + 12 + 8]  // src_stride
34*4e366538SXin Li     mov       edx, [esp + 12 + 12]  // dst
35*4e366538SXin Li     mov       esi, [esp + 12 + 16]  // dst_stride
36*4e366538SXin Li     mov       ecx, [esp + 12 + 20]  // width
37*4e366538SXin Li 
38*4e366538SXin Li     // Read in the data from the source pointer.
39*4e366538SXin Li     // First round of bit swap.
40*4e366538SXin Li     align      4
41*4e366538SXin Li  convertloop:
42*4e366538SXin Li     movq      xmm0, qword ptr [eax]
43*4e366538SXin Li     lea       ebp, [eax + 8]
44*4e366538SXin Li     movq      xmm1, qword ptr [eax + edi]
45*4e366538SXin Li     lea       eax, [eax + 2 * edi]
46*4e366538SXin Li     punpcklbw xmm0, xmm1
47*4e366538SXin Li     movq      xmm2, qword ptr [eax]
48*4e366538SXin Li     movdqa    xmm1, xmm0
49*4e366538SXin Li     palignr   xmm1, xmm1, 8
50*4e366538SXin Li     movq      xmm3, qword ptr [eax + edi]
51*4e366538SXin Li     lea       eax, [eax + 2 * edi]
52*4e366538SXin Li     punpcklbw xmm2, xmm3
53*4e366538SXin Li     movdqa    xmm3, xmm2
54*4e366538SXin Li     movq      xmm4, qword ptr [eax]
55*4e366538SXin Li     palignr   xmm3, xmm3, 8
56*4e366538SXin Li     movq      xmm5, qword ptr [eax + edi]
57*4e366538SXin Li     punpcklbw xmm4, xmm5
58*4e366538SXin Li     lea       eax, [eax + 2 * edi]
59*4e366538SXin Li     movdqa    xmm5, xmm4
60*4e366538SXin Li     movq      xmm6, qword ptr [eax]
61*4e366538SXin Li     palignr   xmm5, xmm5, 8
62*4e366538SXin Li     movq      xmm7, qword ptr [eax + edi]
63*4e366538SXin Li     punpcklbw xmm6, xmm7
64*4e366538SXin Li     mov       eax, ebp
65*4e366538SXin Li     movdqa    xmm7, xmm6
66*4e366538SXin Li     palignr   xmm7, xmm7, 8
67*4e366538SXin Li     // Second round of bit swap.
68*4e366538SXin Li     punpcklwd xmm0, xmm2
69*4e366538SXin Li     punpcklwd xmm1, xmm3
70*4e366538SXin Li     movdqa    xmm2, xmm0
71*4e366538SXin Li     movdqa    xmm3, xmm1
72*4e366538SXin Li     palignr   xmm2, xmm2, 8
73*4e366538SXin Li     palignr   xmm3, xmm3, 8
74*4e366538SXin Li     punpcklwd xmm4, xmm6
75*4e366538SXin Li     punpcklwd xmm5, xmm7
76*4e366538SXin Li     movdqa    xmm6, xmm4
77*4e366538SXin Li     movdqa    xmm7, xmm5
78*4e366538SXin Li     palignr   xmm6, xmm6, 8
79*4e366538SXin Li     palignr   xmm7, xmm7, 8
80*4e366538SXin Li     // Third round of bit swap.
81*4e366538SXin Li     // Write to the destination pointer.
82*4e366538SXin Li     punpckldq xmm0, xmm4
83*4e366538SXin Li     movq      qword ptr [edx], xmm0
84*4e366538SXin Li     movdqa    xmm4, xmm0
85*4e366538SXin Li     palignr   xmm4, xmm4, 8
86*4e366538SXin Li     movq      qword ptr [edx + esi], xmm4
87*4e366538SXin Li     lea       edx, [edx + 2 * esi]
88*4e366538SXin Li     punpckldq xmm2, xmm6
89*4e366538SXin Li     movdqa    xmm6, xmm2
90*4e366538SXin Li     palignr   xmm6, xmm6, 8
91*4e366538SXin Li     movq      qword ptr [edx], xmm2
92*4e366538SXin Li     punpckldq xmm1, xmm5
93*4e366538SXin Li     movq      qword ptr [edx + esi], xmm6
94*4e366538SXin Li     lea       edx, [edx + 2 * esi]
95*4e366538SXin Li     movdqa    xmm5, xmm1
96*4e366538SXin Li     movq      qword ptr [edx], xmm1
97*4e366538SXin Li     palignr   xmm5, xmm5, 8
98*4e366538SXin Li     punpckldq xmm3, xmm7
99*4e366538SXin Li     movq      qword ptr [edx + esi], xmm5
100*4e366538SXin Li     lea       edx, [edx + 2 * esi]
101*4e366538SXin Li     movq      qword ptr [edx], xmm3
102*4e366538SXin Li     movdqa    xmm7, xmm3
103*4e366538SXin Li     palignr   xmm7, xmm7, 8
104*4e366538SXin Li     sub       ecx, 8
105*4e366538SXin Li     movq      qword ptr [edx + esi], xmm7
106*4e366538SXin Li     lea       edx, [edx + 2 * esi]
107*4e366538SXin Li     jg        convertloop
108*4e366538SXin Li 
109*4e366538SXin Li     pop       ebp
110*4e366538SXin Li     pop       esi
111*4e366538SXin Li     pop       edi
112*4e366538SXin Li     ret
113*4e366538SXin Li   }
114*4e366538SXin Li }
115*4e366538SXin Li 
TransposeUVWx8_SSE2(const uint8_t * src,int src_stride,uint8_t * dst_a,int dst_stride_a,uint8_t * dst_b,int dst_stride_b,int w)116*4e366538SXin Li __declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src,
117*4e366538SXin Li                                            int src_stride,
118*4e366538SXin Li                                            uint8_t* dst_a,
119*4e366538SXin Li                                            int dst_stride_a,
120*4e366538SXin Li                                            uint8_t* dst_b,
121*4e366538SXin Li                                            int dst_stride_b,
122*4e366538SXin Li                                            int w) {
123*4e366538SXin Li   __asm {
124*4e366538SXin Li     push      ebx
125*4e366538SXin Li     push      esi
126*4e366538SXin Li     push      edi
127*4e366538SXin Li     push      ebp
128*4e366538SXin Li     mov       eax, [esp + 16 + 4]  // src
129*4e366538SXin Li     mov       edi, [esp + 16 + 8]  // src_stride
130*4e366538SXin Li     mov       edx, [esp + 16 + 12]  // dst_a
131*4e366538SXin Li     mov       esi, [esp + 16 + 16]  // dst_stride_a
132*4e366538SXin Li     mov       ebx, [esp + 16 + 20]  // dst_b
133*4e366538SXin Li     mov       ebp, [esp + 16 + 24]  // dst_stride_b
134*4e366538SXin Li     mov       ecx, esp
135*4e366538SXin Li     sub       esp, 4 + 16
136*4e366538SXin Li     and       esp, ~15
137*4e366538SXin Li     mov       [esp + 16], ecx
138*4e366538SXin Li     mov       ecx, [ecx + 16 + 28]  // w
139*4e366538SXin Li 
140*4e366538SXin Li     align      4
141*4e366538SXin Li     // Read in the data from the source pointer.
142*4e366538SXin Li     // First round of bit swap.
143*4e366538SXin Li   convertloop:
144*4e366538SXin Li     movdqu    xmm0, [eax]
145*4e366538SXin Li     movdqu    xmm1, [eax + edi]
146*4e366538SXin Li     lea       eax, [eax + 2 * edi]
147*4e366538SXin Li     movdqa    xmm7, xmm0  // use xmm7 as temp register.
148*4e366538SXin Li     punpcklbw xmm0, xmm1
149*4e366538SXin Li     punpckhbw xmm7, xmm1
150*4e366538SXin Li     movdqa    xmm1, xmm7
151*4e366538SXin Li     movdqu    xmm2, [eax]
152*4e366538SXin Li     movdqu    xmm3, [eax + edi]
153*4e366538SXin Li     lea       eax, [eax + 2 * edi]
154*4e366538SXin Li     movdqa    xmm7, xmm2
155*4e366538SXin Li     punpcklbw xmm2, xmm3
156*4e366538SXin Li     punpckhbw xmm7, xmm3
157*4e366538SXin Li     movdqa    xmm3, xmm7
158*4e366538SXin Li     movdqu    xmm4, [eax]
159*4e366538SXin Li     movdqu    xmm5, [eax + edi]
160*4e366538SXin Li     lea       eax, [eax + 2 * edi]
161*4e366538SXin Li     movdqa    xmm7, xmm4
162*4e366538SXin Li     punpcklbw xmm4, xmm5
163*4e366538SXin Li     punpckhbw xmm7, xmm5
164*4e366538SXin Li     movdqa    xmm5, xmm7
165*4e366538SXin Li     movdqu    xmm6, [eax]
166*4e366538SXin Li     movdqu    xmm7, [eax + edi]
167*4e366538SXin Li     lea       eax, [eax + 2 * edi]
168*4e366538SXin Li     movdqu    [esp], xmm5  // backup xmm5
169*4e366538SXin Li     neg       edi
170*4e366538SXin Li     movdqa    xmm5, xmm6  // use xmm5 as temp register.
171*4e366538SXin Li     punpcklbw xmm6, xmm7
172*4e366538SXin Li     punpckhbw xmm5, xmm7
173*4e366538SXin Li     movdqa    xmm7, xmm5
174*4e366538SXin Li     lea       eax, [eax + 8 * edi + 16]
175*4e366538SXin Li     neg       edi
176*4e366538SXin Li         // Second round of bit swap.
177*4e366538SXin Li     movdqa    xmm5, xmm0
178*4e366538SXin Li     punpcklwd xmm0, xmm2
179*4e366538SXin Li     punpckhwd xmm5, xmm2
180*4e366538SXin Li     movdqa    xmm2, xmm5
181*4e366538SXin Li     movdqa    xmm5, xmm1
182*4e366538SXin Li     punpcklwd xmm1, xmm3
183*4e366538SXin Li     punpckhwd xmm5, xmm3
184*4e366538SXin Li     movdqa    xmm3, xmm5
185*4e366538SXin Li     movdqa    xmm5, xmm4
186*4e366538SXin Li     punpcklwd xmm4, xmm6
187*4e366538SXin Li     punpckhwd xmm5, xmm6
188*4e366538SXin Li     movdqa    xmm6, xmm5
189*4e366538SXin Li     movdqu    xmm5, [esp]  // restore xmm5
190*4e366538SXin Li     movdqu    [esp], xmm6  // backup xmm6
191*4e366538SXin Li     movdqa    xmm6, xmm5  // use xmm6 as temp register.
192*4e366538SXin Li     punpcklwd xmm5, xmm7
193*4e366538SXin Li     punpckhwd xmm6, xmm7
194*4e366538SXin Li     movdqa    xmm7, xmm6
195*4e366538SXin Li 
196*4e366538SXin Li         // Third round of bit swap.
197*4e366538SXin Li         // Write to the destination pointer.
198*4e366538SXin Li     movdqa    xmm6, xmm0
199*4e366538SXin Li     punpckldq xmm0, xmm4
200*4e366538SXin Li     punpckhdq xmm6, xmm4
201*4e366538SXin Li     movdqa    xmm4, xmm6
202*4e366538SXin Li     movdqu    xmm6, [esp]  // restore xmm6
203*4e366538SXin Li     movlpd    qword ptr [edx], xmm0
204*4e366538SXin Li     movhpd    qword ptr [ebx], xmm0
205*4e366538SXin Li     movlpd    qword ptr [edx + esi], xmm4
206*4e366538SXin Li     lea       edx, [edx + 2 * esi]
207*4e366538SXin Li     movhpd    qword ptr [ebx + ebp], xmm4
208*4e366538SXin Li     lea       ebx, [ebx + 2 * ebp]
209*4e366538SXin Li     movdqa    xmm0, xmm2  // use xmm0 as the temp register.
210*4e366538SXin Li     punpckldq xmm2, xmm6
211*4e366538SXin Li     movlpd    qword ptr [edx], xmm2
212*4e366538SXin Li     movhpd    qword ptr [ebx], xmm2
213*4e366538SXin Li     punpckhdq xmm0, xmm6
214*4e366538SXin Li     movlpd    qword ptr [edx + esi], xmm0
215*4e366538SXin Li     lea       edx, [edx + 2 * esi]
216*4e366538SXin Li     movhpd    qword ptr [ebx + ebp], xmm0
217*4e366538SXin Li     lea       ebx, [ebx + 2 * ebp]
218*4e366538SXin Li     movdqa    xmm0, xmm1  // use xmm0 as the temp register.
219*4e366538SXin Li     punpckldq xmm1, xmm5
220*4e366538SXin Li     movlpd    qword ptr [edx], xmm1
221*4e366538SXin Li     movhpd    qword ptr [ebx], xmm1
222*4e366538SXin Li     punpckhdq xmm0, xmm5
223*4e366538SXin Li     movlpd    qword ptr [edx + esi], xmm0
224*4e366538SXin Li     lea       edx, [edx + 2 * esi]
225*4e366538SXin Li     movhpd    qword ptr [ebx + ebp], xmm0
226*4e366538SXin Li     lea       ebx, [ebx + 2 * ebp]
227*4e366538SXin Li     movdqa    xmm0, xmm3  // use xmm0 as the temp register.
228*4e366538SXin Li     punpckldq xmm3, xmm7
229*4e366538SXin Li     movlpd    qword ptr [edx], xmm3
230*4e366538SXin Li     movhpd    qword ptr [ebx], xmm3
231*4e366538SXin Li     punpckhdq xmm0, xmm7
232*4e366538SXin Li     sub       ecx, 8
233*4e366538SXin Li     movlpd    qword ptr [edx + esi], xmm0
234*4e366538SXin Li     lea       edx, [edx + 2 * esi]
235*4e366538SXin Li     movhpd    qword ptr [ebx + ebp], xmm0
236*4e366538SXin Li     lea       ebx, [ebx + 2 * ebp]
237*4e366538SXin Li     jg        convertloop
238*4e366538SXin Li 
239*4e366538SXin Li     mov       esp, [esp + 16]
240*4e366538SXin Li     pop       ebp
241*4e366538SXin Li     pop       edi
242*4e366538SXin Li     pop       esi
243*4e366538SXin Li     pop       ebx
244*4e366538SXin Li     ret
245*4e366538SXin Li   }
246*4e366538SXin Li }
247*4e366538SXin Li 
248*4e366538SXin Li #endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
249*4e366538SXin Li 
250*4e366538SXin Li #ifdef __cplusplus
251*4e366538SXin Li }  // extern "C"
252*4e366538SXin Li }  // namespace libyuv
253*4e366538SXin Li #endif
254