/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/rotate_row.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32-bit Visual C++ x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
    !defined(__clang__) && defined(_M_IX86)

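// Transposes an 8x8 block of bytes per loop iteration: 8 bytes are read from
// each of 8 source rows, interleaved through three rounds of punpck*/palignr
// shuffles, and written back as 8 bytes to each of 8 destination rows.
// palignr xmmN, xmmN, 8 rotates the high qword into the low half so it can
// be stored with movq. src advances 8 columns per iteration until width is
// consumed.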
__declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src,
                                          int src_stride,
                                          uint8_t* dst,
                                          int dst_stride,
                                          int width) {
  __asm {
    push      edi
    push      esi
    push      ebp
    mov       eax, [esp + 12 + 4]  // src
    mov       edi, [esp + 12 + 8]  // src_stride
    mov       edx, [esp + 12 + 12]  // dst
    mov       esi, [esp + 12 + 16]  // dst_stride
    mov       ecx, [esp + 12 + 20]  // width

    // Read in the data from the source pointer.
    // First round of bit swap.
    align      4
  convertloop:
    movq      xmm0, qword ptr [eax]
    lea       ebp, [eax + 8]  // remember start of the next 8 columns.
    movq      xmm1, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm0, xmm1
    movq      xmm2, qword ptr [eax]
    movdqa    xmm1, xmm0
    palignr   xmm1, xmm1, 8  // rotate high qword into low half.
    movq      xmm3, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm2, xmm3
    movdqa    xmm3, xmm2
    movq      xmm4, qword ptr [eax]
    palignr   xmm3, xmm3, 8
    movq      xmm5, qword ptr [eax + edi]
    punpcklbw xmm4, xmm5
    lea       eax, [eax + 2 * edi]
    movdqa    xmm5, xmm4
    movq      xmm6, qword ptr [eax]
    palignr   xmm5, xmm5, 8
    movq      xmm7, qword ptr [eax + edi]
    punpcklbw xmm6, xmm7
    mov       eax, ebp  // advance src to the next 8 columns.
    movdqa    xmm7, xmm6
    palignr   xmm7, xmm7, 8
    // Second round of bit swap.
    punpcklwd xmm0, xmm2
    punpcklwd xmm1, xmm3
    movdqa    xmm2, xmm0
    movdqa    xmm3, xmm1
    palignr   xmm2, xmm2, 8
    palignr   xmm3, xmm3, 8
    punpcklwd xmm4, xmm6
    punpcklwd xmm5, xmm7
    movdqa    xmm6, xmm4
    movdqa    xmm7, xmm5
    palignr   xmm6, xmm6, 8
    palignr   xmm7, xmm7, 8
    // Third round of bit swap.
    // Write to the destination pointer.
    punpckldq xmm0, xmm4
    movq      qword ptr [edx], xmm0
    movdqa    xmm4, xmm0
    palignr   xmm4, xmm4, 8
    movq      qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    punpckldq xmm2, xmm6
    movdqa    xmm6, xmm2
    palignr   xmm6, xmm6, 8
    movq      qword ptr [edx], xmm2
    punpckldq xmm1, xmm5
    movq      qword ptr [edx + esi], xmm6
    lea       edx, [edx + 2 * esi]
    movdqa    xmm5, xmm1
    movq      qword ptr [edx], xmm1
    palignr   xmm5, xmm5, 8
    punpckldq xmm3, xmm7
    movq      qword ptr [edx + esi], xmm5
    lea       edx, [edx + 2 * esi]
    movq      qword ptr [edx], xmm3
    movdqa    xmm7, xmm3
    palignr   xmm7, xmm7, 8
    sub       ecx, 8
    movq      qword ptr [edx + esi], xmm7
    lea       edx, [edx + 2 * esi]
    jg        convertloop

    pop       ebp
    pop       esi
    pop       edi
    ret
  }
}

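// Transposes an 8x8 block of UV pairs per loop iteration: 16 interleaved UV
// bytes are read from each of 8 source rows, and after three rounds of
// shuffles the low qword of each result row (the transposed U samples) is
// written to dst_a with movlpd while the high qword (the transposed V
// samples) is written to dst_b with movhpd.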
__declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src,
                                           int src_stride,
                                           uint8_t* dst_a,
                                           int dst_stride_a,
                                           uint8_t* dst_b,
                                           int dst_stride_b,
                                           int w) {
  __asm {
    push      ebx
    push      esi
    push      edi
    push      ebp
    mov       eax, [esp + 16 + 4]  // src
    mov       edi, [esp + 16 + 8]  // src_stride
    mov       edx, [esp + 16 + 12]  // dst_a
    mov       esi, [esp + 16 + 16]  // dst_stride_a
    mov       ebx, [esp + 16 + 20]  // dst_b
    mov       ebp, [esp + 16 + 24]  // dst_stride_b
    // Align esp to 16 bytes and reserve a 16 byte scratch slot at [esp];
    // the caller's esp is saved at [esp + 16] and restored before returning.
    mov       ecx, esp
    sub       esp, 4 + 16
    and       esp, ~15
    mov       [esp + 16], ecx
    mov       ecx, [ecx + 16 + 28]  // w

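    // Only xmm0-xmm7 are available in 32-bit code, so the scratch slot at
    // [esp] is used to spill one register between shuffle rounds.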
    align      4
    // Read in the data from the source pointer.
    // First round of bit swap.
  convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm0  // use xmm7 as temp register.
    punpcklbw xmm0, xmm1
    punpckhbw xmm7, xmm1
    movdqa    xmm1, xmm7
    movdqu    xmm2, [eax]
    movdqu    xmm3, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm2
    punpcklbw xmm2, xmm3
    punpckhbw xmm7, xmm3
    movdqa    xmm3, xmm7
    movdqu    xmm4, [eax]
    movdqu    xmm5, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm4
    punpcklbw xmm4, xmm5
    punpckhbw xmm7, xmm5
    movdqa    xmm5, xmm7
    movdqu    xmm6, [eax]
    movdqu    xmm7, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqu    [esp], xmm5  // backup xmm5
    neg       edi
    movdqa    xmm5, xmm6  // use xmm5 as temp register.
    punpcklbw xmm6, xmm7
    punpckhbw xmm5, xmm7
    movdqa    xmm7, xmm5
    lea       eax, [eax + 8 * edi + 16]  // rewind 8 rows, advance 16 bytes.
    neg       edi
    // Second round of bit swap.
    movdqa    xmm5, xmm0
    punpcklwd xmm0, xmm2
    punpckhwd xmm5, xmm2
    movdqa    xmm2, xmm5
    movdqa    xmm5, xmm1
    punpcklwd xmm1, xmm3
    punpckhwd xmm5, xmm3
    movdqa    xmm3, xmm5
    movdqa    xmm5, xmm4
    punpcklwd xmm4, xmm6
    punpckhwd xmm5, xmm6
    movdqa    xmm6, xmm5
    movdqu    xmm5, [esp]  // restore xmm5
    movdqu    [esp], xmm6  // backup xmm6
    movdqa    xmm6, xmm5  // use xmm6 as temp register.
    punpcklwd xmm5, xmm7
    punpckhwd xmm6, xmm7
    movdqa    xmm7, xmm6

    // Third round of bit swap.
    // Write to the destination pointer.
    movdqa    xmm6, xmm0
    punpckldq xmm0, xmm4
    punpckhdq xmm6, xmm4
    movdqa    xmm4, xmm6
    movdqu    xmm6, [esp]  // restore xmm6
    movlpd    qword ptr [edx], xmm0
    movhpd    qword ptr [ebx], xmm0
    movlpd    qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm4
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm2  // use xmm0 as the temp register.
    punpckldq xmm2, xmm6
    movlpd    qword ptr [edx], xmm2
    movhpd    qword ptr [ebx], xmm2
    punpckhdq xmm0, xmm6
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm1  // use xmm0 as the temp register.
    punpckldq xmm1, xmm5
    movlpd    qword ptr [edx], xmm1
    movhpd    qword ptr [ebx], xmm1
    punpckhdq xmm0, xmm5
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm3  // use xmm0 as the temp register.
    punpckldq xmm3, xmm7
    movlpd    qword ptr [edx], xmm3
    movhpd    qword ptr [ebx], xmm3
    punpckhdq xmm0, xmm7
    sub       ecx, 8
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    jg        convertloop

    mov       esp, [esp + 16]  // restore the caller's esp.
    pop       ebp
    pop       edi
    pop       esi
    pop       ebx
    ret
  }
}

#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif