/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10*4e366538SXin Li
11*4e366538SXin Li #include "libyuv/rotate_row.h"
12*4e366538SXin Li #include "libyuv/row.h"
13*4e366538SXin Li
14*4e366538SXin Li #ifdef __cplusplus
15*4e366538SXin Li namespace libyuv {
16*4e366538SXin Li extern "C" {
17*4e366538SXin Li #endif
18*4e366538SXin Li
// This module is only built for 32-bit x86 with Visual C++ (not clang), and
// only when LIBYUV_DISABLE_X86 is not defined.
20*4e366538SXin Li #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
21*4e366538SXin Li !defined(__clang__) && defined(_M_IX86)
22*4e366538SXin Li
TransposeWx8_SSSE3(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int width)23*4e366538SXin Li __declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src,
24*4e366538SXin Li int src_stride,
25*4e366538SXin Li uint8_t* dst,
26*4e366538SXin Li int dst_stride,
27*4e366538SXin Li int width) {
28*4e366538SXin Li __asm {
29*4e366538SXin Li push edi
30*4e366538SXin Li push esi
31*4e366538SXin Li push ebp
32*4e366538SXin Li mov eax, [esp + 12 + 4] // src
33*4e366538SXin Li mov edi, [esp + 12 + 8] // src_stride
34*4e366538SXin Li mov edx, [esp + 12 + 12] // dst
35*4e366538SXin Li mov esi, [esp + 12 + 16] // dst_stride
36*4e366538SXin Li mov ecx, [esp + 12 + 20] // width
37*4e366538SXin Li
38*4e366538SXin Li // Read in the data from the source pointer.
39*4e366538SXin Li // First round of bit swap.
40*4e366538SXin Li align 4
41*4e366538SXin Li convertloop:
42*4e366538SXin Li movq xmm0, qword ptr [eax]
43*4e366538SXin Li lea ebp, [eax + 8]
44*4e366538SXin Li movq xmm1, qword ptr [eax + edi]
45*4e366538SXin Li lea eax, [eax + 2 * edi]
46*4e366538SXin Li punpcklbw xmm0, xmm1
47*4e366538SXin Li movq xmm2, qword ptr [eax]
48*4e366538SXin Li movdqa xmm1, xmm0
49*4e366538SXin Li palignr xmm1, xmm1, 8
50*4e366538SXin Li movq xmm3, qword ptr [eax + edi]
51*4e366538SXin Li lea eax, [eax + 2 * edi]
52*4e366538SXin Li punpcklbw xmm2, xmm3
53*4e366538SXin Li movdqa xmm3, xmm2
54*4e366538SXin Li movq xmm4, qword ptr [eax]
55*4e366538SXin Li palignr xmm3, xmm3, 8
56*4e366538SXin Li movq xmm5, qword ptr [eax + edi]
57*4e366538SXin Li punpcklbw xmm4, xmm5
58*4e366538SXin Li lea eax, [eax + 2 * edi]
59*4e366538SXin Li movdqa xmm5, xmm4
60*4e366538SXin Li movq xmm6, qword ptr [eax]
61*4e366538SXin Li palignr xmm5, xmm5, 8
62*4e366538SXin Li movq xmm7, qword ptr [eax + edi]
63*4e366538SXin Li punpcklbw xmm6, xmm7
64*4e366538SXin Li mov eax, ebp
65*4e366538SXin Li movdqa xmm7, xmm6
66*4e366538SXin Li palignr xmm7, xmm7, 8
67*4e366538SXin Li // Second round of bit swap.
68*4e366538SXin Li punpcklwd xmm0, xmm2
69*4e366538SXin Li punpcklwd xmm1, xmm3
70*4e366538SXin Li movdqa xmm2, xmm0
71*4e366538SXin Li movdqa xmm3, xmm1
72*4e366538SXin Li palignr xmm2, xmm2, 8
73*4e366538SXin Li palignr xmm3, xmm3, 8
74*4e366538SXin Li punpcklwd xmm4, xmm6
75*4e366538SXin Li punpcklwd xmm5, xmm7
76*4e366538SXin Li movdqa xmm6, xmm4
77*4e366538SXin Li movdqa xmm7, xmm5
78*4e366538SXin Li palignr xmm6, xmm6, 8
79*4e366538SXin Li palignr xmm7, xmm7, 8
80*4e366538SXin Li // Third round of bit swap.
81*4e366538SXin Li // Write to the destination pointer.
82*4e366538SXin Li punpckldq xmm0, xmm4
83*4e366538SXin Li movq qword ptr [edx], xmm0
84*4e366538SXin Li movdqa xmm4, xmm0
85*4e366538SXin Li palignr xmm4, xmm4, 8
86*4e366538SXin Li movq qword ptr [edx + esi], xmm4
87*4e366538SXin Li lea edx, [edx + 2 * esi]
88*4e366538SXin Li punpckldq xmm2, xmm6
89*4e366538SXin Li movdqa xmm6, xmm2
90*4e366538SXin Li palignr xmm6, xmm6, 8
91*4e366538SXin Li movq qword ptr [edx], xmm2
92*4e366538SXin Li punpckldq xmm1, xmm5
93*4e366538SXin Li movq qword ptr [edx + esi], xmm6
94*4e366538SXin Li lea edx, [edx + 2 * esi]
95*4e366538SXin Li movdqa xmm5, xmm1
96*4e366538SXin Li movq qword ptr [edx], xmm1
97*4e366538SXin Li palignr xmm5, xmm5, 8
98*4e366538SXin Li punpckldq xmm3, xmm7
99*4e366538SXin Li movq qword ptr [edx + esi], xmm5
100*4e366538SXin Li lea edx, [edx + 2 * esi]
101*4e366538SXin Li movq qword ptr [edx], xmm3
102*4e366538SXin Li movdqa xmm7, xmm3
103*4e366538SXin Li palignr xmm7, xmm7, 8
104*4e366538SXin Li sub ecx, 8
105*4e366538SXin Li movq qword ptr [edx + esi], xmm7
106*4e366538SXin Li lea edx, [edx + 2 * esi]
107*4e366538SXin Li jg convertloop
108*4e366538SXin Li
109*4e366538SXin Li pop ebp
110*4e366538SXin Li pop esi
111*4e366538SXin Li pop edi
112*4e366538SXin Li ret
113*4e366538SXin Li }
114*4e366538SXin Li }
115*4e366538SXin Li
TransposeUVWx8_SSE2(const uint8_t * src,int src_stride,uint8_t * dst_a,int dst_stride_a,uint8_t * dst_b,int dst_stride_b,int w)116*4e366538SXin Li __declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src,
117*4e366538SXin Li int src_stride,
118*4e366538SXin Li uint8_t* dst_a,
119*4e366538SXin Li int dst_stride_a,
120*4e366538SXin Li uint8_t* dst_b,
121*4e366538SXin Li int dst_stride_b,
122*4e366538SXin Li int w) {
123*4e366538SXin Li __asm {
124*4e366538SXin Li push ebx
125*4e366538SXin Li push esi
126*4e366538SXin Li push edi
127*4e366538SXin Li push ebp
128*4e366538SXin Li mov eax, [esp + 16 + 4] // src
129*4e366538SXin Li mov edi, [esp + 16 + 8] // src_stride
130*4e366538SXin Li mov edx, [esp + 16 + 12] // dst_a
131*4e366538SXin Li mov esi, [esp + 16 + 16] // dst_stride_a
132*4e366538SXin Li mov ebx, [esp + 16 + 20] // dst_b
133*4e366538SXin Li mov ebp, [esp + 16 + 24] // dst_stride_b
134*4e366538SXin Li mov ecx, esp
135*4e366538SXin Li sub esp, 4 + 16
136*4e366538SXin Li and esp, ~15
137*4e366538SXin Li mov [esp + 16], ecx
138*4e366538SXin Li mov ecx, [ecx + 16 + 28] // w
139*4e366538SXin Li
140*4e366538SXin Li align 4
141*4e366538SXin Li // Read in the data from the source pointer.
142*4e366538SXin Li // First round of bit swap.
143*4e366538SXin Li convertloop:
144*4e366538SXin Li movdqu xmm0, [eax]
145*4e366538SXin Li movdqu xmm1, [eax + edi]
146*4e366538SXin Li lea eax, [eax + 2 * edi]
147*4e366538SXin Li movdqa xmm7, xmm0 // use xmm7 as temp register.
148*4e366538SXin Li punpcklbw xmm0, xmm1
149*4e366538SXin Li punpckhbw xmm7, xmm1
150*4e366538SXin Li movdqa xmm1, xmm7
151*4e366538SXin Li movdqu xmm2, [eax]
152*4e366538SXin Li movdqu xmm3, [eax + edi]
153*4e366538SXin Li lea eax, [eax + 2 * edi]
154*4e366538SXin Li movdqa xmm7, xmm2
155*4e366538SXin Li punpcklbw xmm2, xmm3
156*4e366538SXin Li punpckhbw xmm7, xmm3
157*4e366538SXin Li movdqa xmm3, xmm7
158*4e366538SXin Li movdqu xmm4, [eax]
159*4e366538SXin Li movdqu xmm5, [eax + edi]
160*4e366538SXin Li lea eax, [eax + 2 * edi]
161*4e366538SXin Li movdqa xmm7, xmm4
162*4e366538SXin Li punpcklbw xmm4, xmm5
163*4e366538SXin Li punpckhbw xmm7, xmm5
164*4e366538SXin Li movdqa xmm5, xmm7
165*4e366538SXin Li movdqu xmm6, [eax]
166*4e366538SXin Li movdqu xmm7, [eax + edi]
167*4e366538SXin Li lea eax, [eax + 2 * edi]
168*4e366538SXin Li movdqu [esp], xmm5 // backup xmm5
169*4e366538SXin Li neg edi
170*4e366538SXin Li movdqa xmm5, xmm6 // use xmm5 as temp register.
171*4e366538SXin Li punpcklbw xmm6, xmm7
172*4e366538SXin Li punpckhbw xmm5, xmm7
173*4e366538SXin Li movdqa xmm7, xmm5
174*4e366538SXin Li lea eax, [eax + 8 * edi + 16]
175*4e366538SXin Li neg edi
176*4e366538SXin Li // Second round of bit swap.
177*4e366538SXin Li movdqa xmm5, xmm0
178*4e366538SXin Li punpcklwd xmm0, xmm2
179*4e366538SXin Li punpckhwd xmm5, xmm2
180*4e366538SXin Li movdqa xmm2, xmm5
181*4e366538SXin Li movdqa xmm5, xmm1
182*4e366538SXin Li punpcklwd xmm1, xmm3
183*4e366538SXin Li punpckhwd xmm5, xmm3
184*4e366538SXin Li movdqa xmm3, xmm5
185*4e366538SXin Li movdqa xmm5, xmm4
186*4e366538SXin Li punpcklwd xmm4, xmm6
187*4e366538SXin Li punpckhwd xmm5, xmm6
188*4e366538SXin Li movdqa xmm6, xmm5
189*4e366538SXin Li movdqu xmm5, [esp] // restore xmm5
190*4e366538SXin Li movdqu [esp], xmm6 // backup xmm6
191*4e366538SXin Li movdqa xmm6, xmm5 // use xmm6 as temp register.
192*4e366538SXin Li punpcklwd xmm5, xmm7
193*4e366538SXin Li punpckhwd xmm6, xmm7
194*4e366538SXin Li movdqa xmm7, xmm6
195*4e366538SXin Li
196*4e366538SXin Li // Third round of bit swap.
197*4e366538SXin Li // Write to the destination pointer.
198*4e366538SXin Li movdqa xmm6, xmm0
199*4e366538SXin Li punpckldq xmm0, xmm4
200*4e366538SXin Li punpckhdq xmm6, xmm4
201*4e366538SXin Li movdqa xmm4, xmm6
202*4e366538SXin Li movdqu xmm6, [esp] // restore xmm6
203*4e366538SXin Li movlpd qword ptr [edx], xmm0
204*4e366538SXin Li movhpd qword ptr [ebx], xmm0
205*4e366538SXin Li movlpd qword ptr [edx + esi], xmm4
206*4e366538SXin Li lea edx, [edx + 2 * esi]
207*4e366538SXin Li movhpd qword ptr [ebx + ebp], xmm4
208*4e366538SXin Li lea ebx, [ebx + 2 * ebp]
209*4e366538SXin Li movdqa xmm0, xmm2 // use xmm0 as the temp register.
210*4e366538SXin Li punpckldq xmm2, xmm6
211*4e366538SXin Li movlpd qword ptr [edx], xmm2
212*4e366538SXin Li movhpd qword ptr [ebx], xmm2
213*4e366538SXin Li punpckhdq xmm0, xmm6
214*4e366538SXin Li movlpd qword ptr [edx + esi], xmm0
215*4e366538SXin Li lea edx, [edx + 2 * esi]
216*4e366538SXin Li movhpd qword ptr [ebx + ebp], xmm0
217*4e366538SXin Li lea ebx, [ebx + 2 * ebp]
218*4e366538SXin Li movdqa xmm0, xmm1 // use xmm0 as the temp register.
219*4e366538SXin Li punpckldq xmm1, xmm5
220*4e366538SXin Li movlpd qword ptr [edx], xmm1
221*4e366538SXin Li movhpd qword ptr [ebx], xmm1
222*4e366538SXin Li punpckhdq xmm0, xmm5
223*4e366538SXin Li movlpd qword ptr [edx + esi], xmm0
224*4e366538SXin Li lea edx, [edx + 2 * esi]
225*4e366538SXin Li movhpd qword ptr [ebx + ebp], xmm0
226*4e366538SXin Li lea ebx, [ebx + 2 * ebp]
227*4e366538SXin Li movdqa xmm0, xmm3 // use xmm0 as the temp register.
228*4e366538SXin Li punpckldq xmm3, xmm7
229*4e366538SXin Li movlpd qword ptr [edx], xmm3
230*4e366538SXin Li movhpd qword ptr [ebx], xmm3
231*4e366538SXin Li punpckhdq xmm0, xmm7
232*4e366538SXin Li sub ecx, 8
233*4e366538SXin Li movlpd qword ptr [edx + esi], xmm0
234*4e366538SXin Li lea edx, [edx + 2 * esi]
235*4e366538SXin Li movhpd qword ptr [ebx + ebp], xmm0
236*4e366538SXin Li lea ebx, [ebx + 2 * ebp]
237*4e366538SXin Li jg convertloop
238*4e366538SXin Li
239*4e366538SXin Li mov esp, [esp + 16]
240*4e366538SXin Li pop ebp
241*4e366538SXin Li pop edi
242*4e366538SXin Li pop esi
243*4e366538SXin Li pop ebx
244*4e366538SXin Li ret
245*4e366538SXin Li }
246*4e366538SXin Li }
247*4e366538SXin Li
248*4e366538SXin Li #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
249*4e366538SXin Li
250*4e366538SXin Li #ifdef __cplusplus
251*4e366538SXin Li } // extern "C"
252*4e366538SXin Li } // namespace libyuv
253*4e366538SXin Li #endif
254