/*
 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
10
11 #include "libyuv/rotate_row.h"
12 #include "libyuv/row.h"
13
14 #ifdef __cplusplus
15 namespace libyuv {
16 extern "C" {
17 #endif
18
19 // This module is for 32 bit Visual C x86
20 #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
21 !defined(__clang__) && defined(_M_IX86)
22
TransposeWx8_SSSE3(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int width)23 __declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src,
24 int src_stride,
25 uint8_t* dst,
26 int dst_stride,
27 int width) {
28 __asm {
29 push edi
30 push esi
31 push ebp
32 mov eax, [esp + 12 + 4] // src
33 mov edi, [esp + 12 + 8] // src_stride
34 mov edx, [esp + 12 + 12] // dst
35 mov esi, [esp + 12 + 16] // dst_stride
36 mov ecx, [esp + 12 + 20] // width
37
38 // Read in the data from the source pointer.
39 // First round of bit swap.
40 align 4
41 convertloop:
42 movq xmm0, qword ptr [eax]
43 lea ebp, [eax + 8]
44 movq xmm1, qword ptr [eax + edi]
45 lea eax, [eax + 2 * edi]
46 punpcklbw xmm0, xmm1
47 movq xmm2, qword ptr [eax]
48 movdqa xmm1, xmm0
49 palignr xmm1, xmm1, 8
50 movq xmm3, qword ptr [eax + edi]
51 lea eax, [eax + 2 * edi]
52 punpcklbw xmm2, xmm3
53 movdqa xmm3, xmm2
54 movq xmm4, qword ptr [eax]
55 palignr xmm3, xmm3, 8
56 movq xmm5, qword ptr [eax + edi]
57 punpcklbw xmm4, xmm5
58 lea eax, [eax + 2 * edi]
59 movdqa xmm5, xmm4
60 movq xmm6, qword ptr [eax]
61 palignr xmm5, xmm5, 8
62 movq xmm7, qword ptr [eax + edi]
63 punpcklbw xmm6, xmm7
64 mov eax, ebp
65 movdqa xmm7, xmm6
66 palignr xmm7, xmm7, 8
67 // Second round of bit swap.
68 punpcklwd xmm0, xmm2
69 punpcklwd xmm1, xmm3
70 movdqa xmm2, xmm0
71 movdqa xmm3, xmm1
72 palignr xmm2, xmm2, 8
73 palignr xmm3, xmm3, 8
74 punpcklwd xmm4, xmm6
75 punpcklwd xmm5, xmm7
76 movdqa xmm6, xmm4
77 movdqa xmm7, xmm5
78 palignr xmm6, xmm6, 8
79 palignr xmm7, xmm7, 8
80 // Third round of bit swap.
81 // Write to the destination pointer.
82 punpckldq xmm0, xmm4
83 movq qword ptr [edx], xmm0
84 movdqa xmm4, xmm0
85 palignr xmm4, xmm4, 8
86 movq qword ptr [edx + esi], xmm4
87 lea edx, [edx + 2 * esi]
88 punpckldq xmm2, xmm6
89 movdqa xmm6, xmm2
90 palignr xmm6, xmm6, 8
91 movq qword ptr [edx], xmm2
92 punpckldq xmm1, xmm5
93 movq qword ptr [edx + esi], xmm6
94 lea edx, [edx + 2 * esi]
95 movdqa xmm5, xmm1
96 movq qword ptr [edx], xmm1
97 palignr xmm5, xmm5, 8
98 punpckldq xmm3, xmm7
99 movq qword ptr [edx + esi], xmm5
100 lea edx, [edx + 2 * esi]
101 movq qword ptr [edx], xmm3
102 movdqa xmm7, xmm3
103 palignr xmm7, xmm7, 8
104 sub ecx, 8
105 movq qword ptr [edx + esi], xmm7
106 lea edx, [edx + 2 * esi]
107 jg convertloop
108
109 pop ebp
110 pop esi
111 pop edi
112 ret
113 }
114 }
115
TransposeUVWx8_SSE2(const uint8_t * src,int src_stride,uint8_t * dst_a,int dst_stride_a,uint8_t * dst_b,int dst_stride_b,int w)116 __declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src,
117 int src_stride,
118 uint8_t* dst_a,
119 int dst_stride_a,
120 uint8_t* dst_b,
121 int dst_stride_b,
122 int w) {
123 __asm {
124 push ebx
125 push esi
126 push edi
127 push ebp
128 mov eax, [esp + 16 + 4] // src
129 mov edi, [esp + 16 + 8] // src_stride
130 mov edx, [esp + 16 + 12] // dst_a
131 mov esi, [esp + 16 + 16] // dst_stride_a
132 mov ebx, [esp + 16 + 20] // dst_b
133 mov ebp, [esp + 16 + 24] // dst_stride_b
134 mov ecx, esp
135 sub esp, 4 + 16
136 and esp, ~15
137 mov [esp + 16], ecx
138 mov ecx, [ecx + 16 + 28] // w
139
140 align 4
141 // Read in the data from the source pointer.
142 // First round of bit swap.
143 convertloop:
144 movdqu xmm0, [eax]
145 movdqu xmm1, [eax + edi]
146 lea eax, [eax + 2 * edi]
147 movdqa xmm7, xmm0 // use xmm7 as temp register.
148 punpcklbw xmm0, xmm1
149 punpckhbw xmm7, xmm1
150 movdqa xmm1, xmm7
151 movdqu xmm2, [eax]
152 movdqu xmm3, [eax + edi]
153 lea eax, [eax + 2 * edi]
154 movdqa xmm7, xmm2
155 punpcklbw xmm2, xmm3
156 punpckhbw xmm7, xmm3
157 movdqa xmm3, xmm7
158 movdqu xmm4, [eax]
159 movdqu xmm5, [eax + edi]
160 lea eax, [eax + 2 * edi]
161 movdqa xmm7, xmm4
162 punpcklbw xmm4, xmm5
163 punpckhbw xmm7, xmm5
164 movdqa xmm5, xmm7
165 movdqu xmm6, [eax]
166 movdqu xmm7, [eax + edi]
167 lea eax, [eax + 2 * edi]
168 movdqu [esp], xmm5 // backup xmm5
169 neg edi
170 movdqa xmm5, xmm6 // use xmm5 as temp register.
171 punpcklbw xmm6, xmm7
172 punpckhbw xmm5, xmm7
173 movdqa xmm7, xmm5
174 lea eax, [eax + 8 * edi + 16]
175 neg edi
176 // Second round of bit swap.
177 movdqa xmm5, xmm0
178 punpcklwd xmm0, xmm2
179 punpckhwd xmm5, xmm2
180 movdqa xmm2, xmm5
181 movdqa xmm5, xmm1
182 punpcklwd xmm1, xmm3
183 punpckhwd xmm5, xmm3
184 movdqa xmm3, xmm5
185 movdqa xmm5, xmm4
186 punpcklwd xmm4, xmm6
187 punpckhwd xmm5, xmm6
188 movdqa xmm6, xmm5
189 movdqu xmm5, [esp] // restore xmm5
190 movdqu [esp], xmm6 // backup xmm6
191 movdqa xmm6, xmm5 // use xmm6 as temp register.
192 punpcklwd xmm5, xmm7
193 punpckhwd xmm6, xmm7
194 movdqa xmm7, xmm6
195
196 // Third round of bit swap.
197 // Write to the destination pointer.
198 movdqa xmm6, xmm0
199 punpckldq xmm0, xmm4
200 punpckhdq xmm6, xmm4
201 movdqa xmm4, xmm6
202 movdqu xmm6, [esp] // restore xmm6
203 movlpd qword ptr [edx], xmm0
204 movhpd qword ptr [ebx], xmm0
205 movlpd qword ptr [edx + esi], xmm4
206 lea edx, [edx + 2 * esi]
207 movhpd qword ptr [ebx + ebp], xmm4
208 lea ebx, [ebx + 2 * ebp]
209 movdqa xmm0, xmm2 // use xmm0 as the temp register.
210 punpckldq xmm2, xmm6
211 movlpd qword ptr [edx], xmm2
212 movhpd qword ptr [ebx], xmm2
213 punpckhdq xmm0, xmm6
214 movlpd qword ptr [edx + esi], xmm0
215 lea edx, [edx + 2 * esi]
216 movhpd qword ptr [ebx + ebp], xmm0
217 lea ebx, [ebx + 2 * ebp]
218 movdqa xmm0, xmm1 // use xmm0 as the temp register.
219 punpckldq xmm1, xmm5
220 movlpd qword ptr [edx], xmm1
221 movhpd qword ptr [ebx], xmm1
222 punpckhdq xmm0, xmm5
223 movlpd qword ptr [edx + esi], xmm0
224 lea edx, [edx + 2 * esi]
225 movhpd qword ptr [ebx + ebp], xmm0
226 lea ebx, [ebx + 2 * ebp]
227 movdqa xmm0, xmm3 // use xmm0 as the temp register.
228 punpckldq xmm3, xmm7
229 movlpd qword ptr [edx], xmm3
230 movhpd qword ptr [ebx], xmm3
231 punpckhdq xmm0, xmm7
232 sub ecx, 8
233 movlpd qword ptr [edx + esi], xmm0
234 lea edx, [edx + 2 * esi]
235 movhpd qword ptr [ebx + ebp], xmm0
236 lea ebx, [ebx + 2 * ebp]
237 jg convertloop
238
239 mov esp, [esp + 16]
240 pop ebp
241 pop edi
242 pop esi
243 pop ebx
244 ret
245 }
246 }
247
248 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
249
250 #ifdef __cplusplus
251 } // extern "C"
252 } // namespace libyuv
253 #endif
254