1 /*
2 * Copyright (c) 2020, Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22 //!
23 //! \file cm_mem_sse2_impl.cpp
24 //! \brief Contains CM memory function implementations
25 //!
26
27 #include "cm_mem.h"
28 #include "cm_mem_sse2_impl.h"
29
30 #if defined(__SSE2__) || !(defined(LINUX) || defined(ANDROID))
31
32 #include <mmintrin.h>
33
// GCC>=11 warns when sizeof(array) is divided by sizeof(another_type), on the
// assumption that the division was mistyped and was meant to compute the
// number of elements in the array. That is not the case here, so the divisor
// is wrapped in an extra pair of parentheses to suppress the warning.
//
// Evaluates to sizeof(P) divided by sizeof(DQWORD) (integer division), i.e.
// the number of whole DQWORD-sized elements that fit in one P.
#define DQWORD_PER_PREFETCH(P) ( sizeof(P)/(sizeof(DQWORD)) )
39
FastMemCopy_SSE2_movntdq_movdqa(void * dst,void * src,const size_t doubleQuadWords)40 void FastMemCopy_SSE2_movntdq_movdqa(
41 void* dst,
42 void* src,
43 const size_t doubleQuadWords )
44 {
45 CM_ASSERT( IsAligned( dst, sizeof(DQWORD) ) );
46 CM_ASSERT( IsAligned( src, sizeof(DQWORD) ) );
47
48
49 const size_t doubleQuadWordsPerPrefetch = DQWORD_PER_PREFETCH(PREFETCH);
50
51 // Prefetch the src data
52 Prefetch( (uint8_t*)src );
53 Prefetch( (uint8_t*)src + sizeof(PREFETCH) );
54
55 // Convert to SSE2 registers
56 __m128i* dst128i = (__m128i*)dst;
57 __m128i* src128i = (__m128i*)src;
58
59 size_t count = doubleQuadWords;
60
61 // Copies a cacheline per loop iteration
62 while( count >= doubleQuadWordsPerPrefetch )
63 {
64 Prefetch( (uint8_t*)src128i + 2 * sizeof(PREFETCH) );
65
66 count -= doubleQuadWordsPerPrefetch;
67
68 // Copy cacheline of data
69 for( size_t i = 0; i < doubleQuadWordsPerPrefetch; i++ )
70 {
71 _mm_stream_si128( dst128i++,
72 _mm_load_si128( src128i++ ) );
73 }
74 }
75
76 // Copy DQWORD if not cacheline multiple
77 while( count-- )
78 {
79 _mm_stream_si128( dst128i++,
80 _mm_load_si128( src128i++ ) );
81 }
82 }
83
FastMemCopy_SSE2_movdqu_movdqa(void * dst,void * src,const size_t doubleQuadWords)84 void FastMemCopy_SSE2_movdqu_movdqa(
85 void* dst,
86 void* src,
87 const size_t doubleQuadWords )
88 {
89 CM_ASSERT( IsAligned( src, sizeof(DQWORD) ) );
90
91 const size_t doubleQuadWordsPerPrefetch = DQWORD_PER_PREFETCH(PREFETCH);
92
93 // Prefetch the src data
94 Prefetch( (uint8_t*)src );
95 Prefetch( (uint8_t*)src + sizeof(PREFETCH) );
96
97 // Convert to SSE2 registers
98 __m128i* dst128i = (__m128i*)dst;
99 __m128i* src128i = (__m128i*)src;
100
101 size_t count = doubleQuadWords;
102
103 // Copies a cacheline per loop iteration
104 while( count >= doubleQuadWordsPerPrefetch )
105 {
106 Prefetch( (uint8_t*)src128i + 2 * sizeof(PREFETCH) );
107
108 count -= doubleQuadWordsPerPrefetch;
109
110 // Copy cacheline of data
111 for( size_t i = 0; i < doubleQuadWordsPerPrefetch; i++ )
112 {
113 _mm_storeu_si128( dst128i++,
114 _mm_load_si128( src128i++ ) );
115 }
116 }
117
118 // Copy DQWORD if not cacheline multiple
119 while( count-- )
120 {
121 _mm_storeu_si128( dst128i++,
122 _mm_load_si128( src128i++ ) );
123 }
124 }
125
FastMemCopy_SSE2_movntdq_movdqu(void * dst,const void * src,const size_t doubleQuadWords)126 void FastMemCopy_SSE2_movntdq_movdqu(
127 void* dst,
128 const void* src,
129 const size_t doubleQuadWords )
130 {
131 CM_ASSERT( IsAligned( dst, sizeof(DQWORD) ) );
132
133 const size_t doubleQuadWordsPerPrefetch = DQWORD_PER_PREFETCH(PREFETCH);
134
135 // Prefetch the src data
136 Prefetch( (uint8_t*)src );
137 Prefetch( (uint8_t*)src + sizeof(PREFETCH) );
138
139 // Convert to SSE2 registers
140 __m128i* dst128i = (__m128i*)dst;
141 __m128i* src128i = (__m128i*)src;
142
143 size_t count = doubleQuadWords;
144
145 // Copies a cacheline per loop iteration
146 while( count >= doubleQuadWordsPerPrefetch )
147 {
148 Prefetch( (uint8_t*)src128i + 2 * sizeof(PREFETCH) );
149
150 count -= doubleQuadWordsPerPrefetch;
151
152 // Copy cacheline of data
153 for( size_t i = 0; i < doubleQuadWordsPerPrefetch; i++ )
154 {
155 _mm_stream_si128( dst128i++,
156 _mm_loadu_si128( src128i++ ) );
157 }
158 }
159
160 // Copy DQWORD if not cacheline multiple
161 while( count-- )
162 {
163 _mm_stream_si128( dst128i++,
164 _mm_loadu_si128( src128i++ ) );
165 }
166 }
167
FastMemCopy_SSE2_movdqu_movdqu(void * dst,const void * src,const size_t doubleQuadWords)168 void FastMemCopy_SSE2_movdqu_movdqu(
169 void* dst,
170 const void* src,
171 const size_t doubleQuadWords )
172 {
173 const size_t doubleQuadWordsPerPrefetch = DQWORD_PER_PREFETCH(PREFETCH);
174
175 // Prefetch the src data
176 Prefetch( (uint8_t*)src );
177 Prefetch( (uint8_t*)src + sizeof(PREFETCH) );
178
179 // Convert to SSE2 registers
180 __m128i* dst128i = (__m128i*)dst;
181 __m128i* src128i = (__m128i*)src;
182
183 size_t count = doubleQuadWords;
184
185 // Copies a cacheline per loop iteration
186 while( count >= doubleQuadWordsPerPrefetch )
187 {
188 Prefetch( (uint8_t*)src128i + 2 * sizeof(PREFETCH) );
189
190 count -= doubleQuadWordsPerPrefetch;
191
192 // Copy cacheline of data
193 for( size_t i = 0; i < doubleQuadWordsPerPrefetch; i++ )
194 {
195 _mm_storeu_si128( dst128i++,
196 _mm_loadu_si128( src128i++ ) );
197 }
198 }
199
200 // Copy DQWORD if not cacheline multiple
201 while( count-- )
202 {
203 _mm_storeu_si128( dst128i++,
204 _mm_loadu_si128( src128i++ ) );
205 }
206 }
207
FastMemCopy_SSE2(void * dst,void * src,const size_t doubleQuadWords)208 void FastMemCopy_SSE2(
209 void* dst,
210 void* src,
211 const size_t doubleQuadWords )
212 {
213 // Determine if the source and destination addresses are 128-bit aligned
214 const bool isDstDoubleQuadWordAligned = IsAligned( dst, sizeof(DQWORD) );
215 const bool isSrcDoubleQuadWordAligned = IsAligned( src, sizeof(DQWORD) );
216
217 if( isSrcDoubleQuadWordAligned && isDstDoubleQuadWordAligned )
218 {
219 FastMemCopy_SSE2_movntdq_movdqa( dst, src, doubleQuadWords );
220 }
221 else if( isDstDoubleQuadWordAligned )
222 {
223 FastMemCopy_SSE2_movntdq_movdqu( dst, src, doubleQuadWords );
224 }
225 else if( isSrcDoubleQuadWordAligned )
226 {
227 FastMemCopy_SSE2_movdqu_movdqa( dst, src, doubleQuadWords );
228 }
229 else // if( !isSrcDoubleQuadWordAligned && !isDstDoubleQuadWordAligned )
230 {
231 FastMemCopy_SSE2_movdqu_movdqu( dst, src, doubleQuadWords );
232 }
233 }
234
CmFastMemCopy_SSE2(void * dst,const void * src,const size_t bytes)235 void CmFastMemCopy_SSE2( void* dst, const void* src, const size_t bytes )
236 {
237 // Cache pointers to memory
238 uint8_t *cacheDst = (uint8_t*)dst;
239 uint8_t *cacheSrc = (uint8_t*)src;
240
241 size_t count = bytes;
242
243 // Get the number of DQWORDs to be copied
244 const size_t doubleQuadWords = count / sizeof(DQWORD);
245
246 if( count >= CM_CPU_FASTCOPY_THRESHOLD && doubleQuadWords )
247 {
248 FastMemCopy_SSE2( cacheDst, cacheSrc, doubleQuadWords );
249
250 cacheDst += doubleQuadWords * sizeof(DQWORD);
251 cacheSrc += doubleQuadWords * sizeof(DQWORD);
252 count -= doubleQuadWords * sizeof(DQWORD);
253 }
254
255 // Copy remaining uint8_t(s)
256 if( count )
257 {
258 MOS_SecureMemcpy( cacheDst, count, cacheSrc, count );
259 }
260 }
261
CmFastMemCopyWC_SSE2(void * dst,const void * src,const size_t bytes)262 void CmFastMemCopyWC_SSE2( void* dst, const void* src, const size_t bytes )
263 {
264 // Cache pointers to memory
265 uint8_t *cacheDst = (uint8_t*)dst;
266 uint8_t *cacheSrc = (uint8_t*)src;
267
268 size_t count = bytes;
269
270 if( count >= CM_CPU_FASTCOPY_THRESHOLD )
271 {
272 const size_t doubleQuadwordAlignBytes =
273 GetAlignmentOffset( cacheDst, sizeof(DQWORD) );
274
275 // The destination pointer should be 128-bit aligned
276 if( doubleQuadwordAlignBytes )
277 {
278 MOS_SecureMemcpy( cacheDst, doubleQuadwordAlignBytes,cacheSrc, doubleQuadwordAlignBytes );
279
280 cacheDst += doubleQuadwordAlignBytes;
281 cacheSrc += doubleQuadwordAlignBytes;
282 count -= doubleQuadwordAlignBytes;
283 }
284
285 // Get the number of DQWORDs to be copied
286 const size_t doubleQuadWords = count / sizeof(DQWORD);
287
288 if( doubleQuadWords && count >= sizeof(PREFETCH))
289 {
290 // Determine if the source and destination addresses are
291 // 128-bit aligned
292 CM_ASSERT( IsAligned( cacheDst, sizeof(DQWORD) ) );
293
294 const bool isSrcDoubleQuadWordAligned =
295 IsAligned( cacheSrc, sizeof(DQWORD) );
296
297 if( isSrcDoubleQuadWordAligned )
298 {
299 FastMemCopy_SSE2_movntdq_movdqa( cacheDst, cacheSrc,
300 doubleQuadWords );
301 }
302 else
303 {
304 FastMemCopy_SSE2_movntdq_movdqu( cacheDst, cacheSrc,
305 doubleQuadWords );
306 }
307
308 cacheDst += doubleQuadWords * sizeof(DQWORD);
309 cacheSrc += doubleQuadWords * sizeof(DQWORD);
310 count -= doubleQuadWords * sizeof(DQWORD);
311 }
312 }
313
314 // Copy remaining uint8_t(s)
315 if( count )
316 {
317 MOS_SecureMemcpy( cacheDst, count, cacheSrc, count );
318 }
319 }
320
321 #endif // __SSE2__ || !(LINUX || ANDROID)
322