xref: /aosp_15_r20/external/intel-media-driver/media_driver/agnostic/common/cm/cm_mem_sse2_impl.cpp (revision ba62d9d3abf0e404f2022b4cd7a85e107f48596f)
1 /*
2 * Copyright (c) 2020, Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22 //!
23 //! \file      cm_mem_sse2_impl.cpp
24 //! \brief     Contains CM memory function implementations
25 //!
26 
27 #include "cm_mem.h"
28 #include "cm_mem_sse2_impl.h"
29 
30 #if defined(__SSE2__) || !(defined(LINUX) || defined(ANDROID))
31 
32 #include <mmintrin.h>
33 
// Number of DQWORD-sized elements that fit into one object (or type) T.
//
// The divisor is deliberately wrapped in its own pair of parentheses.
// GCC >= 11 warns when sizeof(array) is divided by sizeof(another_type),
// suspecting a mistyped array-length computation. That is not what is
// being computed here, and the extra parentheses suppress the warning.
#define DQWORD_PER_PREFETCH(T) ( sizeof(T) / (sizeof(DQWORD)) )
39 
FastMemCopy_SSE2_movntdq_movdqa(void * dst,void * src,const size_t doubleQuadWords)40 void FastMemCopy_SSE2_movntdq_movdqa(
41     void* dst,
42     void* src,
43     const size_t doubleQuadWords )
44 {
45     CM_ASSERT( IsAligned( dst, sizeof(DQWORD) ) );
46     CM_ASSERT( IsAligned( src, sizeof(DQWORD) ) );
47 
48 
49     const size_t doubleQuadWordsPerPrefetch = DQWORD_PER_PREFETCH(PREFETCH);
50 
51     // Prefetch the src data
52     Prefetch( (uint8_t*)src );
53     Prefetch( (uint8_t*)src + sizeof(PREFETCH) );
54 
55     // Convert to SSE2 registers
56     __m128i* dst128i = (__m128i*)dst;
57     __m128i* src128i = (__m128i*)src;
58 
59     size_t count = doubleQuadWords;
60 
61     // Copies a cacheline per loop iteration
62     while( count >= doubleQuadWordsPerPrefetch )
63     {
64         Prefetch( (uint8_t*)src128i + 2 * sizeof(PREFETCH) );
65 
66         count -= doubleQuadWordsPerPrefetch;
67 
68         // Copy cacheline of data
69         for( size_t i = 0; i < doubleQuadWordsPerPrefetch; i++ )
70         {
71             _mm_stream_si128( dst128i++,
72                 _mm_load_si128( src128i++ ) );
73         }
74     }
75 
76     // Copy DQWORD if not cacheline multiple
77     while( count-- )
78     {
79         _mm_stream_si128( dst128i++,
80             _mm_load_si128( src128i++ ) );
81     }
82 }
83 
FastMemCopy_SSE2_movdqu_movdqa(void * dst,void * src,const size_t doubleQuadWords)84 void FastMemCopy_SSE2_movdqu_movdqa(
85     void* dst,
86     void* src,
87     const size_t doubleQuadWords )
88 {
89     CM_ASSERT( IsAligned( src, sizeof(DQWORD) ) );
90 
91     const size_t doubleQuadWordsPerPrefetch = DQWORD_PER_PREFETCH(PREFETCH);
92 
93     // Prefetch the src data
94     Prefetch( (uint8_t*)src );
95     Prefetch( (uint8_t*)src + sizeof(PREFETCH) );
96 
97     // Convert to SSE2 registers
98     __m128i* dst128i = (__m128i*)dst;
99     __m128i* src128i = (__m128i*)src;
100 
101     size_t count = doubleQuadWords;
102 
103     // Copies a cacheline per loop iteration
104     while( count >= doubleQuadWordsPerPrefetch )
105     {
106         Prefetch( (uint8_t*)src128i + 2 * sizeof(PREFETCH) );
107 
108         count -= doubleQuadWordsPerPrefetch;
109 
110         // Copy cacheline of data
111         for( size_t i = 0; i < doubleQuadWordsPerPrefetch; i++ )
112         {
113             _mm_storeu_si128( dst128i++,
114                 _mm_load_si128( src128i++ ) );
115         }
116     }
117 
118     // Copy DQWORD if not cacheline multiple
119     while( count-- )
120     {
121         _mm_storeu_si128( dst128i++,
122             _mm_load_si128( src128i++ ) );
123     }
124 }
125 
FastMemCopy_SSE2_movntdq_movdqu(void * dst,const void * src,const size_t doubleQuadWords)126 void FastMemCopy_SSE2_movntdq_movdqu(
127     void* dst,
128     const void* src,
129     const size_t doubleQuadWords )
130 {
131     CM_ASSERT( IsAligned( dst, sizeof(DQWORD) ) );
132 
133     const size_t doubleQuadWordsPerPrefetch = DQWORD_PER_PREFETCH(PREFETCH);
134 
135     // Prefetch the src data
136     Prefetch( (uint8_t*)src );
137     Prefetch( (uint8_t*)src + sizeof(PREFETCH) );
138 
139     // Convert to SSE2 registers
140     __m128i* dst128i = (__m128i*)dst;
141     __m128i* src128i = (__m128i*)src;
142 
143     size_t count = doubleQuadWords;
144 
145     // Copies a cacheline per loop iteration
146     while( count >= doubleQuadWordsPerPrefetch )
147     {
148         Prefetch( (uint8_t*)src128i + 2 * sizeof(PREFETCH) );
149 
150         count -= doubleQuadWordsPerPrefetch;
151 
152         // Copy cacheline of data
153         for( size_t i = 0; i < doubleQuadWordsPerPrefetch; i++ )
154         {
155             _mm_stream_si128( dst128i++,
156                 _mm_loadu_si128( src128i++ ) );
157         }
158     }
159 
160     // Copy DQWORD if not cacheline multiple
161     while( count-- )
162     {
163         _mm_stream_si128( dst128i++,
164             _mm_loadu_si128( src128i++ ) );
165     }
166 }
167 
FastMemCopy_SSE2_movdqu_movdqu(void * dst,const void * src,const size_t doubleQuadWords)168 void FastMemCopy_SSE2_movdqu_movdqu(
169     void* dst,
170     const void* src,
171     const size_t doubleQuadWords )
172 {
173     const size_t doubleQuadWordsPerPrefetch = DQWORD_PER_PREFETCH(PREFETCH);
174 
175     // Prefetch the src data
176     Prefetch( (uint8_t*)src );
177     Prefetch( (uint8_t*)src + sizeof(PREFETCH) );
178 
179     // Convert to SSE2 registers
180     __m128i* dst128i = (__m128i*)dst;
181     __m128i* src128i = (__m128i*)src;
182 
183     size_t count = doubleQuadWords;
184 
185     // Copies a cacheline per loop iteration
186     while( count >= doubleQuadWordsPerPrefetch )
187     {
188         Prefetch( (uint8_t*)src128i + 2 * sizeof(PREFETCH) );
189 
190         count -= doubleQuadWordsPerPrefetch;
191 
192         // Copy cacheline of data
193         for( size_t i = 0; i < doubleQuadWordsPerPrefetch; i++ )
194         {
195             _mm_storeu_si128( dst128i++,
196                 _mm_loadu_si128( src128i++ ) );
197         }
198     }
199 
200     // Copy DQWORD if not cacheline multiple
201     while( count-- )
202     {
203         _mm_storeu_si128( dst128i++,
204             _mm_loadu_si128( src128i++ ) );
205     }
206 }
207 
FastMemCopy_SSE2(void * dst,void * src,const size_t doubleQuadWords)208 void FastMemCopy_SSE2(
209     void* dst,
210     void* src,
211     const size_t doubleQuadWords )
212 {
213     // Determine if the source and destination addresses are 128-bit aligned
214     const bool isDstDoubleQuadWordAligned = IsAligned( dst, sizeof(DQWORD) );
215     const bool isSrcDoubleQuadWordAligned = IsAligned( src, sizeof(DQWORD) );
216 
217     if( isSrcDoubleQuadWordAligned && isDstDoubleQuadWordAligned )
218     {
219         FastMemCopy_SSE2_movntdq_movdqa( dst, src, doubleQuadWords );
220     }
221     else if( isDstDoubleQuadWordAligned )
222     {
223         FastMemCopy_SSE2_movntdq_movdqu( dst, src, doubleQuadWords );
224     }
225     else if( isSrcDoubleQuadWordAligned )
226     {
227         FastMemCopy_SSE2_movdqu_movdqa( dst, src, doubleQuadWords );
228     }
229     else // if( !isSrcDoubleQuadWordAligned && !isDstDoubleQuadWordAligned )
230     {
231         FastMemCopy_SSE2_movdqu_movdqu( dst, src, doubleQuadWords );
232     }
233 }
234 
CmFastMemCopy_SSE2(void * dst,const void * src,const size_t bytes)235 void CmFastMemCopy_SSE2( void* dst, const void* src, const size_t bytes )
236 {
237     // Cache pointers to memory
238     uint8_t *cacheDst = (uint8_t*)dst;
239     uint8_t *cacheSrc = (uint8_t*)src;
240 
241     size_t count = bytes;
242 
243     // Get the number of DQWORDs to be copied
244     const size_t doubleQuadWords = count / sizeof(DQWORD);
245 
246     if( count >= CM_CPU_FASTCOPY_THRESHOLD && doubleQuadWords )
247     {
248         FastMemCopy_SSE2( cacheDst, cacheSrc, doubleQuadWords );
249 
250         cacheDst += doubleQuadWords * sizeof(DQWORD);
251         cacheSrc += doubleQuadWords * sizeof(DQWORD);
252         count -= doubleQuadWords * sizeof(DQWORD);
253     }
254 
255     // Copy remaining uint8_t(s)
256     if( count )
257     {
258         MOS_SecureMemcpy( cacheDst, count, cacheSrc, count );
259     }
260 }
261 
CmFastMemCopyWC_SSE2(void * dst,const void * src,const size_t bytes)262 void CmFastMemCopyWC_SSE2( void* dst, const void* src, const size_t bytes )
263 {
264   // Cache pointers to memory
265   uint8_t *cacheDst = (uint8_t*)dst;
266   uint8_t *cacheSrc = (uint8_t*)src;
267 
268   size_t count = bytes;
269 
270   if( count >= CM_CPU_FASTCOPY_THRESHOLD )
271   {
272     const size_t doubleQuadwordAlignBytes =
273       GetAlignmentOffset( cacheDst, sizeof(DQWORD) );
274 
275     // The destination pointer should be 128-bit aligned
276     if( doubleQuadwordAlignBytes )
277     {
278       MOS_SecureMemcpy( cacheDst, doubleQuadwordAlignBytes,cacheSrc, doubleQuadwordAlignBytes );
279 
280       cacheDst += doubleQuadwordAlignBytes;
281       cacheSrc += doubleQuadwordAlignBytes;
282       count -= doubleQuadwordAlignBytes;
283     }
284 
285     // Get the number of DQWORDs to be copied
286     const size_t doubleQuadWords = count / sizeof(DQWORD);
287 
288     if( doubleQuadWords && count >= sizeof(PREFETCH))
289     {
290       // Determine if the source and destination addresses are
291       // 128-bit aligned
292       CM_ASSERT( IsAligned( cacheDst, sizeof(DQWORD) ) );
293 
294       const bool isSrcDoubleQuadWordAligned =
295         IsAligned( cacheSrc, sizeof(DQWORD) );
296 
297       if( isSrcDoubleQuadWordAligned )
298       {
299         FastMemCopy_SSE2_movntdq_movdqa( cacheDst, cacheSrc,
300           doubleQuadWords );
301       }
302       else
303       {
304         FastMemCopy_SSE2_movntdq_movdqu( cacheDst, cacheSrc,
305           doubleQuadWords );
306       }
307 
308       cacheDst += doubleQuadWords * sizeof(DQWORD);
309       cacheSrc += doubleQuadWords * sizeof(DQWORD);
310       count -= doubleQuadWords * sizeof(DQWORD);
311     }
312   }
313 
314   // Copy remaining uint8_t(s)
315   if( count )
316   {
317     MOS_SecureMemcpy( cacheDst, count, cacheSrc, count );
318   }
319 }
320 
321 #endif // __SSE2__ || !(LINUX || ANDROID)
322