1 /*
2 * Copyright (c) 2020, Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22 //!
23 //! \file      cm_mem_os_sse4_impl.cpp
24 //! \brief     Contains CM memory function implementations
25 //!
26 
27 #include "cm_mem_os_sse4_impl.h"
28 
29 #if defined(__SSE4_1__)
30 
31 #include "cm_mem.h"
32 #include <mmintrin.h>
33 
CmFastMemCopyFromWC_SSE4(void * dst,const void * src,const size_t bytes)34 void CmFastMemCopyFromWC_SSE4( void* dst, const void* src, const size_t bytes )
35 {
36     // Cache pointers to memory
37     uint8_t *tempDst = (uint8_t*)dst;
38     uint8_t *tempSrc = (uint8_t*)src;
39 
40     size_t count = bytes;
41 
42     if( count >= CM_CPU_FASTCOPY_THRESHOLD )
43     {
44         //Streaming Load must be 16-byte aligned but should
45         //be 64-byte aligned for optimal performance
46         const size_t doubleHexWordAlignBytes =
47             GetAlignmentOffset( tempSrc, sizeof(DHWORD) );
48 
49         // Copy portion of the source memory that is not aligned
50         if( doubleHexWordAlignBytes )
51         {
52             CmSafeMemCopy( tempDst, tempSrc, doubleHexWordAlignBytes );
53 
54             tempDst += doubleHexWordAlignBytes;
55             tempSrc += doubleHexWordAlignBytes;
56             count -= doubleHexWordAlignBytes;
57         }
58 
59         CM_ASSERT( IsAligned( tempSrc, sizeof(DHWORD) ) == true );
60 
61         // Get the number of bytes to be copied (rounded down to nearets DHWORD)
62         const size_t doubleHexWordsToCopy = count / sizeof(DHWORD);
63 
64         if( doubleHexWordsToCopy )
65         {
66             // Determine if the destination address is aligned
67             const bool isDstDoubleQuadWordAligned =
68                 IsAligned( tempDst, sizeof(DQWORD) );
69 
70             __m128i* mmSrc = (__m128i*)(tempSrc);
71             __m128i* mmDst = reinterpret_cast<__m128i*>(tempDst);
72             __m128i  xmm0, xmm1, xmm2, xmm3;
73 
74             if( isDstDoubleQuadWordAligned )
75             {
76                 for( size_t i=0; i<doubleHexWordsToCopy; i++ )
77                 {
78                     // Sync the WC memory data before issuing the MOVNTDQA instruction.
79                     _mm_mfence();
80                     xmm0 = _mm_stream_load_si128(mmSrc);
81                     xmm1 = _mm_stream_load_si128(mmSrc + 1);
82                     xmm2 = _mm_stream_load_si128(mmSrc + 2);
83                     xmm3 = _mm_stream_load_si128(mmSrc + 3);
84                     mmSrc += 4;
85 
86                     _mm_store_si128(mmDst, xmm0);
87                     _mm_store_si128(mmDst + 1, xmm1);
88                     _mm_store_si128(mmDst + 2, xmm2);
89                     _mm_store_si128(mmDst + 3, xmm3);
90                     mmDst += 4;
91 
92                     tempDst += sizeof(DHWORD);
93                     tempSrc += sizeof(DHWORD);
94                     count -= sizeof(DHWORD);
95                 }
96             }
97             else
98             {
99                 for( size_t i=0; i<doubleHexWordsToCopy; i++ )
100                 {
101                     // Sync the WC memory data before issuing the MOVNTDQA instruction.
102                     _mm_mfence();
103                     xmm0 = _mm_stream_load_si128(mmSrc);
104                     xmm1 = _mm_stream_load_si128(mmSrc + 1);
105                     xmm2 = _mm_stream_load_si128(mmSrc + 2);
106                     xmm3 = _mm_stream_load_si128(mmSrc + 3);
107                     mmSrc += 4;
108 
109                     _mm_storeu_si128(mmDst, xmm0);
110                     _mm_storeu_si128(mmDst + 1, xmm1);
111                     _mm_storeu_si128(mmDst + 2, xmm2);
112                     _mm_storeu_si128(mmDst + 3, xmm3);
113                     mmDst += 4;
114                     tempDst += sizeof(DHWORD);
115                     tempSrc += sizeof(DHWORD);
116                     count -= sizeof(DHWORD);
117                 }
118             }
119         }
120     }
121 
122     // Copy remaining uint8_t(s)
123     if( count )
124     {
125         CmSafeMemCopy( tempDst, tempSrc, count );
126     }
127 }
128 
129 #endif // __SSE4_1__
130