1*09537850SAkhilesh Sanikop/* 2*09537850SAkhilesh Sanikop * Copyright 2021 The libgav1 Authors 3*09537850SAkhilesh Sanikop * 4*09537850SAkhilesh Sanikop * Licensed under the Apache License, Version 2.0 (the "License"); 5*09537850SAkhilesh Sanikop * you may not use this file except in compliance with the License. 6*09537850SAkhilesh Sanikop * You may obtain a copy of the License at 7*09537850SAkhilesh Sanikop * 8*09537850SAkhilesh Sanikop * http://www.apache.org/licenses/LICENSE-2.0 9*09537850SAkhilesh Sanikop * 10*09537850SAkhilesh Sanikop * Unless required by applicable law or agreed to in writing, software 11*09537850SAkhilesh Sanikop * distributed under the License is distributed on an "AS IS" BASIS, 12*09537850SAkhilesh Sanikop * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13*09537850SAkhilesh Sanikop * See the License for the specific language governing permissions and 14*09537850SAkhilesh Sanikop * limitations under the License. 15*09537850SAkhilesh Sanikop */ 16*09537850SAkhilesh Sanikop 17*09537850SAkhilesh Sanikop//------------------------------------------------------------------------------ 18*09537850SAkhilesh Sanikop// Compatibility functions. 19*09537850SAkhilesh Sanikop 20*09537850SAkhilesh Sanikopinline __m256i SetrM128i(const __m128i lo, const __m128i hi) { 21*09537850SAkhilesh Sanikop // For compatibility with older gcc toolchains (< 8) use 22*09537850SAkhilesh Sanikop // _mm256_inserti128_si256 over _mm256_setr_m128i. Newer gcc implementations 23*09537850SAkhilesh Sanikop // are implemented similarly to the following, clang uses a different method 24*09537850SAkhilesh Sanikop // but no differences in assembly have been observed. 25*09537850SAkhilesh Sanikop return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1); 26*09537850SAkhilesh Sanikop} 27*09537850SAkhilesh Sanikop 28*09537850SAkhilesh Sanikop//------------------------------------------------------------------------------ 29*09537850SAkhilesh Sanikop// Load functions. 30*09537850SAkhilesh Sanikop 31*09537850SAkhilesh Sanikopinline __m256i LoadAligned32(const void* a) { 32*09537850SAkhilesh Sanikop assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0); 33*09537850SAkhilesh Sanikop return _mm256_load_si256(static_cast<const __m256i*>(a)); 34*09537850SAkhilesh Sanikop} 35*09537850SAkhilesh Sanikop 36*09537850SAkhilesh Sanikopinline void LoadAligned64(const void* a, __m256i dst[2]) { 37*09537850SAkhilesh Sanikop assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0); 38*09537850SAkhilesh Sanikop dst[0] = _mm256_load_si256(static_cast<const __m256i*>(a) + 0); 39*09537850SAkhilesh Sanikop dst[1] = _mm256_load_si256(static_cast<const __m256i*>(a) + 1); 40*09537850SAkhilesh Sanikop} 41*09537850SAkhilesh Sanikop 42*09537850SAkhilesh Sanikopinline __m256i LoadUnaligned32(const void* a) { 43*09537850SAkhilesh Sanikop return _mm256_loadu_si256(static_cast<const __m256i*>(a)); 44*09537850SAkhilesh Sanikop} 45*09537850SAkhilesh Sanikop 46*09537850SAkhilesh Sanikop//------------------------------------------------------------------------------ 47*09537850SAkhilesh Sanikop// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning. 48*09537850SAkhilesh Sanikop 49*09537850SAkhilesh Sanikopinline __m256i MaskOverreads(const __m256i source, 50*09537850SAkhilesh Sanikop const ptrdiff_t over_read_in_bytes) { 51*09537850SAkhilesh Sanikop __m256i dst = source; 52*09537850SAkhilesh Sanikop#if LIBGAV1_MSAN 53*09537850SAkhilesh Sanikop if (over_read_in_bytes >= 32) return _mm256_setzero_si256(); 54*09537850SAkhilesh Sanikop if (over_read_in_bytes > 0) { 55*09537850SAkhilesh Sanikop __m128i m = _mm_set1_epi8(-1); 56*09537850SAkhilesh Sanikop for (ptrdiff_t i = 0; i < over_read_in_bytes % 16; ++i) { 57*09537850SAkhilesh Sanikop m = _mm_srli_si128(m, 1); 58*09537850SAkhilesh Sanikop } 59*09537850SAkhilesh Sanikop const __m256i mask = (over_read_in_bytes < 16) 60*09537850SAkhilesh Sanikop ? SetrM128i(_mm_set1_epi8(-1), m) 61*09537850SAkhilesh Sanikop : SetrM128i(m, _mm_setzero_si128()); 62*09537850SAkhilesh Sanikop dst = _mm256_and_si256(dst, mask); 63*09537850SAkhilesh Sanikop } 64*09537850SAkhilesh Sanikop#else 65*09537850SAkhilesh Sanikop static_cast<void>(over_read_in_bytes); 66*09537850SAkhilesh Sanikop#endif 67*09537850SAkhilesh Sanikop return dst; 68*09537850SAkhilesh Sanikop} 69*09537850SAkhilesh Sanikop 70*09537850SAkhilesh Sanikopinline __m256i LoadAligned32Msan(const void* const source, 71*09537850SAkhilesh Sanikop const ptrdiff_t over_read_in_bytes) { 72*09537850SAkhilesh Sanikop return MaskOverreads(LoadAligned32(source), over_read_in_bytes); 73*09537850SAkhilesh Sanikop} 74*09537850SAkhilesh Sanikop 75*09537850SAkhilesh Sanikopinline void LoadAligned64Msan(const void* const source, 76*09537850SAkhilesh Sanikop const ptrdiff_t over_read_in_bytes, 77*09537850SAkhilesh Sanikop __m256i dst[2]) { 78*09537850SAkhilesh Sanikop dst[0] = MaskOverreads(LoadAligned32(source), over_read_in_bytes); 79*09537850SAkhilesh Sanikop dst[1] = MaskOverreads(LoadAligned32(static_cast<const __m256i*>(source) + 1), 80*09537850SAkhilesh Sanikop over_read_in_bytes); 81*09537850SAkhilesh Sanikop} 82*09537850SAkhilesh Sanikop 83*09537850SAkhilesh Sanikopinline __m256i LoadUnaligned32Msan(const void* const source, 84*09537850SAkhilesh Sanikop const ptrdiff_t over_read_in_bytes) { 85*09537850SAkhilesh Sanikop return MaskOverreads(LoadUnaligned32(source), over_read_in_bytes); 86*09537850SAkhilesh Sanikop} 87*09537850SAkhilesh Sanikop 88*09537850SAkhilesh Sanikop//------------------------------------------------------------------------------ 89*09537850SAkhilesh Sanikop// Store functions. 90*09537850SAkhilesh Sanikop 91*09537850SAkhilesh Sanikopinline void StoreAligned32(void* a, const __m256i v) { 92*09537850SAkhilesh Sanikop assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0); 93*09537850SAkhilesh Sanikop _mm256_store_si256(static_cast<__m256i*>(a), v); 94*09537850SAkhilesh Sanikop} 95*09537850SAkhilesh Sanikop 96*09537850SAkhilesh Sanikopinline void StoreAligned64(void* a, const __m256i v[2]) { 97*09537850SAkhilesh Sanikop assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0); 98*09537850SAkhilesh Sanikop _mm256_store_si256(static_cast<__m256i*>(a) + 0, v[0]); 99*09537850SAkhilesh Sanikop _mm256_store_si256(static_cast<__m256i*>(a) + 1, v[1]); 100*09537850SAkhilesh Sanikop} 101*09537850SAkhilesh Sanikop 102*09537850SAkhilesh Sanikopinline void StoreUnaligned32(void* a, const __m256i v) { 103*09537850SAkhilesh Sanikop _mm256_storeu_si256(static_cast<__m256i*>(a), v); 104*09537850SAkhilesh Sanikop} 105*09537850SAkhilesh Sanikop 106*09537850SAkhilesh Sanikop//------------------------------------------------------------------------------ 107*09537850SAkhilesh Sanikop// Arithmetic utilities. 108*09537850SAkhilesh Sanikop 109*09537850SAkhilesh Sanikopinline __m256i RightShiftWithRounding_S16(const __m256i v_val_d, int bits) { 110*09537850SAkhilesh Sanikop assert(bits <= 16); 111*09537850SAkhilesh Sanikop const __m256i v_bias_d = 112*09537850SAkhilesh Sanikop _mm256_set1_epi16(static_cast<int16_t>((1 << bits) >> 1)); 113*09537850SAkhilesh Sanikop const __m256i v_tmp_d = _mm256_add_epi16(v_val_d, v_bias_d); 114*09537850SAkhilesh Sanikop return _mm256_srai_epi16(v_tmp_d, bits); 115*09537850SAkhilesh Sanikop} 116*09537850SAkhilesh Sanikop 117*09537850SAkhilesh Sanikopinline __m256i RightShiftWithRounding_S32(const __m256i v_val_d, int bits) { 118*09537850SAkhilesh Sanikop const __m256i v_bias_d = _mm256_set1_epi32((1 << bits) >> 1); 119*09537850SAkhilesh Sanikop const __m256i v_tmp_d = _mm256_add_epi32(v_val_d, v_bias_d); 120*09537850SAkhilesh Sanikop return _mm256_srai_epi32(v_tmp_d, bits); 121*09537850SAkhilesh Sanikop} 122