/*
 * Copyright 2020 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "src/base/SkMSAN.h"
#include "src/core/SkMemset.h"
#include <cstddef>
#include <cstdint>

// memset16 and memset32 could work on 32-bit x86 too, but for simplicity just use this on x64
#if (defined(__x86_64__) || defined(_M_X64)) && !defined(SK_ENABLE_OPTIMIZE_SIZE)

static const char* note = "MSAN can't see that repsto initializes memory.";

#if defined(_MSC_VER)
#include <intrin.h>
// MSVC has no inline assembly on x64, so use the rep-stos intrinsics instead.
static inline void repsto(uint16_t* dst, uint16_t v, size_t n) {
    sk_msan_mark_initialized(dst, dst + n, note);
    __stosw(dst, v, n);
}
static inline void repsto(uint32_t* dst, uint32_t v, size_t n) {
    sk_msan_mark_initialized(dst, dst + n, note);
    // __stosd() takes unsigned long; on LLP64 Windows that is 32 bits wide.
    static_assert(sizeof(uint32_t) == sizeof(unsigned long));
    __stosd(reinterpret_cast<unsigned long*>(dst), v, n);
}
static inline void repsto(uint64_t* dst, uint64_t v, size_t n) {
    sk_msan_mark_initialized(dst, dst + n, note);
    __stosq(dst, v, n);
}
#else
// GCC/Clang: emit `rep stos` directly.  "+D" pins dst to RDI and "+c" pins
// the count to RCX (both updated by the instruction), "a" passes the fill
// value in AX/EAX/RAX, and the "memory" clobber tells the compiler the
// stores are real side effects.
static inline void repsto(uint16_t* dst, uint16_t v, size_t n) {
    sk_msan_mark_initialized(dst, dst + n, note);
    asm volatile("rep stosw" : "+D"(dst), "+c"(n) : "a"(v) : "memory");
}
static inline void repsto(uint32_t* dst, uint32_t v, size_t n) {
    sk_msan_mark_initialized(dst, dst + n, note);
    asm volatile("rep stosl" : "+D"(dst), "+c"(n) : "a"(v) : "memory");
}
static inline void repsto(uint64_t* dst, uint64_t v, size_t n) {
    sk_msan_mark_initialized(dst, dst + n, note);
    asm volatile("rep stosq" : "+D"(dst), "+c"(n) : "a"(v) : "memory");
}
#endif
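
// For reference, each repsto() above is semantically equivalent to this
// portable loop (an illustrative sketch; nothing in this file calls it).
// The rep-stos forms are simply much faster for large n on ERMS-capable CPUs.
template <typename T>
static inline void repsto_portable(T* dst, T v, size_t n) {
    for (size_t i = 0; i < n; i++) {
        dst[i] = v;
    }
}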

// ERMS (Enhanced REP MOVSB/STOSB) is ideal for large fills but has a
// relatively high setup cost, so we use the previous best routine for small
// inputs.  FSRM (Fast Short REP MOV) would make this moot.
static void (*g_memset16_prev)(uint16_t*, uint16_t, int);
static void (*g_memset32_prev)(uint32_t*, uint32_t, int);
static void (*g_memset64_prev)(uint64_t*, uint64_t, int);
static void (*g_rect_memset16_prev)(uint16_t*, uint16_t, int, size_t, int);
static void (*g_rect_memset32_prev)(uint32_t*, uint32_t, int, size_t, int);
static void (*g_rect_memset64_prev)(uint64_t*, uint64_t, int, size_t, int);

// Empirically determined with `nanobench -m memset`.
static bool small(size_t bytes) { return bytes < 1024; }

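// Illustrative only: ERMS support is advertised by CPUID.(EAX=7,ECX=0):EBX
// bit 9 (and FSRM by EDX bit 4).  A minimal detection sketch for GCC/Clang;
// cpu_has_erms() is a hypothetical helper, not something Skia calls -- its
// real feature detection lives elsewhere.
#if !defined(_MSC_VER)
    #include <cpuid.h>
    [[maybe_unused]] static bool cpu_has_erms() {
        unsigned eax, ebx, ecx, edx;
        return __get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)  // leaf 7, subleaf 0
            && ((ebx >> 9) & 1);                                // EBX bit 9 == ERMS
    }
#endif
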
namespace erms {

static inline void memset16(uint16_t* dst, uint16_t v, int n) {
    return small(sizeof(v) * n) ? g_memset16_prev(dst, v, n) : repsto(dst, v, n);
}
static inline void memset32(uint32_t* dst, uint32_t v, int n) {
    return small(sizeof(v) * n) ? g_memset32_prev(dst, v, n) : repsto(dst, v, n);
}
static inline void memset64(uint64_t* dst, uint64_t v, int n) {
    return small(sizeof(v) * n) ? g_memset64_prev(dst, v, n) : repsto(dst, v, n);
}
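
// For example, memset32's crossover sits at sizeof(uint32_t) * n == 1024
// bytes: fills of 256 or more 32-bit values take the rep-stos path, and
// anything smaller falls back to the previous routine.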

static inline void rect_memset16(uint16_t* dst, uint16_t v, int n, size_t rowBytes, int height) {
    if (small(sizeof(v) * n)) {
        return g_rect_memset16_prev(dst, v, n, rowBytes, height);
    }
    for (int stride = rowBytes / sizeof(v); height-- > 0; dst += stride) {
        repsto(dst, v, n);
    }
}
static inline void rect_memset32(uint32_t* dst, uint32_t v, int n, size_t rowBytes, int height) {
    if (small(sizeof(v) * n)) {
        return g_rect_memset32_prev(dst, v, n, rowBytes, height);
    }
    for (int stride = rowBytes / sizeof(v); height-- > 0; dst += stride) {
        repsto(dst, v, n);
    }
}
static inline void rect_memset64(uint64_t* dst, uint64_t v, int n, size_t rowBytes, int height) {
    if (small(sizeof(v) * n)) {
        return g_rect_memset64_prev(dst, v, n, rowBytes, height);
    }
    for (int stride = rowBytes / sizeof(v); height-- > 0; dst += stride) {
        repsto(dst, v, n);
    }
}
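
// Each rect fill above writes n values per row for height rows whose starts
// are rowBytes apart.  For example (hypothetical values), on a surface whose
// rows are 2048 bytes wide, rect_memset32(px, 0, 512, 2048, 50) clears a
// 512x50 pixel region, one `rep stosl` per row.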

}  // namespace erms

#endif // X86_64 && !SK_ENABLE_OPTIMIZE_SIZE

namespace SkOpts {
    void Init_Memset_erms() {
        #if (defined(__x86_64__) || defined(_M_X64)) && !defined(SK_ENABLE_OPTIMIZE_SIZE)
            // Remember the routines currently installed so the small-input
            // paths can fall back to them, then swap in the ERMS versions.
            g_memset16_prev      = memset16;
            g_memset32_prev      = memset32;
            g_memset64_prev      = memset64;
            g_rect_memset16_prev = rect_memset16;
            g_rect_memset32_prev = rect_memset32;
            g_rect_memset64_prev = rect_memset64;

            memset16      = erms::memset16;
            memset32      = erms::memset32;
            memset64      = erms::memset64;
            rect_memset16 = erms::rect_memset16;
            rect_memset32 = erms::rect_memset32;
            rect_memset64 = erms::rect_memset64;
        #endif  // X86_64 && !SK_ENABLE_OPTIMIZE_SIZE
    }
}  // namespace SkOpts
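
// A hypothetical caller, for illustration (Skia's actual one-time dispatch
// setup lives elsewhere):
//
//     if (/* CPUID reports ERMS, e.g. via cpu_has_erms() above */) {
//         SkOpts::Init_Memset_erms();  // swap in the rep-stos fast paths
//     }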