#pragma once

#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <forward_list>
#include <iterator>
#include <new>
#include <type_traits>
#include <utility>

#include <c10/macros/Macros.h>
#include <c10/util/ArrayRef.h>
#include <c10/util/Exception.h>

namespace torch::profiler::impl {

// ============================================================================
// == AppendOnlyList ==========================================================
// ============================================================================
//   During profiling we have a very predictable access pattern: we only ever
// append to the end of the container. We can specialize for this and
// outperform both std::vector (which must realloc) and std::deque (which
// performs a double indirection), and this class of operation is sufficiently
// important to the profiling hot path to warrant specializing:
//   https://godbolt.org/z/rTjozf1c4
//   https://quick-bench.com/q/mmfuu71ogwaiULDCJyHdKnHZms4  (Prototype #1, int)
//   https://quick-bench.com/q/5vWDW6jjdXVdoffev2zst8D09no  (Prototype #1, int pair)
//   https://quick-bench.com/q/IfEkfAQMeJSNBA52xtMP6Agcl-Q  (Prototype #2, int pair)
//   https://quick-bench.com/q/wJV2lKmuXL4XyGJzcI5hs4gEHFg  (Prototype #3, int pair)
//   https://quick-bench.com/q/xiO8ZaBEkYRYUA9dFrMuPLlW9fo  (Full impl, int pair)
// AppendOnlyList has 2x lower emplace overhead compared to more generic STL
// containers.
//
//   The optimal value of `ChunkSize` will vary by use case, but testing shows
// that a value of 1024 does a good job of amortizing the `malloc` cost of
// growth. Performance drops off for larger values, so testing on a
// case-by-case basis is recommended if performance is absolutely critical.

template <
    typename T,
    size_t ChunkSize,
    template <typename U, size_t N> class block_t = std::array>
class AppendOnlyList {
 public:
  using array_t = block_t<T, ChunkSize>;
  static_assert(
      std::is_base_of_v<std::array<T, ChunkSize>, array_t>,
      "AppendOnlyList expects raw low level pointer storage.");
  static_assert(ChunkSize > 0, "Block cannot be empty.");

  AppendOnlyList() : buffer_last_{buffer_.before_begin()} {}
  AppendOnlyList(const AppendOnlyList&) = delete;
  AppendOnlyList& operator=(const AppendOnlyList&) = delete;

  // Number of elements appended so far: the capacity of all allocated blocks
  // minus the unused tail of the current block.
  size_t size() const {
    return n_blocks_ * ChunkSize - (size_t)(end_ - next_);
  }

  template <class... Args>
  T* emplace_back(Args&&... args) {
    maybe_grow();

    // Slots in a block are already constructed when the block is allocated,
    // so for trivially destructible types we can placement-new over a slot;
    // otherwise we fall back to assignment.
    if constexpr (
        std::is_trivially_destructible_v<T> &&
        std::is_trivially_destructible_v<array_t>) {
      ::new ((void*)next_) T{std::forward<Args>(args)...};
    } else {
      *next_ = T{std::forward<Args>(args)...};
    }
    return next_++;
  }

  template <typename T0>
  std::enable_if_t<std::is_same_v<T0, T> && std::is_trivially_copyable_v<T>>
  copy(c10::ArrayRef<T0> src) {
    size_t n = src.size();
    if (C10_UNLIKELY(n == 0)) {
      return;
    }
    maybe_grow();
    if (C10_LIKELY(next_ && (next_ + n <= end_))) {
      std::memcpy((void*)next_, (void*)src.begin(), n * sizeof(T0));
      next_ += n;
    } else {
      // We could chunk this into several `memcpy`s, but because we expect this
      // fallback to be infrequent (n << ChunkSize) the performance impact is
      // negligible.
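      // Note that `emplace_back` calls `maybe_grow()` for each element, so
      // this path stays correct even when the copy spans a block boundary.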
      for (auto i : src) {
        emplace_back(i);
      }
    }
  }

  void clear() {
    buffer_.clear();
    buffer_last_ = buffer_.before_begin();
    n_blocks_ = 0;
    next_ = nullptr;
    end_ = nullptr;
  }

  struct Iterator {
    using iterator_category = std::forward_iterator_tag;
    using difference_type = std::ptrdiff_t;
    using value_type = T;
    using pointer = T*;
    using reference = T&;

    Iterator(std::forward_list<array_t>& buffer, const size_t size)
        : block_{buffer.begin()}, size_{size} {}

    // End iterator.
    Iterator() = default;

    bool exhausted() const {
      return current_ >= size_;
    }

    reference operator*() const {
      return *current_ptr(/*checked=*/true);
    }
    pointer operator->() {
      return current_ptr(/*checked=*/true);
    }

    // Prefix increment
    Iterator& operator++() {
      if (!(++current_ % ChunkSize)) {
        block_++;
      }
      return *this;
    }

    // Postfix increment
    Iterator operator++(int) {
      Iterator tmp = *this;
      ++(*this);
      return tmp;
    }

    friend bool operator==(const Iterator& a, const Iterator& b) {
      return a.current_ptr() == b.current_ptr();
    }
    friend bool operator!=(const Iterator& a, const Iterator& b) {
      return a.current_ptr() != b.current_ptr();
    }

    std::pair<array_t*, size_t> address() const {
      if (current_ >= size_) {
        return {nullptr, 0};
      }
      return {&(*block_), current_ % ChunkSize};
    }

   private:
    T* current_ptr(bool checked = false) const {
      auto a = address();
      if (a.first == nullptr) {
        TORCH_INTERNAL_ASSERT(!checked, "Invalid access on AppendOnlyList.");
        return nullptr;
      }
      return a.first->data() + a.second;
    }

    typename std::forward_list<array_t>::iterator block_;
    size_t current_{0};
    size_t size_{0};
  };

  Iterator begin() {
    return Iterator(buffer_, size());
  }
  Iterator end() {
    return Iterator();
  }
  // TODO: cbegin() and cend()

 private:
  // Add a new block when the current one is exhausted (or on first use).
  void maybe_grow() {
    if (C10_UNLIKELY(next_ == end_)) {
      buffer_last_ = buffer_.emplace_after(buffer_last_);
      n_blocks_++;
      next_ = buffer_last_->data();
      end_ = next_ + ChunkSize;
    }
  }

  std::forward_list<array_t> buffer_;
  size_t n_blocks_{0};
  T* next_{nullptr};
  T* end_{nullptr};

 protected:
  // We maintain an iterator to the last block of `buffer_` so that we can
  // append new blocks in O(1) time.
  typename std::forward_list<array_t>::iterator buffer_last_;
};

} // namespace torch::profiler::impl
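// ============================================================================
// == Example usage (illustrative sketch) =====================================
// ============================================================================
//   The snippet below only illustrates the append / iterate / clear lifecycle;
// the event type, field names, and values are hypothetical and are not part of
// this header or the profiler API.
//
//   struct ExampleEvent {
//     int64_t start_ns_;
//     int64_t end_ns_;
//   };
//
//   torch::profiler::impl::AppendOnlyList<ExampleEvent, 1024> events;
//   for (int64_t i = 0; i < 10; ++i) {
//     events.emplace_back(i, i + 1);  // amortized O(1); mallocs once per block
//   }
//   for (const auto& e : events) {    // chunk-aware forward iteration
//     (void)e.start_ns_;
//   }
//   events.clear();                   // releases all blocks; list is reusable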