#pragma once

#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <forward_list>
#include <new>
#include <type_traits>
#include <utility>

#include <c10/macros/Macros.h>
#include <c10/util/ArrayRef.h>
#include <c10/util/Exception.h>
namespace torch::profiler::impl {

// ============================================================================
// == AppendOnlyList ==========================================================
// ============================================================================
//   During profiling, we have a very predictable access pattern: we only
// append to the end of the container. We can specialize and outperform both
// std::vector (which must realloc) and std::deque (which performs a double
// indirection), and this class of operation is sufficiently important to the
// profiling hot path to warrant specializing:
//   https://godbolt.org/z/rTjozf1c4
//   https://quick-bench.com/q/mmfuu71ogwaiULDCJyHdKnHZms4  (Prototype #1, int)
//   https://quick-bench.com/q/5vWDW6jjdXVdoffev2zst8D09no  (Prototype #1, int pair)
//   https://quick-bench.com/q/IfEkfAQMeJSNBA52xtMP6Agcl-Q  (Prototype #2, int pair)
//   https://quick-bench.com/q/wJV2lKmuXL4XyGJzcI5hs4gEHFg  (Prototype #3, int pair)
//   https://quick-bench.com/q/xiO8ZaBEkYRYUA9dFrMuPLlW9fo  (Full impl, int pair)
// In these benchmarks, AppendOnlyList has roughly half the emplace overhead of
// the more generic STL containers.
//
//   The optimal value of `ChunkSize` will vary by use case, but testing shows
// that a value of 1024 does a good job amortizing the `malloc` cost of growth.
// Performance drops off for larger values, so testing on a case-by-case basis
// is recommended if performance is absolutely critical.

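// A minimal usage sketch (illustrative only; `start_ns`, `end_ns`, and
// `process` are hypothetical stand-ins, not part of this header):
//
//   AppendOnlyList<std::pair<int64_t, int64_t>, 1024> events;
//   auto* event = events.emplace_back(start_ns, end_ns);  // returns T*
//   for (const auto& e : events) {
//     process(e);
//   }
//   events.clear();  // releases all blocks; the list is reusable afterwards
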
template <
    typename T,
    size_t ChunkSize,
    template <typename U, size_t N> class block_t = std::array>
class AppendOnlyList {
 public:
  using array_t = block_t<T, ChunkSize>;
  static_assert(
      std::is_base_of_v<std::array<T, ChunkSize>, array_t>,
      "AppendOnlyList expects raw low level pointer storage.");
  static_assert(ChunkSize > 0, "Block cannot be empty.");

  AppendOnlyList() : buffer_last_{buffer_.before_begin()} {}
  AppendOnlyList(const AppendOnlyList&) = delete;
  AppendOnlyList& operator=(const AppendOnlyList&) = delete;

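  // Total element count: every slot in the `n_blocks_` allocated blocks,
  // minus the `end_ - next_` slots still unfilled in the active block.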
  size_t size() const {
    return n_blocks_ * ChunkSize - (size_t)(end_ - next_);
  }

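  // Constructs a new element at the tail and returns a pointer to it. When
  // both the element and the block are trivially destructible we placement-new
  // directly into raw block storage; otherwise we assign over the slot that
  // the block already initialized.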
  template <class... Args>
  T* emplace_back(Args&&... args) {
    maybe_grow();
    if constexpr (
        std::is_trivially_destructible_v<T> &&
        std::is_trivially_destructible_v<array_t>) {
      ::new ((void*)next_) T{std::forward<Args>(args)...};
    } else {
      *next_ = T{std::forward<Args>(args)...};
    }
    return next_++;
  }

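  // Bulk append for trivially copyable element types: a single `memcpy` when
  // the source fits in the active block, with an element-wise fallback
  // otherwise.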
  template <typename T0>
  std::enable_if_t<std::is_same_v<T0, T> && std::is_trivially_copyable_v<T>>
  copy(c10::ArrayRef<T0> src) {
    size_t n = src.size();
    if (C10_UNLIKELY(n == 0)) {
      return;
    }
    maybe_grow();
    if (C10_LIKELY(next_ && (next_ + n <= end_))) {
      std::memcpy((void*)next_, (void*)src.begin(), n * sizeof(T0));
      next_ += n;
    } else {
      // We could chunk this into several `memcpy`s, but because we expect this
      // fallback to be infrequent (n << ChunkSize) the performance impact is
      // negligible.
      for (auto i : src) {
        emplace_back(i);
      }
    }
  }

  void clear() {
    buffer_.clear();
    buffer_last_ = buffer_.before_begin();
    n_blocks_ = 0;
    next_ = nullptr;
    end_ = nullptr;
  }

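  // Forward iterator over the chunked storage: `block_` walks the
  // forward_list of blocks while `current_ % ChunkSize` indexes into the
  // active block.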
  struct Iterator {
    using iterator_category = std::forward_iterator_tag;
    using difference_type = std::ptrdiff_t;
    using value_type = T;
    using pointer = T*;
    using reference = T&;

    Iterator(std::forward_list<array_t>& buffer, const size_t size)
        : block_{buffer.begin()}, size_{size} {}

    // End iterator.
    Iterator() = default;

    bool exhausted() const {
      return current_ >= size_;
    }

    reference operator*() const {
      return *current_ptr(/*checked=*/true);
    }
    pointer operator->() {
      return current_ptr(/*checked=*/true);
    }

    // Prefix increment
    Iterator& operator++() {
      if (!(++current_ % ChunkSize)) {
        block_++;
      }
      return *this;
    }

    // Postfix increment
    Iterator operator++(int) {
      Iterator tmp = *this;
      ++(*this);
      return tmp;
    }

    friend bool operator==(const Iterator& a, const Iterator& b) {
      return a.current_ptr() == b.current_ptr();
    }
    friend bool operator!=(const Iterator& a, const Iterator& b) {
      return a.current_ptr() != b.current_ptr();
    }

    std::pair<array_t*, size_t> address() const {
      if (current_ >= size_) {
        return {nullptr, 0};
      }
      return {&(*block_), current_ % ChunkSize};
    }

   private:
    T* current_ptr(bool checked = false) const {
      auto a = address();
      if (a.first == nullptr) {
        TORCH_INTERNAL_ASSERT(!checked, "Invalid access on AppendOnlyList.");
        return nullptr;
      }
      return a.first->data() + a.second;
    }

    typename std::forward_list<array_t>::iterator block_;
    size_t current_{0};
    size_t size_{0};
  };

  Iterator begin() {
    return Iterator(buffer_, size());
  }
  Iterator end() {
    return Iterator();
  }
  // TODO: cbegin and cend()

 private:
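  // Appends a fresh block after `buffer_last_` once the active block is
  // exhausted (next_ == end_); `next_` and `end_` are then rebound to the new
  // block's extent. This also covers the freshly-constructed and post-clear()
  // states, where both pointers are nullptr.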
  void maybe_grow() {
    if (C10_UNLIKELY(next_ == end_)) {
      buffer_last_ = buffer_.emplace_after(buffer_last_);
      n_blocks_++;
      next_ = buffer_last_->data();
      end_ = next_ + ChunkSize;
    }
  }

  std::forward_list<array_t> buffer_;

  // We maintain an iterator to the last element of `buffer_` (`buffer_last_`,
  // declared below) so that we can insert at the end in O(1) time.
  size_t n_blocks_{0}; // Number of blocks allocated so far.
  T* next_{nullptr}; // Next slot to write in the active block.
  T* end_{nullptr}; // One past the final slot of the active block.

 protected:
  typename std::forward_list<array_t>::iterator buffer_last_;
};

} // namespace torch::profiler::impl