xref: /aosp_15_r20/external/cronet/third_party/re2/src/python/_re2.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2019 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #include <memory>
6 #include <string>
7 #include <tuple>
8 #include <utility>
9 #include <vector>
10 
11 #include <pybind11/pybind11.h>
12 #include <pybind11/stl.h>
13 #include "absl/strings/string_view.h"
14 #include "re2/filtered_re2.h"
15 #include "re2/re2.h"
16 #include "re2/set.h"
17 
18 #ifdef _WIN32
19 #include <basetsd.h>
20 #define ssize_t SSIZE_T
21 #endif
22 
23 namespace re2_python {
24 
25 // This is conventional.
26 namespace py = pybind11;
27 
28 // In terms of the pybind11 API, a py::buffer is merely a py::object that
29 // supports the buffer interface/protocol and you must explicitly request
30 // a py::buffer_info in order to access the actual bytes. Under the hood,
31 // the py::buffer_info manages a reference count to the py::buffer, so it
32 // must be constructed and subsequently destructed while holding the GIL.
FromBytes(const py::buffer_info & bytes)33 static inline absl::string_view FromBytes(const py::buffer_info& bytes) {
34   char* data = reinterpret_cast<char*>(bytes.ptr);
35   ssize_t size = bytes.size;
36   return absl::string_view(data, size);
37 }
38 
// Returns how many bytes the character starting at *ptr occupies, judged
// solely by the high nibble of that first byte:
//   0x0-0xB -> 1, 0xC-0xD -> 2, 0xE -> 3, 0xF -> 4.
static inline int OneCharLen(const char* ptr) {
  static const char kLengthByHighNibble[] =
      "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4";
  // Go through unsigned char so that bytes >= 0x80 index the table correctly.
  const unsigned char lead = static_cast<unsigned char>(*ptr);
  return kLengthByHighNibble[lead >> 4];
}
42 
43 // Helper function for when Python encodes str to bytes and then needs to
44 // convert str offsets to bytes offsets. Assumes that text is valid UTF-8.
CharLenToBytes(py::buffer buffer,ssize_t pos,ssize_t len)45 ssize_t CharLenToBytes(py::buffer buffer, ssize_t pos, ssize_t len) {
46   auto bytes = buffer.request();
47   auto text = FromBytes(bytes);
48   auto ptr = text.data() + pos;
49   auto end = text.data() + text.size();
50   while (ptr < end && len > 0) {
51     ptr += OneCharLen(ptr);
52     --len;
53   }
54   return ptr - (text.data() + pos);
55 }
56 
57 // Helper function for when Python decodes bytes to str and then needs to
58 // convert bytes offsets to str offsets. Assumes that text is valid UTF-8.
BytesToCharLen(py::buffer buffer,ssize_t pos,ssize_t endpos)59 ssize_t BytesToCharLen(py::buffer buffer, ssize_t pos, ssize_t endpos) {
60   auto bytes = buffer.request();
61   auto text = FromBytes(bytes);
62   auto ptr = text.data() + pos;
63   auto end = text.data() + endpos;
64   ssize_t len = 0;
65   while (ptr < end) {
66     ptr += OneCharLen(ptr);
67     ++len;
68   }
69   return len;
70 }
71 
RE2InitShim(py::buffer buffer,const RE2::Options & options)72 std::unique_ptr<RE2> RE2InitShim(py::buffer buffer,
73                                  const RE2::Options& options) {
74   auto bytes = buffer.request();
75   auto pattern = FromBytes(bytes);
76   return std::make_unique<RE2>(pattern, options);
77 }
78 
RE2ErrorShim(const RE2 & self)79 py::bytes RE2ErrorShim(const RE2& self) {
80   // Return std::string as bytes. That is, without decoding to str.
81   return self.error();
82 }
83 
RE2NamedCapturingGroupsShim(const RE2 & self)84 std::vector<std::pair<py::bytes, int>> RE2NamedCapturingGroupsShim(
85     const RE2& self) {
86   const int num_groups = self.NumberOfCapturingGroups();
87   std::vector<std::pair<py::bytes, int>> groups;
88   groups.reserve(num_groups);
89   for (const auto& it : self.NamedCapturingGroups()) {
90     groups.emplace_back(it.first, it.second);
91   }
92   return groups;
93 }
94 
RE2ProgramFanoutShim(const RE2 & self)95 std::vector<int> RE2ProgramFanoutShim(const RE2& self) {
96   std::vector<int> histogram;
97   self.ProgramFanout(&histogram);
98   return histogram;
99 }
100 
RE2ReverseProgramFanoutShim(const RE2 & self)101 std::vector<int> RE2ReverseProgramFanoutShim(const RE2& self) {
102   std::vector<int> histogram;
103   self.ReverseProgramFanout(&histogram);
104   return histogram;
105 }
106 
RE2PossibleMatchRangeShim(const RE2 & self,int maxlen)107 std::tuple<bool, py::bytes, py::bytes> RE2PossibleMatchRangeShim(
108     const RE2& self, int maxlen) {
109   std::string min, max;
110   // Return std::string as bytes. That is, without decoding to str.
111   return {self.PossibleMatchRange(&min, &max, maxlen), min, max};
112 }
113 
RE2MatchShim(const RE2 & self,RE2::Anchor anchor,py::buffer buffer,ssize_t pos,ssize_t endpos)114 std::vector<std::pair<ssize_t, ssize_t>> RE2MatchShim(const RE2& self,
115                                                       RE2::Anchor anchor,
116                                                       py::buffer buffer,
117                                                       ssize_t pos,
118                                                       ssize_t endpos) {
119   auto bytes = buffer.request();
120   auto text = FromBytes(bytes);
121   const int num_groups = self.NumberOfCapturingGroups() + 1;  // need $0
122   std::vector<absl::string_view> groups;
123   groups.resize(num_groups);
124   py::gil_scoped_release release_gil;
125   if (!self.Match(text, pos, endpos, anchor, groups.data(), groups.size())) {
126     // Ensure that groups are null before converting to spans!
127     for (auto& it : groups) {
128       it = absl::string_view();
129     }
130   }
131   std::vector<std::pair<ssize_t, ssize_t>> spans;
132   spans.reserve(num_groups);
133   for (const auto& it : groups) {
134     if (it.data() == NULL) {
135       spans.emplace_back(-1, -1);
136     } else {
137       spans.emplace_back(it.data() - text.data(),
138                          it.data() - text.data() + it.size());
139     }
140   }
141   return spans;
142 }
143 
RE2QuoteMetaShim(py::buffer buffer)144 py::bytes RE2QuoteMetaShim(py::buffer buffer) {
145   auto bytes = buffer.request();
146   auto pattern = FromBytes(bytes);
147   // Return std::string as bytes. That is, without decoding to str.
148   return RE2::QuoteMeta(pattern);
149 }
150 
151 class Set {
152  public:
Set(RE2::Anchor anchor,const RE2::Options & options)153   Set(RE2::Anchor anchor, const RE2::Options& options)
154       : set_(options, anchor) {}
155 
156   ~Set() = default;
157 
158   // Not copyable or movable.
159   Set(const Set&) = delete;
160   Set& operator=(const Set&) = delete;
161 
Add(py::buffer buffer)162   int Add(py::buffer buffer) {
163     auto bytes = buffer.request();
164     auto pattern = FromBytes(bytes);
165     int index = set_.Add(pattern, /*error=*/NULL);  // -1 on error
166     return index;
167   }
168 
Compile()169   bool Compile() {
170     // Compiling can fail.
171     return set_.Compile();
172   }
173 
Match(py::buffer buffer) const174   std::vector<int> Match(py::buffer buffer) const {
175     auto bytes = buffer.request();
176     auto text = FromBytes(bytes);
177     std::vector<int> matches;
178     py::gil_scoped_release release_gil;
179     set_.Match(text, &matches);
180     return matches;
181   }
182 
183  private:
184   RE2::Set set_;
185 };
186 
187 class Filter {
188  public:
189   Filter() = default;
190   ~Filter() = default;
191 
192   // Not copyable or movable.
193   Filter(const Filter&) = delete;
194   Filter& operator=(const Filter&) = delete;
195 
Add(py::buffer buffer,const RE2::Options & options)196   int Add(py::buffer buffer, const RE2::Options& options) {
197     auto bytes = buffer.request();
198     auto pattern = FromBytes(bytes);
199     int index = -1;  // not clobbered on error
200     filter_.Add(pattern, options, &index);
201     return index;
202   }
203 
Compile()204   bool Compile() {
205     std::vector<std::string> atoms;
206     filter_.Compile(&atoms);
207     RE2::Options options;
208     options.set_literal(true);
209     options.set_case_sensitive(false);
210     set_ = std::make_unique<RE2::Set>(options, RE2::UNANCHORED);
211     for (int i = 0; i < static_cast<int>(atoms.size()); ++i) {
212       if (set_->Add(atoms[i], /*error=*/NULL) != i) {
213         // Should never happen: the atom is a literal!
214         py::pybind11_fail("set_->Add() failed");
215       }
216     }
217     // Compiling can fail.
218     return set_->Compile();
219   }
220 
Match(py::buffer buffer,bool potential) const221   std::vector<int> Match(py::buffer buffer, bool potential) const {
222     if (set_ == nullptr) {
223       py::pybind11_fail("Match() called before compiling");
224     }
225 
226     auto bytes = buffer.request();
227     auto text = FromBytes(bytes);
228     std::vector<int> atoms;
229     py::gil_scoped_release release_gil;
230     set_->Match(text, &atoms);
231     std::vector<int> matches;
232     if (potential) {
233       filter_.AllPotentials(atoms, &matches);
234     } else {
235       filter_.AllMatches(text, atoms, &matches);
236     }
237     return matches;
238   }
239 
GetRE2(int index) const240   const RE2& GetRE2(int index) const {
241     return filter_.GetRE2(index);
242   }
243 
244  private:
245   re2::FilteredRE2 filter_;
246   std::unique_ptr<RE2::Set> set_;
247 };
248 
PYBIND11_MODULE(_re2,module)249 PYBIND11_MODULE(_re2, module) {
250   // Translate exceptions thrown by py::pybind11_fail() into Python.
251   py::register_local_exception<std::runtime_error>(module, "Error");
252 
253   module.def("CharLenToBytes", &CharLenToBytes);
254   module.def("BytesToCharLen", &BytesToCharLen);
255 
256   // CLASSES
257   //     class RE2
258   //         enum Anchor
259   //         class Options
260   //             enum Encoding
261   //     class Set
262   //     class Filter
263   py::class_<RE2> re2(module, "RE2");
264   py::enum_<RE2::Anchor> anchor(re2, "Anchor");
265   py::class_<RE2::Options> options(re2, "Options");
266   py::enum_<RE2::Options::Encoding> encoding(options, "Encoding");
267   py::class_<Set> set(module, "Set");
268   py::class_<Filter> filter(module, "Filter");
269 
270   anchor.value("UNANCHORED", RE2::Anchor::UNANCHORED);
271   anchor.value("ANCHOR_START", RE2::Anchor::ANCHOR_START);
272   anchor.value("ANCHOR_BOTH", RE2::Anchor::ANCHOR_BOTH);
273 
274   encoding.value("UTF8", RE2::Options::Encoding::EncodingUTF8);
275   encoding.value("LATIN1", RE2::Options::Encoding::EncodingLatin1);
276 
277   options.def(py::init<>())
278       .def_property("max_mem",                          //
279                     &RE2::Options::max_mem,             //
280                     &RE2::Options::set_max_mem)         //
281       .def_property("encoding",                         //
282                     &RE2::Options::encoding,            //
283                     &RE2::Options::set_encoding)        //
284       .def_property("posix_syntax",                     //
285                     &RE2::Options::posix_syntax,        //
286                     &RE2::Options::set_posix_syntax)    //
287       .def_property("longest_match",                    //
288                     &RE2::Options::longest_match,       //
289                     &RE2::Options::set_longest_match)   //
290       .def_property("log_errors",                       //
291                     &RE2::Options::log_errors,          //
292                     &RE2::Options::set_log_errors)      //
293       .def_property("literal",                          //
294                     &RE2::Options::literal,             //
295                     &RE2::Options::set_literal)         //
296       .def_property("never_nl",                         //
297                     &RE2::Options::never_nl,            //
298                     &RE2::Options::set_never_nl)        //
299       .def_property("dot_nl",                           //
300                     &RE2::Options::dot_nl,              //
301                     &RE2::Options::set_dot_nl)          //
302       .def_property("never_capture",                    //
303                     &RE2::Options::never_capture,       //
304                     &RE2::Options::set_never_capture)   //
305       .def_property("case_sensitive",                   //
306                     &RE2::Options::case_sensitive,      //
307                     &RE2::Options::set_case_sensitive)  //
308       .def_property("perl_classes",                     //
309                     &RE2::Options::perl_classes,        //
310                     &RE2::Options::set_perl_classes)    //
311       .def_property("word_boundary",                    //
312                     &RE2::Options::word_boundary,       //
313                     &RE2::Options::set_word_boundary)   //
314       .def_property("one_line",                         //
315                     &RE2::Options::one_line,            //
316                     &RE2::Options::set_one_line);       //
317 
318   re2.def(py::init(&RE2InitShim))
319       .def("ok", &RE2::ok)
320       .def("error", &RE2ErrorShim)
321       .def("options", &RE2::options)
322       .def("NumberOfCapturingGroups", &RE2::NumberOfCapturingGroups)
323       .def("NamedCapturingGroups", &RE2NamedCapturingGroupsShim)
324       .def("ProgramSize", &RE2::ProgramSize)
325       .def("ReverseProgramSize", &RE2::ReverseProgramSize)
326       .def("ProgramFanout", &RE2ProgramFanoutShim)
327       .def("ReverseProgramFanout", &RE2ReverseProgramFanoutShim)
328       .def("PossibleMatchRange", &RE2PossibleMatchRangeShim)
329       .def("Match", &RE2MatchShim)
330       .def_static("QuoteMeta", &RE2QuoteMetaShim);
331 
332   set.def(py::init<RE2::Anchor, const RE2::Options&>())
333       .def("Add", &Set::Add)
334       .def("Compile", &Set::Compile)
335       .def("Match", &Set::Match);
336 
337   filter.def(py::init<>())
338       .def("Add", &Filter::Add)
339       .def("Compile", &Filter::Compile)
340       .def("Match", &Filter::Match)
341       .def("GetRE2", &Filter::GetRE2,
342            py::return_value_policy::reference_internal);
343 }
344 
345 }  // namespace re2_python
346