1 // Copyright 2019 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include <memory>
6 #include <string>
7 #include <tuple>
8 #include <utility>
9 #include <vector>
10
11 #include <pybind11/pybind11.h>
12 #include <pybind11/stl.h>
13 #include "absl/strings/string_view.h"
14 #include "re2/filtered_re2.h"
15 #include "re2/re2.h"
16 #include "re2/set.h"
17
18 #ifdef _WIN32
19 #include <basetsd.h>
20 #define ssize_t SSIZE_T
21 #endif
22
23 namespace re2_python {
24
25 // This is conventional.
26 namespace py = pybind11;
27
28 // In terms of the pybind11 API, a py::buffer is merely a py::object that
29 // supports the buffer interface/protocol and you must explicitly request
30 // a py::buffer_info in order to access the actual bytes. Under the hood,
31 // the py::buffer_info manages a reference count to the py::buffer, so it
32 // must be constructed and subsequently destructed while holding the GIL.
FromBytes(const py::buffer_info & bytes)33 static inline absl::string_view FromBytes(const py::buffer_info& bytes) {
34 char* data = reinterpret_cast<char*>(bytes.ptr);
35 ssize_t size = bytes.size;
36 return absl::string_view(data, size);
37 }
38
// Returns the length in bytes (1 to 4) of the UTF-8 sequence whose first byte
// is `*ptr`, chosen from the top four bits of that byte. Bytes whose top
// nibble is below 0xC (ASCII and continuation bytes) count as length 1.
static inline int OneCharLen(const char* ptr) {
  // Indexed by the high nibble of the lead byte.
  static constexpr char kLengthByHighNibble[16] = {
      1, 1, 1, 1, 1, 1, 1, 1,  // 0x0_ .. 0x7_
      1, 1, 1, 1,              // 0x8_ .. 0xB_
      2, 2,                    // 0xC_ .. 0xD_
      3,                       // 0xE_
      4,                       // 0xF_
  };
  const unsigned lead = static_cast<unsigned char>(*ptr);
  return kLengthByHighNibble[lead >> 4];
}
42
43 // Helper function for when Python encodes str to bytes and then needs to
44 // convert str offsets to bytes offsets. Assumes that text is valid UTF-8.
CharLenToBytes(py::buffer buffer,ssize_t pos,ssize_t len)45 ssize_t CharLenToBytes(py::buffer buffer, ssize_t pos, ssize_t len) {
46 auto bytes = buffer.request();
47 auto text = FromBytes(bytes);
48 auto ptr = text.data() + pos;
49 auto end = text.data() + text.size();
50 while (ptr < end && len > 0) {
51 ptr += OneCharLen(ptr);
52 --len;
53 }
54 return ptr - (text.data() + pos);
55 }
56
57 // Helper function for when Python decodes bytes to str and then needs to
58 // convert bytes offsets to str offsets. Assumes that text is valid UTF-8.
BytesToCharLen(py::buffer buffer,ssize_t pos,ssize_t endpos)59 ssize_t BytesToCharLen(py::buffer buffer, ssize_t pos, ssize_t endpos) {
60 auto bytes = buffer.request();
61 auto text = FromBytes(bytes);
62 auto ptr = text.data() + pos;
63 auto end = text.data() + endpos;
64 ssize_t len = 0;
65 while (ptr < end) {
66 ptr += OneCharLen(ptr);
67 ++len;
68 }
69 return len;
70 }
71
RE2InitShim(py::buffer buffer,const RE2::Options & options)72 std::unique_ptr<RE2> RE2InitShim(py::buffer buffer,
73 const RE2::Options& options) {
74 auto bytes = buffer.request();
75 auto pattern = FromBytes(bytes);
76 return std::make_unique<RE2>(pattern, options);
77 }
78
RE2ErrorShim(const RE2 & self)79 py::bytes RE2ErrorShim(const RE2& self) {
80 // Return std::string as bytes. That is, without decoding to str.
81 return self.error();
82 }
83
RE2NamedCapturingGroupsShim(const RE2 & self)84 std::vector<std::pair<py::bytes, int>> RE2NamedCapturingGroupsShim(
85 const RE2& self) {
86 const int num_groups = self.NumberOfCapturingGroups();
87 std::vector<std::pair<py::bytes, int>> groups;
88 groups.reserve(num_groups);
89 for (const auto& it : self.NamedCapturingGroups()) {
90 groups.emplace_back(it.first, it.second);
91 }
92 return groups;
93 }
94
RE2ProgramFanoutShim(const RE2 & self)95 std::vector<int> RE2ProgramFanoutShim(const RE2& self) {
96 std::vector<int> histogram;
97 self.ProgramFanout(&histogram);
98 return histogram;
99 }
100
RE2ReverseProgramFanoutShim(const RE2 & self)101 std::vector<int> RE2ReverseProgramFanoutShim(const RE2& self) {
102 std::vector<int> histogram;
103 self.ReverseProgramFanout(&histogram);
104 return histogram;
105 }
106
RE2PossibleMatchRangeShim(const RE2 & self,int maxlen)107 std::tuple<bool, py::bytes, py::bytes> RE2PossibleMatchRangeShim(
108 const RE2& self, int maxlen) {
109 std::string min, max;
110 // Return std::string as bytes. That is, without decoding to str.
111 return {self.PossibleMatchRange(&min, &max, maxlen), min, max};
112 }
113
RE2MatchShim(const RE2 & self,RE2::Anchor anchor,py::buffer buffer,ssize_t pos,ssize_t endpos)114 std::vector<std::pair<ssize_t, ssize_t>> RE2MatchShim(const RE2& self,
115 RE2::Anchor anchor,
116 py::buffer buffer,
117 ssize_t pos,
118 ssize_t endpos) {
119 auto bytes = buffer.request();
120 auto text = FromBytes(bytes);
121 const int num_groups = self.NumberOfCapturingGroups() + 1; // need $0
122 std::vector<absl::string_view> groups;
123 groups.resize(num_groups);
124 py::gil_scoped_release release_gil;
125 if (!self.Match(text, pos, endpos, anchor, groups.data(), groups.size())) {
126 // Ensure that groups are null before converting to spans!
127 for (auto& it : groups) {
128 it = absl::string_view();
129 }
130 }
131 std::vector<std::pair<ssize_t, ssize_t>> spans;
132 spans.reserve(num_groups);
133 for (const auto& it : groups) {
134 if (it.data() == NULL) {
135 spans.emplace_back(-1, -1);
136 } else {
137 spans.emplace_back(it.data() - text.data(),
138 it.data() - text.data() + it.size());
139 }
140 }
141 return spans;
142 }
143
RE2QuoteMetaShim(py::buffer buffer)144 py::bytes RE2QuoteMetaShim(py::buffer buffer) {
145 auto bytes = buffer.request();
146 auto pattern = FromBytes(bytes);
147 // Return std::string as bytes. That is, without decoding to str.
148 return RE2::QuoteMeta(pattern);
149 }
150
151 class Set {
152 public:
Set(RE2::Anchor anchor,const RE2::Options & options)153 Set(RE2::Anchor anchor, const RE2::Options& options)
154 : set_(options, anchor) {}
155
156 ~Set() = default;
157
158 // Not copyable or movable.
159 Set(const Set&) = delete;
160 Set& operator=(const Set&) = delete;
161
Add(py::buffer buffer)162 int Add(py::buffer buffer) {
163 auto bytes = buffer.request();
164 auto pattern = FromBytes(bytes);
165 int index = set_.Add(pattern, /*error=*/NULL); // -1 on error
166 return index;
167 }
168
Compile()169 bool Compile() {
170 // Compiling can fail.
171 return set_.Compile();
172 }
173
Match(py::buffer buffer) const174 std::vector<int> Match(py::buffer buffer) const {
175 auto bytes = buffer.request();
176 auto text = FromBytes(bytes);
177 std::vector<int> matches;
178 py::gil_scoped_release release_gil;
179 set_.Match(text, &matches);
180 return matches;
181 }
182
183 private:
184 RE2::Set set_;
185 };
186
187 class Filter {
188 public:
189 Filter() = default;
190 ~Filter() = default;
191
192 // Not copyable or movable.
193 Filter(const Filter&) = delete;
194 Filter& operator=(const Filter&) = delete;
195
Add(py::buffer buffer,const RE2::Options & options)196 int Add(py::buffer buffer, const RE2::Options& options) {
197 auto bytes = buffer.request();
198 auto pattern = FromBytes(bytes);
199 int index = -1; // not clobbered on error
200 filter_.Add(pattern, options, &index);
201 return index;
202 }
203
Compile()204 bool Compile() {
205 std::vector<std::string> atoms;
206 filter_.Compile(&atoms);
207 RE2::Options options;
208 options.set_literal(true);
209 options.set_case_sensitive(false);
210 set_ = std::make_unique<RE2::Set>(options, RE2::UNANCHORED);
211 for (int i = 0; i < static_cast<int>(atoms.size()); ++i) {
212 if (set_->Add(atoms[i], /*error=*/NULL) != i) {
213 // Should never happen: the atom is a literal!
214 py::pybind11_fail("set_->Add() failed");
215 }
216 }
217 // Compiling can fail.
218 return set_->Compile();
219 }
220
Match(py::buffer buffer,bool potential) const221 std::vector<int> Match(py::buffer buffer, bool potential) const {
222 if (set_ == nullptr) {
223 py::pybind11_fail("Match() called before compiling");
224 }
225
226 auto bytes = buffer.request();
227 auto text = FromBytes(bytes);
228 std::vector<int> atoms;
229 py::gil_scoped_release release_gil;
230 set_->Match(text, &atoms);
231 std::vector<int> matches;
232 if (potential) {
233 filter_.AllPotentials(atoms, &matches);
234 } else {
235 filter_.AllMatches(text, atoms, &matches);
236 }
237 return matches;
238 }
239
GetRE2(int index) const240 const RE2& GetRE2(int index) const {
241 return filter_.GetRE2(index);
242 }
243
244 private:
245 re2::FilteredRE2 filter_;
246 std::unique_ptr<RE2::Set> set_;
247 };
248
// Defines the `_re2` extension module. It registers the module-local "Error"
// exception, the two offset-conversion helpers, and the RE2, RE2.Anchor,
// RE2.Options, RE2.Options.Encoding, Set and Filter types with their methods.
PYBIND11_MODULE(_re2, module) {
  // Translate exceptions thrown by py::pybind11_fail() into Python.
  py::register_local_exception<std::runtime_error>(module, "Error");

  module.def("CharLenToBytes", &CharLenToBytes);
  module.def("BytesToCharLen", &BytesToCharLen);

  // CLASSES
  //     class RE2
  //         enum Anchor
  //         class Options
  //             enum Encoding
  //     class Set
  //     class Filter
  //
  // All types are declared here, before any method is attached below.
  // NOTE(review): presumably this is so that every type is already registered
  // when a signature that mentions it is defined -- confirm before reordering.
  py::class_<RE2> re2(module, "RE2");
  py::enum_<RE2::Anchor> anchor(re2, "Anchor");
  py::class_<RE2::Options> options(re2, "Options");
  py::enum_<RE2::Options::Encoding> encoding(options, "Encoding");
  py::class_<Set> set(module, "Set");
  py::class_<Filter> filter(module, "Filter");

  anchor.value("UNANCHORED", RE2::Anchor::UNANCHORED);
  anchor.value("ANCHOR_START", RE2::Anchor::ANCHOR_START);
  anchor.value("ANCHOR_BOTH", RE2::Anchor::ANCHOR_BOTH);

  encoding.value("UTF8", RE2::Options::Encoding::EncodingUTF8);
  encoding.value("LATIN1", RE2::Options::Encoding::EncodingLatin1);

  // Each option is exposed as a read/write property backed by the getter and
  // setter of the same name on RE2::Options.
  options.def(py::init<>())
      .def_property("max_mem",                          //
                    &RE2::Options::max_mem,             //
                    &RE2::Options::set_max_mem)         //
      .def_property("encoding",                         //
                    &RE2::Options::encoding,            //
                    &RE2::Options::set_encoding)        //
      .def_property("posix_syntax",                     //
                    &RE2::Options::posix_syntax,        //
                    &RE2::Options::set_posix_syntax)    //
      .def_property("longest_match",                    //
                    &RE2::Options::longest_match,       //
                    &RE2::Options::set_longest_match)   //
      .def_property("log_errors",                       //
                    &RE2::Options::log_errors,          //
                    &RE2::Options::set_log_errors)      //
      .def_property("literal",                          //
                    &RE2::Options::literal,             //
                    &RE2::Options::set_literal)         //
      .def_property("never_nl",                         //
                    &RE2::Options::never_nl,            //
                    &RE2::Options::set_never_nl)        //
      .def_property("dot_nl",                           //
                    &RE2::Options::dot_nl,              //
                    &RE2::Options::set_dot_nl)          //
      .def_property("never_capture",                    //
                    &RE2::Options::never_capture,       //
                    &RE2::Options::set_never_capture)   //
      .def_property("case_sensitive",                   //
                    &RE2::Options::case_sensitive,      //
                    &RE2::Options::set_case_sensitive)  //
      .def_property("perl_classes",                     //
                    &RE2::Options::perl_classes,        //
                    &RE2::Options::set_perl_classes)    //
      .def_property("word_boundary",                    //
                    &RE2::Options::word_boundary,       //
                    &RE2::Options::set_word_boundary)   //
      .def_property("one_line",                         //
                    &RE2::Options::one_line,            //
                    &RE2::Options::set_one_line);       //

  // Methods that take buffers or return strings go through the *Shim
  // functions defined earlier in this file; the rest bind RE2 members
  // directly.
  re2.def(py::init(&RE2InitShim))
      .def("ok", &RE2::ok)
      .def("error", &RE2ErrorShim)
      .def("options", &RE2::options)
      .def("NumberOfCapturingGroups", &RE2::NumberOfCapturingGroups)
      .def("NamedCapturingGroups", &RE2NamedCapturingGroupsShim)
      .def("ProgramSize", &RE2::ProgramSize)
      .def("ReverseProgramSize", &RE2::ReverseProgramSize)
      .def("ProgramFanout", &RE2ProgramFanoutShim)
      .def("ReverseProgramFanout", &RE2ReverseProgramFanoutShim)
      .def("PossibleMatchRange", &RE2PossibleMatchRangeShim)
      .def("Match", &RE2MatchShim)
      .def_static("QuoteMeta", &RE2QuoteMetaShim);

  set.def(py::init<RE2::Anchor, const RE2::Options&>())
      .def("Add", &Set::Add)
      .def("Compile", &Set::Compile)
      .def("Match", &Set::Match);

  // GetRE2 returns a reference to an RE2 held by the Filter, hence the
  // reference_internal return value policy.
  filter.def(py::init<>())
      .def("Add", &Filter::Add)
      .def("Compile", &Filter::Compile)
      .def("Match", &Filter::Match)
      .def("GetRE2", &Filter::GetRE2,
           py::return_value_policy::reference_internal);
}
344
345 } // namespace re2_python
346