1 // Copyright 2016 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "core/fpdftext/cpdf_textpagefind.h"
8
9 #include <wchar.h>
10
11 #include <vector>
12
13 #include "core/fpdftext/cpdf_textpage.h"
14 #include "core/fxcrt/fx_extension.h"
15 #include "core/fxcrt/fx_string.h"
16 #include "core/fxcrt/fx_system.h"
17 #include "core/fxcrt/fx_unicode.h"
18 #include "core/fxcrt/stl_util.h"
19 #include "third_party/base/check.h"
20 #include "third_party/base/memory/ptr_util.h"
21
22 namespace {
23
// U+00A0 NO-BREAK SPACE. The search code in this file accepts it as a word
// separator alongside ' ', '\n' and '\r'.
constexpr wchar_t kNonBreakingSpace = 0x00A0;
25
// Returns false for code points below 255 and for the code points listed in
// the table below; returns true for everything else.
//
// Within this file, a character for which this returns true is split into its
// own search term by ExtractFindWhat(), and FindNext() lets such a character
// match directly next to the previous term without whitespace in between.
bool IsIgnoreSpaceCharacter(wchar_t curChar) {
  struct CodePointRange {
    wchar_t first;
    wchar_t last;
  };
  // Inclusive ranges that are excluded (Unicode block names for reference).
  static constexpr CodePointRange kExcludedRanges[] = {
      {0x0600, 0x06FF},  // Arabic
      {0xFE70, 0xFEFF},  // Arabic Presentation Forms-B
      {0xFB50, 0xFDFF},  // Arabic Presentation Forms-A
      {0x0400, 0x04FF},  // Cyrillic
      {0x0500, 0x052F},  // Cyrillic Supplement
      {0xA640, 0xA69F},  // Cyrillic Extended-B
      {0x2DE0, 0x2DFF},  // Cyrillic Extended-A
      {8467, 8467},      // U+2113 SCRIPT SMALL L
      {0x2000, 0x206F},  // General Punctuation
  };

  if (curChar < 255)
    return false;

  for (const CodePointRange& range : kExcludedRanges) {
    if (curChar >= range.first && curChar <= range.last)
      return false;
  }
  return true;
}
39
IsMatchWholeWord(const WideString & csPageText,size_t startPos,size_t endPos)40 bool IsMatchWholeWord(const WideString& csPageText,
41 size_t startPos,
42 size_t endPos) {
43 if (startPos > endPos)
44 return false;
45 wchar_t char_left = 0;
46 wchar_t char_right = 0;
47 size_t char_count = endPos - startPos + 1;
48 if (char_count == 0)
49 return false;
50 if (char_count == 1 && csPageText[startPos] > 255)
51 return true;
52 if (startPos >= 1)
53 char_left = csPageText[startPos - 1];
54 if (startPos + char_count < csPageText.GetLength())
55 char_right = csPageText[startPos + char_count];
56 if ((char_left > 'A' && char_left < 'a') ||
57 (char_left > 'a' && char_left < 'z') ||
58 (char_left > 0xfb00 && char_left < 0xfb06) ||
59 FXSYS_IsDecimalDigit(char_left) ||
60 (char_right > 'A' && char_right < 'a') ||
61 (char_right > 'a' && char_right < 'z') ||
62 (char_right > 0xfb00 && char_right < 0xfb06) ||
63 FXSYS_IsDecimalDigit(char_right)) {
64 return false;
65 }
66 if (!(('A' > char_left || char_left > 'Z') &&
67 ('a' > char_left || char_left > 'z') &&
68 ('A' > char_right || char_right > 'Z') &&
69 ('a' > char_right || char_right > 'z'))) {
70 return false;
71 }
72 if (char_count > 0) {
73 if (FXSYS_IsDecimalDigit(char_left) &&
74 FXSYS_IsDecimalDigit(csPageText[startPos])) {
75 return false;
76 }
77 if (FXSYS_IsDecimalDigit(char_right) &&
78 FXSYS_IsDecimalDigit(csPageText[endPos])) {
79 return false;
80 }
81 }
82 return true;
83 }
84
// Returns a copy of |wsOriginal|. The copy is left untouched when
// |bMatchCase| is true and is run through WideString::MakeLower() otherwise,
// so that case-insensitive searches can compare lowercased text.
WideString GetStringCase(const WideString& wsOriginal, bool bMatchCase) {
  WideString wsResult = wsOriginal;
  if (!bMatchCase)
    wsResult.MakeLower();
  return wsResult;
}
93
ExtractSubString(const wchar_t * lpszFullString,int iSubString)94 absl::optional<WideString> ExtractSubString(const wchar_t* lpszFullString,
95 int iSubString) {
96 DCHECK(lpszFullString);
97
98 while (iSubString--) {
99 lpszFullString = wcschr(lpszFullString, L' ');
100 if (!lpszFullString)
101 return absl::nullopt;
102
103 lpszFullString++;
104 while (*lpszFullString == L' ')
105 lpszFullString++;
106 }
107
108 const wchar_t* lpchEnd = wcschr(lpszFullString, L' ');
109 int nLen = lpchEnd ? static_cast<int>(lpchEnd - lpszFullString)
110 : static_cast<int>(wcslen(lpszFullString));
111 if (nLen < 0)
112 return absl::nullopt;
113
114 return WideString(lpszFullString, static_cast<size_t>(nLen));
115 }
116
ExtractFindWhat(const WideString & findwhat)117 std::vector<WideString> ExtractFindWhat(const WideString& findwhat) {
118 std::vector<WideString> findwhat_array;
119
120 size_t len = findwhat.GetLength();
121 size_t i = 0;
122 for (i = 0; i < len; ++i)
123 if (findwhat[i] != ' ')
124 break;
125 if (i == len) {
126 findwhat_array.push_back(findwhat);
127 return findwhat_array;
128 }
129
130 int index = 0;
131 while (true) {
132 absl::optional<WideString> word = ExtractSubString(findwhat.c_str(), index);
133 if (!word.has_value())
134 break;
135
136 if (word->IsEmpty()) {
137 findwhat_array.push_back(L"");
138 index++;
139 continue;
140 }
141
142 size_t pos = 0;
143 while (pos < word->GetLength()) {
144 WideString curStr = word->Substr(pos, 1);
145 wchar_t curChar = word.value()[pos];
146 if (IsIgnoreSpaceCharacter(curChar)) {
147 if (pos > 0 && curChar == pdfium::unicode::kRightSingleQuotationMark) {
148 pos++;
149 continue;
150 }
151 if (pos > 0)
152 findwhat_array.push_back(word->First(pos));
153 findwhat_array.push_back(curStr);
154 if (pos == word->GetLength() - 1) {
155 word->clear();
156 break;
157 }
158 word.emplace(word->Last(word->GetLength() - pos - 1));
159 pos = 0;
160 continue;
161 }
162 pos++;
163 }
164
165 if (!word->IsEmpty())
166 findwhat_array.push_back(word.value());
167 index++;
168 }
169 return findwhat_array;
170 }
171
172 } // namespace
173
174 // static
Create(const CPDF_TextPage * pTextPage,const WideString & findwhat,const Options & options,absl::optional<size_t> startPos)175 std::unique_ptr<CPDF_TextPageFind> CPDF_TextPageFind::Create(
176 const CPDF_TextPage* pTextPage,
177 const WideString& findwhat,
178 const Options& options,
179 absl::optional<size_t> startPos) {
180 std::vector<WideString> findwhat_array =
181 ExtractFindWhat(GetStringCase(findwhat, options.bMatchCase));
182 auto find = pdfium::WrapUnique(
183 new CPDF_TextPageFind(pTextPage, findwhat_array, options, startPos));
184 find->FindFirst();
185 return find;
186 }
187
CPDF_TextPageFind(const CPDF_TextPage * pTextPage,const std::vector<WideString> & findwhat_array,const Options & options,absl::optional<size_t> startPos)188 CPDF_TextPageFind::CPDF_TextPageFind(
189 const CPDF_TextPage* pTextPage,
190 const std::vector<WideString>& findwhat_array,
191 const Options& options,
192 absl::optional<size_t> startPos)
193 : m_pTextPage(pTextPage),
194 m_strText(GetStringCase(pTextPage->GetAllPageText(), options.bMatchCase)),
195 m_csFindWhatArray(findwhat_array),
196 m_options(options) {
197 if (!m_strText.IsEmpty()) {
198 m_findNextStart = startPos;
199 m_findPreStart = startPos.value_or(m_strText.GetLength() - 1);
200 }
201 }
202
203 CPDF_TextPageFind::~CPDF_TextPageFind() = default;
204
GetCharIndex(int index) const205 int CPDF_TextPageFind::GetCharIndex(int index) const {
206 return m_pTextPage->CharIndexFromTextIndex(index);
207 }
208
FindFirst()209 bool CPDF_TextPageFind::FindFirst() {
210 return m_strText.IsEmpty() || !m_csFindWhatArray.empty();
211 }
212
FindNext()213 bool CPDF_TextPageFind::FindNext() {
214 if (m_strText.IsEmpty() || !m_findNextStart.has_value())
215 return false;
216
217 size_t strLen = m_strText.GetLength();
218 if (m_findNextStart.value() > strLen - 1)
219 return false;
220
221 int nCount = fxcrt::CollectionSize<int>(m_csFindWhatArray);
222 absl::optional<size_t> nResultPos = 0;
223 size_t nStartPos = m_findNextStart.value();
224 bool bSpaceStart = false;
225 for (int iWord = 0; iWord < nCount; iWord++) {
226 WideString csWord = m_csFindWhatArray[iWord];
227 if (csWord.IsEmpty()) {
228 if (iWord == nCount - 1) {
229 wchar_t strInsert = m_strText[nStartPos];
230 if (strInsert == L'\n' || strInsert == L' ' || strInsert == L'\r' ||
231 strInsert == kNonBreakingSpace) {
232 nResultPos = nStartPos + 1;
233 break;
234 }
235 iWord = -1;
236 } else if (iWord == 0) {
237 bSpaceStart = true;
238 }
239 continue;
240 }
241 nResultPos = m_strText.Find(csWord.AsStringView(), nStartPos);
242 if (!nResultPos.has_value())
243 return false;
244
245 size_t endIndex = nResultPos.value() + csWord.GetLength() - 1;
246 if (iWord == 0)
247 m_resStart = nResultPos.value();
248 bool bMatch = true;
249 if (iWord != 0 && !bSpaceStart) {
250 size_t PreResEndPos = nStartPos;
251 int curChar = csWord[0];
252 WideString lastWord = m_csFindWhatArray[iWord - 1];
253 int lastChar = lastWord.Back();
254 if (nStartPos == nResultPos.value() &&
255 !(IsIgnoreSpaceCharacter(lastChar) ||
256 IsIgnoreSpaceCharacter(curChar))) {
257 bMatch = false;
258 }
259 for (size_t d = PreResEndPos; d < nResultPos.value(); d++) {
260 wchar_t strInsert = m_strText[d];
261 if (strInsert != L'\n' && strInsert != L' ' && strInsert != L'\r' &&
262 strInsert != kNonBreakingSpace) {
263 bMatch = false;
264 break;
265 }
266 }
267 } else if (bSpaceStart) {
268 if (nResultPos.value() > 0) {
269 wchar_t strInsert = m_strText[nResultPos.value() - 1];
270 if (strInsert != L'\n' && strInsert != L' ' && strInsert != L'\r' &&
271 strInsert != kNonBreakingSpace) {
272 bMatch = false;
273 m_resStart = nResultPos.value();
274 } else {
275 m_resStart = nResultPos.value() - 1;
276 }
277 }
278 }
279 if (m_options.bMatchWholeWord && bMatch)
280 bMatch = IsMatchWholeWord(m_strText, nResultPos.value(), endIndex);
281
282 nStartPos = endIndex + 1;
283 if (!bMatch) {
284 iWord = -1;
285 size_t index = bSpaceStart ? 1 : 0;
286 nStartPos = m_resStart + m_csFindWhatArray[index].GetLength();
287 }
288 }
289 m_resEnd = nResultPos.value() + m_csFindWhatArray.back().GetLength() - 1;
290 if (m_options.bConsecutive) {
291 m_findNextStart = m_resStart + 1;
292 m_findPreStart = m_resEnd - 1;
293 } else {
294 m_findNextStart = m_resEnd + 1;
295 m_findPreStart = m_resStart - 1;
296 }
297 return true;
298 }
299
FindPrev()300 bool CPDF_TextPageFind::FindPrev() {
301 if (m_strText.IsEmpty() || !m_findPreStart.has_value())
302 return false;
303
304 CPDF_TextPageFind find_engine(m_pTextPage, m_csFindWhatArray, m_options, 0);
305 if (!find_engine.FindFirst())
306 return false;
307
308 int order = -1;
309 int matches = 0;
310 while (find_engine.FindNext()) {
311 int cur_order = find_engine.GetCurOrder();
312 int cur_match = find_engine.GetMatchedCount();
313 int temp = cur_order + cur_match;
314 if (temp < 0 || static_cast<size_t>(temp) > m_findPreStart.value() + 1)
315 break;
316
317 order = cur_order;
318 matches = cur_match;
319 }
320 if (order == -1)
321 return false;
322
323 m_resStart = m_pTextPage->TextIndexFromCharIndex(order);
324 m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + matches - 1);
325 if (m_options.bConsecutive) {
326 m_findNextStart = m_resStart + 1;
327 m_findPreStart = m_resEnd - 1;
328 } else {
329 m_findNextStart = m_resEnd + 1;
330 m_findPreStart = m_resStart - 1;
331 }
332 return true;
333 }
334
GetCurOrder() const335 int CPDF_TextPageFind::GetCurOrder() const {
336 return GetCharIndex(m_resStart);
337 }
338
GetMatchedCount() const339 int CPDF_TextPageFind::GetMatchedCount() const {
340 int resStart = GetCharIndex(m_resStart);
341 int resEnd = GetCharIndex(m_resEnd);
342 return resEnd - resStart + 1;
343 }
344