xref: /aosp_15_r20/external/pdfium/core/fpdftext/cpdf_textpagefind.cpp (revision 3ac0a46f773bac49fa9476ec2b1cf3f8da5ec3a4)
1 // Copyright 2016 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fpdftext/cpdf_textpagefind.h"
8 
9 #include <wchar.h>
10 
11 #include <vector>
12 
13 #include "core/fpdftext/cpdf_textpage.h"
14 #include "core/fxcrt/fx_extension.h"
15 #include "core/fxcrt/fx_string.h"
16 #include "core/fxcrt/fx_system.h"
17 #include "core/fxcrt/fx_unicode.h"
18 #include "core/fxcrt/stl_util.h"
19 #include "third_party/base/check.h"
20 #include "third_party/base/memory/ptr_util.h"
21 
22 namespace {
23 
24 constexpr wchar_t kNonBreakingSpace = 160;
25 
// Returns false for characters in the scripts/ranges listed below and true
// for every other character. ExtractFindWhat() splits each character for
// which this returns true into its own search token.
//
// Note that the first test is a strict "< 255", so the value 255 itself is
// not covered by it.
bool IsIgnoreSpaceCharacter(wchar_t curChar) {
  struct CodePointRange {
    wchar_t first;
    wchar_t last;
  };
  // Inclusive ranges for which the function answers "false".
  static constexpr CodePointRange kExcludedRanges[] = {
      {0x0600, 0x06FF}, {0xFE70, 0xFEFF}, {0xFB50, 0xFDFF},
      {0x0400, 0x04FF}, {0x0500, 0x052F}, {0xA640, 0xA69F},
      {0x2DE0, 0x2DFF}, {0x2000, 0x206F},
  };

  if (curChar < 255 || curChar == 8467)
    return false;

  for (const CodePointRange& range : kExcludedRanges) {
    if (curChar >= range.first && curChar <= range.last)
      return false;
  }
  return true;
}
39 
IsMatchWholeWord(const WideString & csPageText,size_t startPos,size_t endPos)40 bool IsMatchWholeWord(const WideString& csPageText,
41                       size_t startPos,
42                       size_t endPos) {
43   if (startPos > endPos)
44     return false;
45   wchar_t char_left = 0;
46   wchar_t char_right = 0;
47   size_t char_count = endPos - startPos + 1;
48   if (char_count == 0)
49     return false;
50   if (char_count == 1 && csPageText[startPos] > 255)
51     return true;
52   if (startPos >= 1)
53     char_left = csPageText[startPos - 1];
54   if (startPos + char_count < csPageText.GetLength())
55     char_right = csPageText[startPos + char_count];
56   if ((char_left > 'A' && char_left < 'a') ||
57       (char_left > 'a' && char_left < 'z') ||
58       (char_left > 0xfb00 && char_left < 0xfb06) ||
59       FXSYS_IsDecimalDigit(char_left) ||
60       (char_right > 'A' && char_right < 'a') ||
61       (char_right > 'a' && char_right < 'z') ||
62       (char_right > 0xfb00 && char_right < 0xfb06) ||
63       FXSYS_IsDecimalDigit(char_right)) {
64     return false;
65   }
66   if (!(('A' > char_left || char_left > 'Z') &&
67         ('a' > char_left || char_left > 'z') &&
68         ('A' > char_right || char_right > 'Z') &&
69         ('a' > char_right || char_right > 'z'))) {
70     return false;
71   }
72   if (char_count > 0) {
73     if (FXSYS_IsDecimalDigit(char_left) &&
74         FXSYS_IsDecimalDigit(csPageText[startPos])) {
75       return false;
76     }
77     if (FXSYS_IsDecimalDigit(char_right) &&
78         FXSYS_IsDecimalDigit(csPageText[endPos])) {
79       return false;
80     }
81   }
82   return true;
83 }
84 
// Returns a copy of |wsOriginal| that is suitable for comparison: unchanged
// when |bMatchCase| is true, lower-cased via MakeLower() otherwise.
WideString GetStringCase(const WideString& wsOriginal, bool bMatchCase) {
  WideString result = wsOriginal;
  if (!bMatchCase)
    result.MakeLower();
  return result;
}
93 
ExtractSubString(const wchar_t * lpszFullString,int iSubString)94 absl::optional<WideString> ExtractSubString(const wchar_t* lpszFullString,
95                                             int iSubString) {
96   DCHECK(lpszFullString);
97 
98   while (iSubString--) {
99     lpszFullString = wcschr(lpszFullString, L' ');
100     if (!lpszFullString)
101       return absl::nullopt;
102 
103     lpszFullString++;
104     while (*lpszFullString == L' ')
105       lpszFullString++;
106   }
107 
108   const wchar_t* lpchEnd = wcschr(lpszFullString, L' ');
109   int nLen = lpchEnd ? static_cast<int>(lpchEnd - lpszFullString)
110                      : static_cast<int>(wcslen(lpszFullString));
111   if (nLen < 0)
112     return absl::nullopt;
113 
114   return WideString(lpszFullString, static_cast<size_t>(nLen));
115 }
116 
ExtractFindWhat(const WideString & findwhat)117 std::vector<WideString> ExtractFindWhat(const WideString& findwhat) {
118   std::vector<WideString> findwhat_array;
119 
120   size_t len = findwhat.GetLength();
121   size_t i = 0;
122   for (i = 0; i < len; ++i)
123     if (findwhat[i] != ' ')
124       break;
125   if (i == len) {
126     findwhat_array.push_back(findwhat);
127     return findwhat_array;
128   }
129 
130   int index = 0;
131   while (true) {
132     absl::optional<WideString> word = ExtractSubString(findwhat.c_str(), index);
133     if (!word.has_value())
134       break;
135 
136     if (word->IsEmpty()) {
137       findwhat_array.push_back(L"");
138       index++;
139       continue;
140     }
141 
142     size_t pos = 0;
143     while (pos < word->GetLength()) {
144       WideString curStr = word->Substr(pos, 1);
145       wchar_t curChar = word.value()[pos];
146       if (IsIgnoreSpaceCharacter(curChar)) {
147         if (pos > 0 && curChar == pdfium::unicode::kRightSingleQuotationMark) {
148           pos++;
149           continue;
150         }
151         if (pos > 0)
152           findwhat_array.push_back(word->First(pos));
153         findwhat_array.push_back(curStr);
154         if (pos == word->GetLength() - 1) {
155           word->clear();
156           break;
157         }
158         word.emplace(word->Last(word->GetLength() - pos - 1));
159         pos = 0;
160         continue;
161       }
162       pos++;
163     }
164 
165     if (!word->IsEmpty())
166       findwhat_array.push_back(word.value());
167     index++;
168   }
169   return findwhat_array;
170 }
171 
172 }  // namespace
173 
174 // static
Create(const CPDF_TextPage * pTextPage,const WideString & findwhat,const Options & options,absl::optional<size_t> startPos)175 std::unique_ptr<CPDF_TextPageFind> CPDF_TextPageFind::Create(
176     const CPDF_TextPage* pTextPage,
177     const WideString& findwhat,
178     const Options& options,
179     absl::optional<size_t> startPos) {
180   std::vector<WideString> findwhat_array =
181       ExtractFindWhat(GetStringCase(findwhat, options.bMatchCase));
182   auto find = pdfium::WrapUnique(
183       new CPDF_TextPageFind(pTextPage, findwhat_array, options, startPos));
184   find->FindFirst();
185   return find;
186 }
187 
CPDF_TextPageFind(const CPDF_TextPage * pTextPage,const std::vector<WideString> & findwhat_array,const Options & options,absl::optional<size_t> startPos)188 CPDF_TextPageFind::CPDF_TextPageFind(
189     const CPDF_TextPage* pTextPage,
190     const std::vector<WideString>& findwhat_array,
191     const Options& options,
192     absl::optional<size_t> startPos)
193     : m_pTextPage(pTextPage),
194       m_strText(GetStringCase(pTextPage->GetAllPageText(), options.bMatchCase)),
195       m_csFindWhatArray(findwhat_array),
196       m_options(options) {
197   if (!m_strText.IsEmpty()) {
198     m_findNextStart = startPos;
199     m_findPreStart = startPos.value_or(m_strText.GetLength() - 1);
200   }
201 }
202 
203 CPDF_TextPageFind::~CPDF_TextPageFind() = default;
204 
GetCharIndex(int index) const205 int CPDF_TextPageFind::GetCharIndex(int index) const {
206   return m_pTextPage->CharIndexFromTextIndex(index);
207 }
208 
FindFirst()209 bool CPDF_TextPageFind::FindFirst() {
210   return m_strText.IsEmpty() || !m_csFindWhatArray.empty();
211 }
212 
FindNext()213 bool CPDF_TextPageFind::FindNext() {
214   if (m_strText.IsEmpty() || !m_findNextStart.has_value())
215     return false;
216 
217   size_t strLen = m_strText.GetLength();
218   if (m_findNextStart.value() > strLen - 1)
219     return false;
220 
221   int nCount = fxcrt::CollectionSize<int>(m_csFindWhatArray);
222   absl::optional<size_t> nResultPos = 0;
223   size_t nStartPos = m_findNextStart.value();
224   bool bSpaceStart = false;
225   for (int iWord = 0; iWord < nCount; iWord++) {
226     WideString csWord = m_csFindWhatArray[iWord];
227     if (csWord.IsEmpty()) {
228       if (iWord == nCount - 1) {
229         wchar_t strInsert = m_strText[nStartPos];
230         if (strInsert == L'\n' || strInsert == L' ' || strInsert == L'\r' ||
231             strInsert == kNonBreakingSpace) {
232           nResultPos = nStartPos + 1;
233           break;
234         }
235         iWord = -1;
236       } else if (iWord == 0) {
237         bSpaceStart = true;
238       }
239       continue;
240     }
241     nResultPos = m_strText.Find(csWord.AsStringView(), nStartPos);
242     if (!nResultPos.has_value())
243       return false;
244 
245     size_t endIndex = nResultPos.value() + csWord.GetLength() - 1;
246     if (iWord == 0)
247       m_resStart = nResultPos.value();
248     bool bMatch = true;
249     if (iWord != 0 && !bSpaceStart) {
250       size_t PreResEndPos = nStartPos;
251       int curChar = csWord[0];
252       WideString lastWord = m_csFindWhatArray[iWord - 1];
253       int lastChar = lastWord.Back();
254       if (nStartPos == nResultPos.value() &&
255           !(IsIgnoreSpaceCharacter(lastChar) ||
256             IsIgnoreSpaceCharacter(curChar))) {
257         bMatch = false;
258       }
259       for (size_t d = PreResEndPos; d < nResultPos.value(); d++) {
260         wchar_t strInsert = m_strText[d];
261         if (strInsert != L'\n' && strInsert != L' ' && strInsert != L'\r' &&
262             strInsert != kNonBreakingSpace) {
263           bMatch = false;
264           break;
265         }
266       }
267     } else if (bSpaceStart) {
268       if (nResultPos.value() > 0) {
269         wchar_t strInsert = m_strText[nResultPos.value() - 1];
270         if (strInsert != L'\n' && strInsert != L' ' && strInsert != L'\r' &&
271             strInsert != kNonBreakingSpace) {
272           bMatch = false;
273           m_resStart = nResultPos.value();
274         } else {
275           m_resStart = nResultPos.value() - 1;
276         }
277       }
278     }
279     if (m_options.bMatchWholeWord && bMatch)
280       bMatch = IsMatchWholeWord(m_strText, nResultPos.value(), endIndex);
281 
282     nStartPos = endIndex + 1;
283     if (!bMatch) {
284       iWord = -1;
285       size_t index = bSpaceStart ? 1 : 0;
286       nStartPos = m_resStart + m_csFindWhatArray[index].GetLength();
287     }
288   }
289   m_resEnd = nResultPos.value() + m_csFindWhatArray.back().GetLength() - 1;
290   if (m_options.bConsecutive) {
291     m_findNextStart = m_resStart + 1;
292     m_findPreStart = m_resEnd - 1;
293   } else {
294     m_findNextStart = m_resEnd + 1;
295     m_findPreStart = m_resStart - 1;
296   }
297   return true;
298 }
299 
FindPrev()300 bool CPDF_TextPageFind::FindPrev() {
301   if (m_strText.IsEmpty() || !m_findPreStart.has_value())
302     return false;
303 
304   CPDF_TextPageFind find_engine(m_pTextPage, m_csFindWhatArray, m_options, 0);
305   if (!find_engine.FindFirst())
306     return false;
307 
308   int order = -1;
309   int matches = 0;
310   while (find_engine.FindNext()) {
311     int cur_order = find_engine.GetCurOrder();
312     int cur_match = find_engine.GetMatchedCount();
313     int temp = cur_order + cur_match;
314     if (temp < 0 || static_cast<size_t>(temp) > m_findPreStart.value() + 1)
315       break;
316 
317     order = cur_order;
318     matches = cur_match;
319   }
320   if (order == -1)
321     return false;
322 
323   m_resStart = m_pTextPage->TextIndexFromCharIndex(order);
324   m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + matches - 1);
325   if (m_options.bConsecutive) {
326     m_findNextStart = m_resStart + 1;
327     m_findPreStart = m_resEnd - 1;
328   } else {
329     m_findNextStart = m_resEnd + 1;
330     m_findPreStart = m_resStart - 1;
331   }
332   return true;
333 }
334 
GetCurOrder() const335 int CPDF_TextPageFind::GetCurOrder() const {
336   return GetCharIndex(m_resStart);
337 }
338 
GetMatchedCount() const339 int CPDF_TextPageFind::GetMatchedCount() const {
340   int resStart = GetCharIndex(m_resStart);
341   int resEnd = GetCharIndex(m_resEnd);
342   return resEnd - resStart + 1;
343 }
344