xref: /aosp_15_r20/external/pdfium/core/fpdftext/cpdf_linkextract.cpp (revision 3ac0a46f773bac49fa9476ec2b1cf3f8da5ec3a4)
1 // Copyright 2016 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fpdftext/cpdf_linkextract.h"
8 
9 #include <vector>
10 
11 #include "core/fpdftext/cpdf_textpage.h"
12 #include "core/fxcrt/fx_extension.h"
13 #include "core/fxcrt/fx_string.h"
14 #include "core/fxcrt/fx_system.h"
15 
16 namespace {
17 
18 // Find the end of a web link starting from offset |start| and ending at offset
19 // |end|. The purpose of this function is to separate url from the surrounding
20 // context characters, we do not intend to fully validate the url. |str|
21 // contains lower case characters only.
FindWebLinkEnding(const WideString & str,size_t start,size_t end)22 size_t FindWebLinkEnding(const WideString& str, size_t start, size_t end) {
23   if (str.Contains(L'/', start)) {
24     // When there is a path and query after '/', most ASCII chars are allowed.
25     // We don't sanitize in this case.
26     return end;
27   }
28 
29   // When there is no path, it only has IP address or host name.
30   // Port is optional at the end.
31   if (str[start] == L'[') {
32     // IPv6 reference.
33     // Find the end of the reference.
34     auto result = str.Find(L']', start + 1);
35     if (result.has_value()) {
36       end = result.value();
37       if (end > start + 1) {  // Has content inside brackets.
38         size_t len = str.GetLength();
39         size_t off = end + 1;
40         if (off < len && str[off] == L':') {
41           off++;
42           while (off < len && FXSYS_IsDecimalDigit(str[off]))
43             off++;
44           if (off > end + 2 &&
45               off <= len)   // At least one digit in port number.
46             end = off - 1;  // |off| is offset of the first invalid char.
47         }
48       }
49     }
50     return end;
51   }
52 
53   // According to RFC1123, host name only has alphanumeric chars, hyphens,
54   // and periods. Hyphen should not at the end though.
55   // Non-ASCII chars are ignored during checking.
56   while (end > start && str[end] < 0x80) {
57     if (FXSYS_IsDecimalDigit(str[end]) ||
58         (str[end] >= L'a' && str[end] <= L'z') || str[end] == L'.') {
59       break;
60     }
61     end--;
62   }
63   return end;
64 }
65 
66 // Remove characters from the end of |str|, delimited by |start| and |end|, up
67 // to and including |charToFind|. No-op if |charToFind| is not present. Updates
68 // |end| if characters were removed.
TrimBackwardsToChar(const WideString & str,wchar_t charToFind,size_t start,size_t * end)69 void TrimBackwardsToChar(const WideString& str,
70                          wchar_t charToFind,
71                          size_t start,
72                          size_t* end) {
73   for (size_t pos = *end; pos >= start; pos--) {
74     if (str[pos] == charToFind) {
75       *end = pos - 1;
76       break;
77     }
78   }
79 }
80 
81 // Finds opening brackets ()[]{}<> and quotes "'  before the URL delimited by
82 // |start| and |end| in |str|. Matches a closing bracket or quote for each
83 // opening character and, if present, removes everything afterwards. Returns the
84 // new end position for the string.
TrimExternalBracketsFromWebLink(const WideString & str,size_t start,size_t end)85 size_t TrimExternalBracketsFromWebLink(const WideString& str,
86                                        size_t start,
87                                        size_t end) {
88   for (size_t pos = 0; pos < start; pos++) {
89     if (str[pos] == '(') {
90       TrimBackwardsToChar(str, ')', start, &end);
91     } else if (str[pos] == '[') {
92       TrimBackwardsToChar(str, ']', start, &end);
93     } else if (str[pos] == '{') {
94       TrimBackwardsToChar(str, '}', start, &end);
95     } else if (str[pos] == '<') {
96       TrimBackwardsToChar(str, '>', start, &end);
97     } else if (str[pos] == '"') {
98       TrimBackwardsToChar(str, '"', start, &end);
99     } else if (str[pos] == '\'') {
100       TrimBackwardsToChar(str, '\'', start, &end);
101     }
102   }
103   return end;
104 }
105 
106 }  // namespace
107 
// Stores |pTextPage| as the text source for later extraction. The constructor
// body is empty, so no links are extracted at construction time.
CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage)
    : m_pTextPage(pTextPage) {}
110 
// No custom teardown; the compiler-generated destructor is used.
CPDF_LinkExtract::~CPDF_LinkExtract() = default;
112 
ExtractLinks()113 void CPDF_LinkExtract::ExtractLinks() {
114   m_LinkArray.clear();
115   size_t start = 0;
116   size_t pos = 0;
117   bool bAfterHyphen = false;
118   bool bLineBreak = false;
119   const size_t nTotalChar = m_pTextPage->CountChars();
120   const WideString page_text = m_pTextPage->GetAllPageText();
121   while (pos < nTotalChar) {
122     const CPDF_TextPage::CharInfo& char_info = m_pTextPage->GetCharInfo(pos);
123     if (char_info.m_CharType != CPDF_TextPage::CharType::kGenerated &&
124         char_info.m_Unicode != L' ' && pos != nTotalChar - 1) {
125       bAfterHyphen =
126           (char_info.m_CharType == CPDF_TextPage::CharType::kHyphen ||
127            (char_info.m_CharType == CPDF_TextPage::CharType::kNormal &&
128             char_info.m_Unicode == L'-'));
129       ++pos;
130       continue;
131     }
132 
133     size_t nCount = pos - start;
134     if (pos == nTotalChar - 1) {
135       ++nCount;
136     } else if (bAfterHyphen &&
137                (char_info.m_Unicode == L'\n' || char_info.m_Unicode == L'\r')) {
138       // Handle text breaks with a hyphen to the next line.
139       bLineBreak = true;
140       ++pos;
141       continue;
142     }
143 
144     WideString strBeCheck = page_text.Substr(start, nCount);
145     if (bLineBreak) {
146       strBeCheck.Remove(L'\n');
147       strBeCheck.Remove(L'\r');
148       bLineBreak = false;
149     }
150     // Replace the generated code with the hyphen char.
151     strBeCheck.Replace(L"\xfffe", L"-");
152 
153     if (strBeCheck.GetLength() > 5) {
154       while (strBeCheck.GetLength() > 0) {
155         wchar_t ch = strBeCheck.Back();
156         if (ch != L')' && ch != L',' && ch != L'>' && ch != L'.')
157           break;
158 
159         strBeCheck = strBeCheck.First(strBeCheck.GetLength() - 1);
160         nCount--;
161       }
162 
163       // Check for potential web URLs and email addresses.
164       // Ftp address, file system links, data, blob etc. are not checked.
165       if (nCount > 5) {
166         auto maybe_link = CheckWebLink(strBeCheck);
167         if (maybe_link.has_value()) {
168           maybe_link.value().m_Start += start;
169           m_LinkArray.push_back(maybe_link.value());
170         } else if (CheckMailLink(&strBeCheck)) {
171           m_LinkArray.push_back(Link{{start, nCount}, strBeCheck});
172         }
173       }
174     }
175     start = ++pos;
176   }
177 }
178 
CheckWebLink(const WideString & strBeCheck)179 absl::optional<CPDF_LinkExtract::Link> CPDF_LinkExtract::CheckWebLink(
180     const WideString& strBeCheck) {
181   static const wchar_t kHttpScheme[] = L"http";
182   static const wchar_t kWWWAddrStart[] = L"www.";
183 
184   const size_t kHttpSchemeLen = FXSYS_len(kHttpScheme);
185   const size_t kWWWAddrStartLen = FXSYS_len(kWWWAddrStart);
186 
187   WideString str = strBeCheck;
188   str.MakeLower();
189 
190   // First, try to find the scheme.
191   auto start = str.Find(kHttpScheme);
192   if (start.has_value()) {
193     size_t off = start.value() + kHttpSchemeLen;  // move after "http".
194     if (str.GetLength() > off + 4) {  // At least "://<char>" follows.
195       if (str[off] == L's')  // "https" scheme is accepted.
196         off++;
197       if (str[off] == L':' && str[off + 1] == L'/' && str[off + 2] == L'/') {
198         off += 3;
199         const size_t end =
200             FindWebLinkEnding(str, off,
201                               TrimExternalBracketsFromWebLink(
202                                   str, start.value(), str.GetLength() - 1));
203         if (end > off) {  // Non-empty host name.
204           const size_t nStart = start.value();
205           const size_t nCount = end - nStart + 1;
206           return Link{{nStart, nCount}, strBeCheck.Substr(nStart, nCount)};
207         }
208       }
209     }
210   }
211 
212   // When there is no scheme, try to find url starting with "www.".
213   start = str.Find(kWWWAddrStart);
214   if (start.has_value()) {
215     size_t off = start.value() + kWWWAddrStartLen;
216     if (str.GetLength() > off) {
217       const size_t end =
218           FindWebLinkEnding(str, start.value(),
219                             TrimExternalBracketsFromWebLink(
220                                 str, start.value(), str.GetLength() - 1));
221       if (end > off) {
222         const size_t nStart = start.value();
223         const size_t nCount = end - nStart + 1;
224         return Link{{nStart, nCount},
225                     L"http://" + strBeCheck.Substr(nStart, nCount)};
226       }
227     }
228   }
229 
230   return absl::nullopt;
231 }
232 
// Checks whether |*str| contains an e-mail address around its first '@'.
// On success, |*str| is rewritten in place to the extracted address, with
// "mailto:" prepended unless the string already contains it, and true is
// returned. On failure false is returned; |*str| may already have been
// trimmed by then.
bool CPDF_LinkExtract::CheckMailLink(WideString* str) {
  auto aPos = str->Find(L'@');
  // Invalid when no '@' or when starts/ends with '@'.
  if (!aPos.has_value() || aPos.value() == 0 || aPos == str->GetLength() - 1)
    return false;

  // Check the local part.
  // Walk backwards from the '@'; |i| is one past the index being examined.
  size_t pPos = aPos.value();  // Used to track the position of '@' or '.'.
  for (size_t i = aPos.value(); i > 0; i--) {
    wchar_t ch = (*str)[i - 1];
    if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch))
      continue;

    // Reached for any character that is not '.', and for a '.' that either
    // sits directly before the last tracked '@'/'.' or is the first character.
    if (ch != L'.' || i == pPos || i == 1) {
      if (i == aPos.value()) {
        // There is '.' or invalid char before '@'.
        return false;
      }
      // End extracting for other invalid chars, '.' at the beginning, or
      // consecutive '.'.
      // Drop the leading characters up to and including the offending one.
      // When |i| equals |pPos|, index |i| holds a previously accepted '.',
      // which is dropped as well.
      size_t removed_len = i == pPos ? i + 1 : i;
      *str = str->Last(str->GetLength() - removed_len);
      break;
    }
    // Found a valid '.'.
    pPos = i - 1;
  }

  // Check the domain name part.
  // The string may have been shortened above, so locate '@' again.
  aPos = str->Find(L'@');
  if (!aPos.has_value() || aPos.value() == 0)
    return false;

  str->TrimRight(L'.');
  // At least one '.' in domain name, but not at the beginning.
  // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
  // Check whether we should remove this check.
  auto ePos = str->Find(L'.', aPos.value() + 1);
  if (!ePos.has_value() || ePos.value() == aPos.value() + 1)
    return false;

  // Validate all other chars in domain name.
  size_t nLen = str->GetLength();
  pPos = 0;  // Used to track the position of '.'.
  for (size_t i = aPos.value() + 1; i < nLen; i++) {
    wchar_t wch = (*str)[i];
    if (wch == L'-' || FXSYS_iswalnum(wch))
      continue;

    // Reached for any character that is not '.', and for a '.' that directly
    // follows the previously tracked '.'.
    if (wch != L'.' || i == pPos + 1) {
      // Domain name should end before invalid char.
      // For consecutive '.', the earlier '.' is excluded too.
      size_t host_end = i == pPos + 1 ? i - 2 : i - 1;
      if (pPos > 0 && host_end - aPos.value() >= 3) {
        // Trim the ending invalid chars if there is at least one '.' and name.
        *str = str->First(host_end + 1);
        break;
      }
      return false;
    }
    pPos = i;
  }

  if (!str->Contains(L"mailto:"))
    *str = L"mailto:" + *str;

  return true;
}
300 
GetURL(size_t index) const301 WideString CPDF_LinkExtract::GetURL(size_t index) const {
302   return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl
303                                     : WideString();
304 }
305 
GetRects(size_t index) const306 std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const {
307   if (index >= m_LinkArray.size())
308     return std::vector<CFX_FloatRect>();
309 
310   return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start,
311                                    m_LinkArray[index].m_Count);
312 }
313 
GetTextRange(size_t index) const314 absl::optional<CPDF_LinkExtract::Range> CPDF_LinkExtract::GetTextRange(
315     size_t index) const {
316   if (index >= m_LinkArray.size())
317     return absl::nullopt;
318   return m_LinkArray[index];
319 }
320