xref: /aosp_15_r20/external/pdfium/core/fpdftext/cpdf_textpage.cpp (revision 3ac0a46f773bac49fa9476ec2b1cf3f8da5ec3a4)
1 // Copyright 2014 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fpdftext/cpdf_textpage.h"
8 
9 #include <math.h>
10 #include <stdint.h>
11 
12 #include <algorithm>
13 #include <utility>
14 #include <vector>
15 
16 #include "core/fpdfapi/font/cpdf_cidfont.h"
17 #include "core/fpdfapi/font/cpdf_font.h"
18 #include "core/fpdfapi/page/cpdf_form.h"
19 #include "core/fpdfapi/page/cpdf_formobject.h"
20 #include "core/fpdfapi/page/cpdf_page.h"
21 #include "core/fpdfapi/page/cpdf_pageobject.h"
22 #include "core/fpdfapi/page/cpdf_textobject.h"
23 #include "core/fpdfapi/parser/cpdf_dictionary.h"
24 #include "core/fpdfapi/parser/cpdf_string.h"
25 #include "core/fpdftext/unicodenormalizationdata.h"
26 #include "core/fxcrt/data_vector.h"
27 #include "core/fxcrt/fx_bidi.h"
28 #include "core/fxcrt/fx_extension.h"
29 #include "core/fxcrt/fx_unicode.h"
30 #include "core/fxcrt/stl_util.h"
31 #include "third_party/base/check.h"
32 #include "third_party/base/check_op.h"
33 
34 namespace {
35 
// Font size reported by GetFontSize() when a character has no text object or
// the text object has no font.
constexpr float kDefaultFontSize = 1.0f;

// Extents below this value are treated as empty: GetRectArray() skips
// character boxes this small and ProcessTextObject() ignores text objects
// this narrow.
constexpr float kSizeEpsilon = 0.01f;

// Expansion tables used by GetUnicodeNormalization(), which indexes this
// array with (expansion class - 2).
const uint16_t* const kUnicodeDataNormalizationMaps[] = {
    kUnicodeDataNormalizationMap2, kUnicodeDataNormalizationMap3,
    kUnicodeDataNormalizationMap4};
42 
NormalizeThreshold(float threshold,int t1,int t2,int t3)43 float NormalizeThreshold(float threshold, int t1, int t2, int t3) {
44   DCHECK(t1 < t2);
45   DCHECK(t2 < t3);
46   if (threshold < t1)
47     return threshold / 2.0f;
48   if (threshold < t2)
49     return threshold / 4.0f;
50   if (threshold < t3)
51     return threshold / 5.0f;
52   return threshold / 6.0f;
53 }
54 
CalculateBaseSpace(const CPDF_TextObject * pTextObj,const CFX_Matrix & matrix)55 float CalculateBaseSpace(const CPDF_TextObject* pTextObj,
56                          const CFX_Matrix& matrix) {
57   const size_t nItems = pTextObj->CountItems();
58   if (!pTextObj->m_TextState.GetCharSpace() || nItems < 3)
59     return 0.0f;
60 
61   bool bAllChar = true;
62   float spacing =
63       matrix.TransformDistance(pTextObj->m_TextState.GetCharSpace());
64   float baseSpace = spacing;
65   for (size_t i = 0; i < nItems; ++i) {
66     CPDF_TextObject::Item item = pTextObj->GetItemInfo(i);
67     if (item.m_CharCode == 0xffffffff) {
68       float fontsize_h = pTextObj->m_TextState.GetFontSizeH();
69       float kerning = -fontsize_h * item.m_Origin.x / 1000;
70       baseSpace = std::min(baseSpace, kerning + spacing);
71       bAllChar = false;
72     }
73   }
74   if (baseSpace < 0.0 || (nItems == 3 && !bAllChar))
75     return 0.0f;
76 
77   return baseSpace;
78 }
79 
GetUnicodeNormalization(wchar_t wch)80 DataVector<wchar_t> GetUnicodeNormalization(wchar_t wch) {
81   wch = wch & 0xFFFF;
82   wchar_t wFind = kUnicodeDataNormalization[wch];
83   if (!wFind)
84     return DataVector<wchar_t>(1, wch);
85 
86   if (wFind >= 0x8000) {
87     return DataVector<wchar_t>(1,
88                                kUnicodeDataNormalizationMap1[wFind - 0x8000]);
89   }
90 
91   wch = wFind & 0x0FFF;
92   wFind >>= 12;
93   const uint16_t* pMap = kUnicodeDataNormalizationMaps[wFind - 2] + wch;
94   if (wFind == 4)
95     wFind = static_cast<wchar_t>(*pMap++);
96 
97   return DataVector<wchar_t>(pMap, pMap + wFind);
98 }
99 
// Returns the fraction (0..1) of set entries in |mask| within the half-open
// range [start, end). An empty or inverted range yields 0.
float MaskPercentFilled(const std::vector<bool>& mask,
                        int32_t start,
                        int32_t end) {
  if (end <= start)
    return 0.0f;

  const auto first = mask.begin() + start;
  const auto last = mask.begin() + end;
  const float filled = static_cast<float>(std::count(first, last, true));
  return filled / (end - start);
}
109 
IsControlChar(const CPDF_TextPage::CharInfo & char_info)110 bool IsControlChar(const CPDF_TextPage::CharInfo& char_info) {
111   switch (char_info.m_Unicode) {
112     case 0x2:
113     case 0x3:
114     case 0x93:
115     case 0x94:
116     case 0x96:
117     case 0x97:
118     case 0x98:
119     case 0xfffe:
120       return char_info.m_CharType != CPDF_TextPage::CharType::kHyphen;
121     default:
122       return false;
123   }
124 }
125 
// Returns true for the two code points treated as hyphens here:
// U+002D and U+00AD.
bool IsHyphenCode(wchar_t c) {
  switch (c) {
    case 0x2D:
    case 0xAD:
      return true;
    default:
      return false;
  }
}
129 
IsRectIntersect(const CFX_FloatRect & rect1,const CFX_FloatRect & rect2)130 bool IsRectIntersect(const CFX_FloatRect& rect1, const CFX_FloatRect& rect2) {
131   CFX_FloatRect rect = rect1;
132   rect.Intersect(rect2);
133   return !rect.IsEmpty();
134 }
135 
IsRightToLeft(const CPDF_TextObject & text_obj,const CPDF_Font & font)136 bool IsRightToLeft(const CPDF_TextObject& text_obj, const CPDF_Font& font) {
137   const size_t nItems = text_obj.CountItems();
138   WideString str;
139   str.Reserve(nItems);
140   for (size_t i = 0; i < nItems; ++i) {
141     CPDF_TextObject::Item item = text_obj.GetItemInfo(i);
142     if (item.m_CharCode == 0xffffffff)
143       continue;
144     WideString wstrItem = font.UnicodeFromCharCode(item.m_CharCode);
145     wchar_t wChar = !wstrItem.IsEmpty() ? wstrItem[0] : 0;
146     if (wChar == 0)
147       wChar = item.m_CharCode;
148     if (wChar)
149       str += wChar;
150   }
151   return CFX_BidiString(str).OverallDirection() ==
152          CFX_BidiChar::Direction::kRight;
153 }
154 
GetCharWidth(uint32_t charCode,CPDF_Font * pFont)155 int GetCharWidth(uint32_t charCode, CPDF_Font* pFont) {
156   if (charCode == CPDF_Font::kInvalidCharCode)
157     return 0;
158 
159   int w = pFont->GetCharWidthF(charCode);
160   if (w > 0)
161     return w;
162 
163   ByteString str;
164   pFont->AppendChar(&str, charCode);
165   w = pFont->GetStringWidth(str.AsStringView());
166   if (w > 0)
167     return w;
168 
169   FX_RECT rect = pFont->GetCharBBox(charCode);
170   if (!rect.Valid())
171     return 0;
172 
173   return std::max(rect.Width(), 0);
174 }
175 
GenerateSpace(const CFX_PointF & pos,float last_pos,float this_width,float last_width,float threshold)176 bool GenerateSpace(const CFX_PointF& pos,
177                    float last_pos,
178                    float this_width,
179                    float last_width,
180                    float threshold) {
181   if (fabs(last_pos + last_width - pos.x) <= threshold)
182     return false;
183 
184   float threshold_pos = threshold + last_width;
185   float pos_difference = pos.x - last_pos;
186   if (fabs(pos_difference) > threshold_pos)
187     return true;
188   if (pos.x < 0 && -threshold_pos > pos_difference)
189     return true;
190   return pos_difference > this_width + last_width;
191 }
192 
EndHorizontalLine(const CFX_FloatRect & this_rect,const CFX_FloatRect & prev_rect)193 bool EndHorizontalLine(const CFX_FloatRect& this_rect,
194                        const CFX_FloatRect& prev_rect) {
195   if (this_rect.Height() <= 4.5 || prev_rect.Height() <= 4.5)
196     return false;
197 
198   float top = std::min(this_rect.top, prev_rect.top);
199   float bottom = std::max(this_rect.bottom, prev_rect.bottom);
200   return bottom >= top;
201 }
202 
EndVerticalLine(const CFX_FloatRect & this_rect,const CFX_FloatRect & prev_rect,const CFX_FloatRect & curline_rect,float this_fontsize,float prev_fontsize)203 bool EndVerticalLine(const CFX_FloatRect& this_rect,
204                      const CFX_FloatRect& prev_rect,
205                      const CFX_FloatRect& curline_rect,
206                      float this_fontsize,
207                      float prev_fontsize) {
208   if (this_rect.Width() <= this_fontsize * 0.1f ||
209       prev_rect.Width() <= prev_fontsize * 0.1f) {
210     return false;
211   }
212 
213   float left = std::max(this_rect.left, curline_rect.left);
214   float right = std::min(this_rect.right, curline_rect.right);
215   return right <= left;
216 }
217 
GetPageMatrix(const CPDF_Page * pPage)218 CFX_Matrix GetPageMatrix(const CPDF_Page* pPage) {
219   const FX_RECT rect(0, 0, static_cast<int>(pPage->GetPageWidth()),
220                      static_cast<int>(pPage->GetPageHeight()));
221   return pPage->GetDisplayMatrix(rect, 0);
222 }
223 
GetFontSize(const CPDF_TextObject * text_object)224 float GetFontSize(const CPDF_TextObject* text_object) {
225   bool has_font = text_object && text_object->GetFont();
226   return has_font ? text_object->GetFontSize() : kDefaultFontSize;
227 }
228 
GetLooseBounds(const CPDF_TextPage::CharInfo & charinfo)229 CFX_FloatRect GetLooseBounds(const CPDF_TextPage::CharInfo& charinfo) {
230   float font_size = GetFontSize(charinfo.m_pTextObj);
231   if (charinfo.m_pTextObj && !FXSYS_IsFloatZero(font_size)) {
232     bool is_vert_writing = charinfo.m_pTextObj->GetFont()->IsVertWriting();
233     if (is_vert_writing && charinfo.m_pTextObj->GetFont()->IsCIDFont()) {
234       CPDF_CIDFont* pCIDFont = charinfo.m_pTextObj->GetFont()->AsCIDFont();
235       uint16_t cid = pCIDFont->CIDFromCharCode(charinfo.m_CharCode);
236 
237       CFX_Point16 vertical_origin = pCIDFont->GetVertOrigin(cid);
238       double offsetx = (vertical_origin.x - 500) * font_size / 1000.0;
239       double offsety = vertical_origin.y * font_size / 1000.0;
240       int16_t vert_width = pCIDFont->GetVertWidth(cid);
241       double height = vert_width * font_size / 1000.0;
242 
243       float left = charinfo.m_Origin.x + offsetx;
244       float right = left + font_size;
245       float bottom = charinfo.m_Origin.y + offsety;
246       float top = bottom + height;
247       return CFX_FloatRect(left, bottom, right, top);
248     }
249 
250     int ascent = charinfo.m_pTextObj->GetFont()->GetTypeAscent();
251     int descent = charinfo.m_pTextObj->GetFont()->GetTypeDescent();
252     if (ascent != descent) {
253       float width = charinfo.m_Matrix.a *
254                     charinfo.m_pTextObj->GetCharWidth(charinfo.m_CharCode);
255       float font_scale = charinfo.m_Matrix.a * font_size / (ascent - descent);
256 
257       float left = charinfo.m_Origin.x;
258       float right = charinfo.m_Origin.x + (is_vert_writing ? -width : width);
259       float bottom = charinfo.m_Origin.y + descent * font_scale;
260       float top = charinfo.m_Origin.y + ascent * font_scale;
261       return CFX_FloatRect(left, bottom, right, top);
262     }
263   }
264 
265   // Fallback to the tight bounds in empty text scenarios, or bad font metrics
266   return charinfo.m_CharBox;
267 }
268 
269 }  // namespace
270 
// Special members of the nested helper structs are defaulted here, out of
// line, rather than in the class definition.
CPDF_TextPage::TransformedTextObject::TransformedTextObject() = default;

CPDF_TextPage::TransformedTextObject::TransformedTextObject(
    const TransformedTextObject& that) = default;

CPDF_TextPage::TransformedTextObject::~TransformedTextObject() = default;

CPDF_TextPage::CharInfo::CharInfo() = default;

CPDF_TextPage::CharInfo::CharInfo(const CharInfo&) = default;

CPDF_TextPage::CharInfo::~CharInfo() = default;
283 
// Stores |pPage| and the |rtl| flag, computes the page display matrix with
// GetPageMatrix(), and then calls Init() to process the page.
CPDF_TextPage::CPDF_TextPage(const CPDF_Page* pPage, bool rtl)
    : m_pPage(pPage), m_rtl(rtl), m_DisplayMatrix(GetPageMatrix(pPage)) {
  Init();
}

CPDF_TextPage::~CPDF_TextPage() = default;
290 
Init()291 void CPDF_TextPage::Init() {
292   m_TextBuf.SetAllocStep(10240);
293   ProcessObject();
294 
295   const int nCount = CountChars();
296   if (nCount)
297     m_CharIndices.push_back({0, 0});
298 
299   bool skipped = false;
300   for (int i = 0; i < nCount; ++i) {
301     const CharInfo& charinfo = m_CharList[i];
302     if (charinfo.m_CharType == CPDF_TextPage::CharType::kGenerated ||
303         (charinfo.m_Unicode != 0 && !IsControlChar(charinfo)) ||
304         (charinfo.m_Unicode == 0 && charinfo.m_CharCode != 0)) {
305       m_CharIndices.back().count++;
306       skipped = true;
307     } else {
308       if (skipped) {
309         m_CharIndices.push_back({i + 1, 0});
310         skipped = false;
311       } else {
312         m_CharIndices.back().index = i + 1;
313       }
314     }
315   }
316 }
317 
// Returns the number of entries in m_CharList as an int.
int CPDF_TextPage::CountChars() const {
  return fxcrt::CollectionSize<int>(m_CharList);
}
321 
CharIndexFromTextIndex(int text_index) const322 int CPDF_TextPage::CharIndexFromTextIndex(int text_index) const {
323   int count = 0;
324   for (const auto& info : m_CharIndices) {
325     count += info.count;
326     if (count > text_index)
327       return text_index - count + info.count + info.index;
328   }
329   return -1;
330 }
331 
TextIndexFromCharIndex(int char_index) const332 int CPDF_TextPage::TextIndexFromCharIndex(int char_index) const {
333   int count = 0;
334   for (const auto& info : m_CharIndices) {
335     int text_index = char_index - info.index;
336     if (text_index < info.count)
337       return text_index >= 0 ? text_index + count : -1;
338 
339     count += info.count;
340   }
341   return -1;
342 }
343 
GetRectArray(int start,int count) const344 std::vector<CFX_FloatRect> CPDF_TextPage::GetRectArray(int start,
345                                                        int count) const {
346   std::vector<CFX_FloatRect> rects;
347   if (start < 0 || count == 0)
348     return rects;
349 
350   const int number_of_chars = CountChars();
351   if (start >= number_of_chars)
352     return rects;
353 
354   if (count < 0 || start + count > number_of_chars)
355     count = number_of_chars - start;
356   DCHECK(count > 0);
357 
358   const CPDF_TextObject* text_object = nullptr;
359   CFX_FloatRect rect;
360   int pos = start;
361   bool is_new_rect = true;
362   while (count--) {
363     const CharInfo& charinfo = m_CharList[pos++];
364     if (charinfo.m_CharType == CPDF_TextPage::CharType::kGenerated)
365       continue;
366     if (charinfo.m_CharBox.Width() < kSizeEpsilon ||
367         charinfo.m_CharBox.Height() < kSizeEpsilon) {
368       continue;
369     }
370     if (!text_object)
371       text_object = charinfo.m_pTextObj;
372     if (text_object != charinfo.m_pTextObj) {
373       rects.push_back(rect);
374       text_object = charinfo.m_pTextObj;
375       is_new_rect = true;
376     }
377     if (is_new_rect) {
378       is_new_rect = false;
379       rect = charinfo.m_CharBox;
380       rect.Normalize();
381       continue;
382     }
383     rect.Union(charinfo.m_CharBox);
384   }
385   rects.push_back(rect);
386   return rects;
387 }
388 
GetIndexAtPos(const CFX_PointF & point,const CFX_SizeF & tolerance) const389 int CPDF_TextPage::GetIndexAtPos(const CFX_PointF& point,
390                                  const CFX_SizeF& tolerance) const {
391   int pos;
392   int NearPos = -1;
393   double xdif = 5000;
394   double ydif = 5000;
395   const int nCount = CountChars();
396   for (pos = 0; pos < nCount; ++pos) {
397     const CFX_FloatRect& orig_charrect = m_CharList[pos].m_CharBox;
398     if (orig_charrect.Contains(point))
399       break;
400 
401     if (tolerance.width <= 0 && tolerance.height <= 0)
402       continue;
403 
404     CFX_FloatRect charrect = orig_charrect;
405     charrect.Normalize();
406     CFX_FloatRect char_rect_ext(charrect.left - tolerance.width / 2,
407                                 charrect.bottom - tolerance.height / 2,
408                                 charrect.right + tolerance.width / 2,
409                                 charrect.top + tolerance.height / 2);
410     if (!char_rect_ext.Contains(point))
411       continue;
412 
413     double curXdif =
414         std::min(fabs(point.x - charrect.left), fabs(point.x - charrect.right));
415     double curYdif =
416         std::min(fabs(point.y - charrect.bottom), fabs(point.y - charrect.top));
417     if (curYdif + curXdif < xdif + ydif) {
418       ydif = curYdif;
419       xdif = curXdif;
420       NearPos = pos;
421     }
422   }
423   return pos < nCount ? pos : NearPos;
424 }
425 
GetTextByPredicate(const std::function<bool (const CharInfo &)> & predicate) const426 WideString CPDF_TextPage::GetTextByPredicate(
427     const std::function<bool(const CharInfo&)>& predicate) const {
428   float posy = 0;
429   bool IsContainPreChar = false;
430   bool IsAddLineFeed = false;
431   WideString strText;
432   for (const auto& charinfo : m_CharList) {
433     if (predicate(charinfo)) {
434       if (fabs(posy - charinfo.m_Origin.y) > 0 && !IsContainPreChar &&
435           IsAddLineFeed) {
436         posy = charinfo.m_Origin.y;
437         if (!strText.IsEmpty())
438           strText += L"\r\n";
439       }
440       IsContainPreChar = true;
441       IsAddLineFeed = false;
442       if (charinfo.m_Unicode)
443         strText += charinfo.m_Unicode;
444     } else if (charinfo.m_Unicode == L' ') {
445       if (IsContainPreChar) {
446         strText += L' ';
447         IsContainPreChar = false;
448         IsAddLineFeed = false;
449       }
450     } else {
451       IsContainPreChar = false;
452       IsAddLineFeed = true;
453     }
454   }
455   return strText;
456 }
457 
GetTextByRect(const CFX_FloatRect & rect) const458 WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const {
459   return GetTextByPredicate([&rect](const CharInfo& charinfo) {
460     return IsRectIntersect(rect, charinfo.m_CharBox);
461   });
462 }
463 
GetTextByObject(const CPDF_TextObject * pTextObj) const464 WideString CPDF_TextPage::GetTextByObject(
465     const CPDF_TextObject* pTextObj) const {
466   return GetTextByPredicate([pTextObj](const CharInfo& charinfo) {
467     return charinfo.m_pTextObj == pTextObj;
468   });
469 }
470 
GetCharInfo(size_t index) const471 const CPDF_TextPage::CharInfo& CPDF_TextPage::GetCharInfo(size_t index) const {
472   CHECK(index < m_CharList.size());
473   return m_CharList[index];
474 }
475 
GetCharFontSize(size_t index) const476 float CPDF_TextPage::GetCharFontSize(size_t index) const {
477   CHECK(index < m_CharList.size());
478   return GetFontSize(m_CharList[index].m_pTextObj);
479 }
480 
GetCharLooseBounds(size_t index) const481 CFX_FloatRect CPDF_TextPage::GetCharLooseBounds(size_t index) const {
482   return GetLooseBounds(GetCharInfo(index));
483 }
484 
GetPageText(int start,int count) const485 WideString CPDF_TextPage::GetPageText(int start, int count) const {
486   if (start < 0 || start >= CountChars() || count <= 0 || m_CharList.empty() ||
487       m_TextBuf.IsEmpty()) {
488     return WideString();
489   }
490 
491   const int count_chars = CountChars();
492   int text_start = TextIndexFromCharIndex(start);
493 
494   // If the character at |start| is a non-printing character, then
495   // TextIndexFromCharIndex will return -1, so scan ahead to the first printing
496   // character.
497   while (text_start < 0) {
498     if (start >= count_chars)
499       return WideString();
500     start++;
501     text_start = TextIndexFromCharIndex(start);
502   }
503 
504   count = std::min(count, count_chars - start);
505 
506   int last = start + count - 1;
507   int text_last = TextIndexFromCharIndex(last);
508 
509   // If the character at |last| is a non-printing character, then
510   // TextIndexFromCharIndex will return -1, so scan back to the last printing
511   // character.
512   while (text_last < 0) {
513     if (last < text_start)
514       return WideString();
515 
516     last--;
517     text_last = TextIndexFromCharIndex(last);
518   }
519 
520   if (text_last < text_start)
521     return WideString();
522 
523   int text_count = text_last - text_start + 1;
524 
525   return WideString(m_TextBuf.AsStringView().Substr(text_start, text_count));
526 }
527 
// Recomputes the cached selection rectangles (m_SelRects) for |nCount|
// characters starting at |start| and returns how many rectangles there are.
// Returns -1, leaving the cache untouched, when |start| is negative.
int CPDF_TextPage::CountRects(int start, int nCount) {
  if (start < 0)
    return -1;

  m_SelRects = GetRectArray(start, nCount);
  return fxcrt::CollectionSize<int>(m_SelRects);
}
535 
GetRect(int rectIndex,CFX_FloatRect * pRect) const536 bool CPDF_TextPage::GetRect(int rectIndex, CFX_FloatRect* pRect) const {
537   if (!fxcrt::IndexInBounds(m_SelRects, rectIndex))
538     return false;
539 
540   *pRect = m_SelRects[rectIndex];
541   return true;
542 }
543 
FindTextlineFlowOrientation() const544 CPDF_TextPage::TextOrientation CPDF_TextPage::FindTextlineFlowOrientation()
545     const {
546   DCHECK_NE(m_pPage->GetPageObjectCount(), 0u);
547 
548   const int32_t nPageWidth = static_cast<int32_t>(m_pPage->GetPageWidth());
549   const int32_t nPageHeight = static_cast<int32_t>(m_pPage->GetPageHeight());
550   if (nPageWidth <= 0 || nPageHeight <= 0)
551     return TextOrientation::kUnknown;
552 
553   std::vector<bool> nHorizontalMask(nPageWidth);
554   std::vector<bool> nVerticalMask(nPageHeight);
555   float fLineHeight = 0.0f;
556   int32_t nStartH = nPageWidth;
557   int32_t nEndH = 0;
558   int32_t nStartV = nPageHeight;
559   int32_t nEndV = 0;
560   for (const auto& pPageObj : *m_pPage) {
561     if (!pPageObj->IsText())
562       continue;
563 
564     int32_t minH = static_cast<int32_t>(
565         std::clamp<float>(pPageObj->GetRect().left, 0.0f, nPageWidth));
566     int32_t maxH = static_cast<int32_t>(
567         std::clamp<float>(pPageObj->GetRect().right, 0.0f, nPageWidth));
568     int32_t minV = static_cast<int32_t>(
569         std::clamp<float>(pPageObj->GetRect().bottom, 0.0f, nPageHeight));
570     int32_t maxV = static_cast<int32_t>(
571         std::clamp<float>(pPageObj->GetRect().top, 0.0f, nPageHeight));
572     if (minH >= maxH || minV >= maxV)
573       continue;
574 
575     for (int32_t i = minH; i < maxH; ++i)
576       nHorizontalMask[i] = true;
577     for (int32_t i = minV; i < maxV; ++i)
578       nVerticalMask[i] = true;
579 
580     nStartH = std::min(nStartH, minH);
581     nEndH = std::max(nEndH, maxH);
582     nStartV = std::min(nStartV, minV);
583     nEndV = std::max(nEndV, maxV);
584 
585     if (fLineHeight <= 0.0f)
586       fLineHeight = pPageObj->GetRect().Height();
587   }
588   const int32_t nDoubleLineHeight = 2 * fLineHeight;
589   if ((nEndV - nStartV) < nDoubleLineHeight)
590     return TextOrientation::kHorizontal;
591   if ((nEndH - nStartH) < nDoubleLineHeight)
592     return TextOrientation::kVertical;
593 
594   const float nSumH = MaskPercentFilled(nHorizontalMask, nStartH, nEndH);
595   if (nSumH > 0.8f)
596     return TextOrientation::kHorizontal;
597 
598   const float nSumV = MaskPercentFilled(nVerticalMask, nStartV, nEndV);
599   if (nSumH > nSumV)
600     return TextOrientation::kHorizontal;
601   if (nSumH < nSumV)
602     return TextOrientation::kVertical;
603   return TextOrientation::kUnknown;
604 }
605 
AppendGeneratedCharacter(wchar_t unicode,const CFX_Matrix & formMatrix)606 void CPDF_TextPage::AppendGeneratedCharacter(wchar_t unicode,
607                                              const CFX_Matrix& formMatrix) {
608   absl::optional<CharInfo> pGenerateChar = GenerateCharInfo(unicode);
609   if (!pGenerateChar.has_value())
610     return;
611 
612   m_TextBuf.AppendChar(unicode);
613   if (!formMatrix.IsIdentity())
614     pGenerateChar->m_Matrix = formMatrix;
615   m_CharList.push_back(pGenerateChar.value());
616 }
617 
ProcessObject()618 void CPDF_TextPage::ProcessObject() {
619   if (m_pPage->GetPageObjectCount() == 0)
620     return;
621 
622   m_TextlineDir = FindTextlineFlowOrientation();
623   for (auto it = m_pPage->begin(); it != m_pPage->end(); ++it) {
624     CPDF_PageObject* pObj = it->get();
625     if (!pObj)
626       continue;
627 
628     CFX_Matrix matrix;
629     if (pObj->IsText())
630       ProcessTextObject(pObj->AsText(), matrix, m_pPage, it);
631     else if (pObj->IsForm())
632       ProcessFormObject(pObj->AsForm(), matrix);
633   }
634   for (const auto& obj : mTextObjects)
635     ProcessTextObject(obj);
636 
637   mTextObjects.clear();
638   CloseTempLine();
639 }
640 
ProcessFormObject(CPDF_FormObject * pFormObj,const CFX_Matrix & formMatrix)641 void CPDF_TextPage::ProcessFormObject(CPDF_FormObject* pFormObj,
642                                       const CFX_Matrix& formMatrix) {
643   CFX_Matrix curFormMatrix = pFormObj->form_matrix() * formMatrix;
644   const CPDF_PageObjectHolder* pHolder = pFormObj->form();
645   for (auto it = pHolder->begin(); it != pHolder->end(); ++it) {
646     CPDF_PageObject* pPageObj = it->get();
647     if (!pPageObj)
648       continue;
649 
650     if (pPageObj->IsText())
651       ProcessTextObject(pPageObj->AsText(), curFormMatrix, pHolder, it);
652     else if (pPageObj->IsForm())
653       ProcessFormObject(pPageObj->AsForm(), curFormMatrix);
654   }
655 }
656 
AddCharInfoByLRDirection(wchar_t wChar,const CharInfo & info)657 void CPDF_TextPage::AddCharInfoByLRDirection(wchar_t wChar,
658                                              const CharInfo& info) {
659   CharInfo info2 = info;
660   if (IsControlChar(info2)) {
661     info2.m_Index = -1;
662     m_CharList.push_back(info2);
663     return;
664   }
665   info2.m_Index = m_TextBuf.GetLength();
666   DataVector<wchar_t> normalized;
667   if (wChar >= 0xFB00 && wChar <= 0xFB06)
668     normalized = GetUnicodeNormalization(wChar);
669   if (normalized.empty()) {
670     m_TextBuf.AppendChar(wChar);
671     m_CharList.push_back(info2);
672     return;
673   }
674   for (wchar_t normalized_char : normalized) {
675     info2.m_Unicode = normalized_char;
676     info2.m_CharType = CPDF_TextPage::CharType::kPiece;
677     m_TextBuf.AppendChar(info2.m_Unicode);
678     m_CharList.push_back(info2);
679   }
680 }
681 
AddCharInfoByRLDirection(wchar_t wChar,const CharInfo & info)682 void CPDF_TextPage::AddCharInfoByRLDirection(wchar_t wChar,
683                                              const CharInfo& info) {
684   CharInfo info2 = info;
685   if (IsControlChar(info2)) {
686     info2.m_Index = -1;
687     m_CharList.push_back(info2);
688     return;
689   }
690   info2.m_Index = m_TextBuf.GetLength();
691   wChar = pdfium::unicode::GetMirrorChar(wChar);
692   DataVector<wchar_t> normalized = GetUnicodeNormalization(wChar);
693   if (normalized.empty()) {
694     info2.m_Unicode = wChar;
695     m_TextBuf.AppendChar(info2.m_Unicode);
696     m_CharList.push_back(info2);
697     return;
698   }
699   for (wchar_t normalized_char : normalized) {
700     info2.m_Unicode = normalized_char;
701     info2.m_CharType = CPDF_TextPage::CharType::kPiece;
702     m_TextBuf.AppendChar(info2.m_Unicode);
703     m_CharList.push_back(info2);
704   }
705 }
706 
CloseTempLine()707 void CPDF_TextPage::CloseTempLine() {
708   if (m_TempCharList.empty())
709     return;
710 
711   WideString str = m_TempTextBuf.MakeString();
712   bool bPrevSpace = false;
713   for (size_t i = 0; i < str.GetLength(); ++i) {
714     if (str[i] != ' ') {
715       bPrevSpace = false;
716       continue;
717     }
718     if (bPrevSpace) {
719       m_TempTextBuf.Delete(i, 1);
720       m_TempCharList.erase(m_TempCharList.begin() + i);
721       str.Delete(i);
722       --i;
723     }
724     bPrevSpace = true;
725   }
726   CFX_BidiString bidi(str);
727   if (m_rtl)
728     bidi.SetOverallDirectionRight();
729   CFX_BidiChar::Direction eCurrentDirection = bidi.OverallDirection();
730   for (const auto& segment : bidi) {
731     if (segment.direction == CFX_BidiChar::Direction::kRight ||
732         (segment.direction == CFX_BidiChar::Direction::kNeutral &&
733          eCurrentDirection == CFX_BidiChar::Direction::kRight)) {
734       eCurrentDirection = CFX_BidiChar::Direction::kRight;
735       for (int m = segment.start + segment.count; m > segment.start; --m)
736         AddCharInfoByRLDirection(str[m - 1], m_TempCharList[m - 1]);
737     } else {
738       if (segment.direction != CFX_BidiChar::Direction::kLeftWeak) {
739         eCurrentDirection = CFX_BidiChar::Direction::kLeft;
740       }
741       for (int m = segment.start; m < segment.start + segment.count; ++m)
742         AddCharInfoByLRDirection(str[m], m_TempCharList[m]);
743     }
744   }
745   m_TempCharList.clear();
746   m_TempTextBuf.Delete(0, m_TempTextBuf.GetLength());
747 }
748 
ProcessTextObject(CPDF_TextObject * pTextObj,const CFX_Matrix & formMatrix,const CPDF_PageObjectHolder * pObjList,CPDF_PageObjectHolder::const_iterator ObjPos)749 void CPDF_TextPage::ProcessTextObject(
750     CPDF_TextObject* pTextObj,
751     const CFX_Matrix& formMatrix,
752     const CPDF_PageObjectHolder* pObjList,
753     CPDF_PageObjectHolder::const_iterator ObjPos) {
754   if (fabs(pTextObj->GetRect().Width()) < kSizeEpsilon)
755     return;
756 
757   size_t count = mTextObjects.size();
758   TransformedTextObject new_obj;
759   new_obj.m_pTextObj = pTextObj;
760   new_obj.m_formMatrix = formMatrix;
761   if (count == 0) {
762     mTextObjects.push_back(new_obj);
763     return;
764   }
765   if (IsSameAsPreTextObject(pTextObj, pObjList, ObjPos))
766     return;
767 
768   TransformedTextObject prev_obj = mTextObjects[count - 1];
769   size_t nItem = prev_obj.m_pTextObj->CountItems();
770   if (nItem == 0)
771     return;
772 
773   CPDF_TextObject::Item item = prev_obj.m_pTextObj->GetItemInfo(nItem - 1);
774   float prev_width =
775       GetCharWidth(item.m_CharCode, prev_obj.m_pTextObj->GetFont().Get()) *
776       prev_obj.m_pTextObj->GetFontSize() / 1000;
777 
778   CFX_Matrix prev_matrix =
779       prev_obj.m_pTextObj->GetTextMatrix() * prev_obj.m_formMatrix;
780   prev_width = prev_matrix.TransformDistance(fabs(prev_width));
781   item = pTextObj->GetItemInfo(0);
782   float this_width = GetCharWidth(item.m_CharCode, pTextObj->GetFont().Get()) *
783                      pTextObj->GetFontSize() / 1000;
784   this_width = fabs(this_width);
785 
786   CFX_Matrix this_matrix = pTextObj->GetTextMatrix() * formMatrix;
787   this_width = this_matrix.TransformDistance(fabs(this_width));
788 
789   float threshold = std::max(prev_width, this_width) / 4;
790   CFX_PointF prev_pos = m_DisplayMatrix.Transform(
791       prev_obj.m_formMatrix.Transform(prev_obj.m_pTextObj->GetPos()));
792   CFX_PointF this_pos =
793       m_DisplayMatrix.Transform(formMatrix.Transform(pTextObj->GetPos()));
794   if (fabs(this_pos.y - prev_pos.y) > threshold * 2) {
795     for (size_t i = 0; i < count; ++i)
796       ProcessTextObject(mTextObjects[i]);
797     mTextObjects.clear();
798     mTextObjects.push_back(new_obj);
799     return;
800   }
801 
802   for (size_t i = count; i > 0; --i) {
803     TransformedTextObject prev_text_obj = mTextObjects[i - 1];
804     CFX_PointF new_prev_pos =
805         m_DisplayMatrix.Transform(prev_text_obj.m_formMatrix.Transform(
806             prev_text_obj.m_pTextObj->GetPos()));
807     if (this_pos.x >= new_prev_pos.x) {
808       mTextObjects.insert(mTextObjects.begin() + i, new_obj);
809       return;
810     }
811   }
812   mTextObjects.insert(mTextObjects.begin(), new_obj);
813 }
814 
PreMarkedContent(const CPDF_TextObject * pTextObj)815 CPDF_TextPage::MarkedContentState CPDF_TextPage::PreMarkedContent(
816     const CPDF_TextObject* pTextObj) {
817   const CPDF_ContentMarks* pMarks = pTextObj->GetContentMarks();
818   const size_t nContentMarks = pMarks->CountItems();
819   if (nContentMarks == 0)
820     return MarkedContentState::kPass;
821 
822   WideString actText;
823   bool bExist = false;
824   RetainPtr<const CPDF_Dictionary> pDict;
825   for (size_t i = 0; i < nContentMarks; ++i) {
826     const CPDF_ContentMarkItem* item = pMarks->GetItem(i);
827     pDict = item->GetParam();
828     if (!pDict)
829       continue;
830     RetainPtr<const CPDF_String> temp = pDict->GetStringFor("ActualText");
831     if (temp) {
832       bExist = true;
833       actText = temp->GetUnicodeText();
834     }
835   }
836   if (!bExist)
837     return MarkedContentState::kPass;
838 
839   if (m_pPrevTextObj) {
840     const CPDF_ContentMarks* pPrevMarks = m_pPrevTextObj->GetContentMarks();
841     if (pPrevMarks->CountItems() == nContentMarks &&
842         pPrevMarks->GetItem(nContentMarks - 1)->GetParam() == pDict) {
843       return MarkedContentState::kDone;
844     }
845   }
846 
847   if (actText.IsEmpty())
848     return MarkedContentState::kPass;
849 
850   RetainPtr<CPDF_Font> pFont = pTextObj->GetFont();
851   bExist = false;
852   for (size_t i = 0; i < actText.GetLength(); ++i) {
853     if (pFont->CharCodeFromUnicode(actText[i]) != CPDF_Font::kInvalidCharCode) {
854       bExist = true;
855       break;
856     }
857   }
858   if (!bExist)
859     return MarkedContentState::kPass;
860 
861   bExist = false;
862   for (size_t i = 0; i < actText.GetLength(); ++i) {
863     wchar_t wChar = actText[i];
864     if ((wChar > 0x80 && wChar < 0xFFFD) || (wChar <= 0x80 && isprint(wChar))) {
865       bExist = true;
866       break;
867     }
868   }
869   if (!bExist)
870     return MarkedContentState::kDone;
871 
872   return MarkedContentState::kDelay;
873 }
874 
ProcessMarkedContent(const TransformedTextObject & obj)875 void CPDF_TextPage::ProcessMarkedContent(const TransformedTextObject& obj) {
876   const CPDF_TextObject* pTextObj = obj.m_pTextObj;
877   const CPDF_ContentMarks* pMarks = pTextObj->GetContentMarks();
878   const size_t nContentMarks = pMarks->CountItems();
879   WideString actText;
880   for (size_t n = 0; n < nContentMarks; ++n) {
881     const CPDF_ContentMarkItem* item = pMarks->GetItem(n);
882     RetainPtr<const CPDF_Dictionary> pDict = item->GetParam();
883     if (pDict)
884       actText = pDict->GetUnicodeTextFor("ActualText");
885   }
886   if (actText.IsEmpty())
887     return;
888 
889   RetainPtr<CPDF_Font> pFont = pTextObj->GetFont();
890   CFX_Matrix matrix = pTextObj->GetTextMatrix() * obj.m_formMatrix;
891 
892   for (size_t k = 0; k < actText.GetLength(); ++k) {
893     wchar_t wChar = actText[k];
894     if (wChar <= 0x80 && !isprint(wChar))
895       wChar = 0x20;
896     if (wChar >= 0xFFFD)
897       continue;
898 
899     CharInfo charinfo;
900     charinfo.m_Origin = pTextObj->GetPos();
901     charinfo.m_Index = m_TextBuf.GetLength();
902     charinfo.m_Unicode = wChar;
903     charinfo.m_CharCode = pFont->CharCodeFromUnicode(wChar);
904     charinfo.m_CharType = CPDF_TextPage::CharType::kPiece;
905     charinfo.m_pTextObj = pTextObj;
906     charinfo.m_CharBox = pTextObj->GetRect();
907     charinfo.m_Matrix = matrix;
908     m_TempTextBuf.AppendChar(wChar);
909     m_TempCharList.push_back(charinfo);
910   }
911 }
912 
FindPreviousTextObject()913 void CPDF_TextPage::FindPreviousTextObject() {
914   const CharInfo* pPrevCharInfo = GetPrevCharInfo();
915   if (!pPrevCharInfo)
916     return;
917 
918   if (pPrevCharInfo->m_pTextObj)
919     m_pPrevTextObj = pPrevCharInfo->m_pTextObj;
920 }
921 
SwapTempTextBuf(size_t iCharListStartAppend,size_t iBufStartAppend)922 void CPDF_TextPage::SwapTempTextBuf(size_t iCharListStartAppend,
923                                     size_t iBufStartAppend) {
924   DCHECK(!m_TempCharList.empty());
925   if (iCharListStartAppend < m_TempCharList.size()) {
926     auto fwd = m_TempCharList.begin() + iCharListStartAppend;
927     auto rev = m_TempCharList.end() - 1;
928     for (; fwd < rev; ++fwd, --rev) {
929       std::swap(*fwd, *rev);
930       std::swap(fwd->m_Index, rev->m_Index);
931     }
932   }
933   pdfium::span<wchar_t> temp_span = m_TempTextBuf.GetWideSpan();
934   DCHECK(!temp_span.empty());
935   if (iBufStartAppend < temp_span.size()) {
936     std::reverse(temp_span.begin() + iBufStartAppend, temp_span.end());
937   }
938 }
939 
ProcessTextObject(const TransformedTextObject & obj)940 void CPDF_TextPage::ProcessTextObject(const TransformedTextObject& obj) {
941   const CPDF_TextObject* pTextObj = obj.m_pTextObj;
942   if (fabs(pTextObj->GetRect().Width()) < kSizeEpsilon)
943     return;
944 
945   CFX_Matrix form_matrix = obj.m_formMatrix;
946   RetainPtr<CPDF_Font> pFont = pTextObj->GetFont();
947   CFX_Matrix matrix = pTextObj->GetTextMatrix() * form_matrix;
948   MarkedContentState ePreMKC = PreMarkedContent(obj.m_pTextObj);
949   if (ePreMKC == MarkedContentState::kDone) {
950     m_pPrevTextObj = pTextObj;
951     m_PrevMatrix = form_matrix;
952     return;
953   }
954   GenerateCharacter result = GenerateCharacter::kNone;
955   if (m_pPrevTextObj) {
956     result = ProcessInsertObject(pTextObj, form_matrix);
957     if (result == GenerateCharacter::kLineBreak)
958       m_CurlineRect = pTextObj->GetRect();
959     else
960       m_CurlineRect.Union(obj.m_pTextObj->GetRect());
961 
962     switch (result) {
963       case GenerateCharacter::kNone:
964         break;
965       case GenerateCharacter::kSpace: {
966         absl::optional<CharInfo> pGenerateChar = GenerateCharInfo(L' ');
967         if (pGenerateChar.has_value()) {
968           if (!form_matrix.IsIdentity())
969             pGenerateChar->m_Matrix = form_matrix;
970           m_TempTextBuf.AppendChar(L' ');
971           m_TempCharList.push_back(pGenerateChar.value());
972         }
973         break;
974       }
975       case GenerateCharacter::kLineBreak:
976         CloseTempLine();
977         if (m_TextBuf.GetSize()) {
978           AppendGeneratedCharacter(L'\r', form_matrix);
979           AppendGeneratedCharacter(L'\n', form_matrix);
980         }
981         break;
982       case GenerateCharacter::kHyphen:
983         if (pTextObj->CountChars() == 1) {
984           CPDF_TextObject::Item item = pTextObj->GetCharInfo(0);
985           WideString wstrItem =
986               pTextObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
987           if (wstrItem.IsEmpty())
988             wstrItem += (wchar_t)item.m_CharCode;
989           wchar_t curChar = wstrItem[0];
990           if (IsHyphenCode(curChar))
991             return;
992         }
993         while (m_TempTextBuf.GetSize() > 0 &&
994                m_TempTextBuf.AsStringView().Back() == 0x20) {
995           m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
996           m_TempCharList.pop_back();
997         }
998         CharInfo* charinfo = &m_TempCharList.back();
999         m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1000         charinfo->m_Unicode = 0x2;
1001         charinfo->m_CharType = CPDF_TextPage::CharType::kHyphen;
1002         m_TempTextBuf.AppendChar(0xfffe);
1003         break;
1004     }
1005   } else {
1006     m_CurlineRect = pTextObj->GetRect();
1007   }
1008 
1009   if (ePreMKC == MarkedContentState::kDelay) {
1010     ProcessMarkedContent(obj);
1011     m_pPrevTextObj = pTextObj;
1012     m_PrevMatrix = form_matrix;
1013     return;
1014   }
1015   m_pPrevTextObj = pTextObj;
1016   m_PrevMatrix = form_matrix;
1017   float baseSpace = CalculateBaseSpace(pTextObj, matrix);
1018 
1019   const bool bR2L = IsRightToLeft(*pTextObj, *pFont);
1020   const bool bIsBidiAndMirrorInverse =
1021       bR2L && (matrix.a * matrix.d - matrix.b * matrix.c) < 0;
1022   const size_t iBufStartAppend = m_TempTextBuf.GetLength();
1023   const size_t iCharListStartAppend = m_TempCharList.size();
1024 
1025   float spacing = 0;
1026   const size_t nItems = pTextObj->CountItems();
1027   for (size_t i = 0; i < nItems; ++i) {
1028     CharInfo charinfo;
1029     CPDF_TextObject::Item item = pTextObj->GetItemInfo(i);
1030     if (item.m_CharCode == 0xffffffff) {
1031       WideString str = m_TempTextBuf.MakeString();
1032       if (str.IsEmpty())
1033         str = m_TextBuf.AsStringView();
1034       if (str.IsEmpty() || str.Back() == L' ')
1035         continue;
1036 
1037       float fontsize_h = pTextObj->m_TextState.GetFontSizeH();
1038       spacing = -fontsize_h * item.m_Origin.x / 1000;
1039       continue;
1040     }
1041     float charSpace = pTextObj->m_TextState.GetCharSpace();
1042     if (charSpace > 0.001)
1043       spacing += matrix.TransformDistance(charSpace);
1044     else if (charSpace < -0.001)
1045       spacing -= matrix.TransformDistance(fabs(charSpace));
1046     spacing -= baseSpace;
1047     if (spacing && i > 0) {
1048       float fontsize_h = pTextObj->m_TextState.GetFontSizeH();
1049       uint32_t space_charcode = pFont->CharCodeFromUnicode(' ');
1050       float threshold = 0;
1051       if (space_charcode != CPDF_Font::kInvalidCharCode)
1052         threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
1053       if (threshold > fontsize_h / 3)
1054         threshold = 0;
1055       else
1056         threshold /= 2;
1057       if (threshold == 0) {
1058         threshold = GetCharWidth(item.m_CharCode, pFont.Get());
1059         threshold = NormalizeThreshold(threshold, 300, 500, 700);
1060         threshold = fontsize_h * threshold / 1000;
1061       }
1062       if (threshold && (spacing && spacing >= threshold)) {
1063         charinfo.m_Unicode = L' ';
1064         charinfo.m_CharType = CPDF_TextPage::CharType::kGenerated;
1065         charinfo.m_pTextObj = pTextObj;
1066         charinfo.m_Index = m_TextBuf.GetLength();
1067         m_TempTextBuf.AppendChar(L' ');
1068         charinfo.m_CharCode = CPDF_Font::kInvalidCharCode;
1069         charinfo.m_Matrix = form_matrix;
1070         charinfo.m_Origin = matrix.Transform(item.m_Origin);
1071         charinfo.m_CharBox =
1072             CFX_FloatRect(charinfo.m_Origin.x, charinfo.m_Origin.y,
1073                           charinfo.m_Origin.x, charinfo.m_Origin.y);
1074         m_TempCharList.push_back(charinfo);
1075       }
1076       if (item.m_CharCode == CPDF_Font::kInvalidCharCode)
1077         continue;
1078     }
1079     spacing = 0;
1080     WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode);
1081     bool bNoUnicode = false;
1082     if (wstrItem.IsEmpty() && item.m_CharCode) {
1083       wstrItem += static_cast<wchar_t>(item.m_CharCode);
1084       bNoUnicode = true;
1085     }
1086     charinfo.m_Index = -1;
1087     charinfo.m_CharCode = item.m_CharCode;
1088     charinfo.m_CharType = bNoUnicode ? CPDF_TextPage::CharType::kNotUnicode
1089                                      : CPDF_TextPage::CharType::kNormal;
1090     charinfo.m_pTextObj = pTextObj;
1091     charinfo.m_Origin = matrix.Transform(item.m_Origin);
1092 
1093     const FX_RECT rect =
1094         charinfo.m_pTextObj->GetFont()->GetCharBBox(charinfo.m_CharCode);
1095     const float fFontSize = pTextObj->GetFontSize() / 1000;
1096     charinfo.m_CharBox.top = rect.top * fFontSize + item.m_Origin.y;
1097     charinfo.m_CharBox.left = rect.left * fFontSize + item.m_Origin.x;
1098     charinfo.m_CharBox.right = rect.right * fFontSize + item.m_Origin.x;
1099     charinfo.m_CharBox.bottom = rect.bottom * fFontSize + item.m_Origin.y;
1100     if (fabsf(charinfo.m_CharBox.top - charinfo.m_CharBox.bottom) <
1101         kSizeEpsilon) {
1102       charinfo.m_CharBox.top = charinfo.m_CharBox.bottom + fFontSize;
1103     }
1104     if (fabsf(charinfo.m_CharBox.right - charinfo.m_CharBox.left) <
1105         kSizeEpsilon) {
1106       charinfo.m_CharBox.right =
1107           charinfo.m_CharBox.left + pTextObj->GetCharWidth(charinfo.m_CharCode);
1108     }
1109     charinfo.m_CharBox = matrix.TransformRect(charinfo.m_CharBox);
1110     charinfo.m_Matrix = matrix;
1111     if (wstrItem.IsEmpty()) {
1112       charinfo.m_Unicode = 0;
1113       m_TempCharList.push_back(charinfo);
1114       m_TempTextBuf.AppendChar(0xfffe);
1115       continue;
1116     }
1117     size_t nTotal = wstrItem.GetLength();
1118     bool bDel = false;
1119     const int count = std::min(fxcrt::CollectionSize<int>(m_TempCharList), 7);
1120     constexpr float kTextCharRatioGapDelta = 0.07f;
1121     float threshold = charinfo.m_Matrix.TransformXDistance(
1122         kTextCharRatioGapDelta * pTextObj->GetFontSize());
1123     for (int n = fxcrt::CollectionSize<int>(m_TempCharList);
1124          n > fxcrt::CollectionSize<int>(m_TempCharList) - count; --n) {
1125       const CharInfo& charinfo1 = m_TempCharList[n - 1];
1126       CFX_PointF diff = charinfo1.m_Origin - charinfo.m_Origin;
1127       if (charinfo1.m_CharCode == charinfo.m_CharCode &&
1128           charinfo1.m_pTextObj->GetFont() == charinfo.m_pTextObj->GetFont() &&
1129           fabs(diff.x) < threshold && fabs(diff.y) < threshold) {
1130         bDel = true;
1131         break;
1132       }
1133     }
1134     if (!bDel) {
1135       for (size_t nIndex = 0; nIndex < nTotal; ++nIndex) {
1136         charinfo.m_Unicode = wstrItem[nIndex];
1137         if (charinfo.m_Unicode) {
1138           charinfo.m_Index = m_TextBuf.GetLength();
1139           m_TempTextBuf.AppendChar(charinfo.m_Unicode);
1140         } else {
1141           m_TempTextBuf.AppendChar(0xfffe);
1142         }
1143         m_TempCharList.push_back(charinfo);
1144       }
1145     } else if (i == 0) {
1146       WideString str = m_TempTextBuf.MakeString();
1147       if (!str.IsEmpty() && str.Back() == L' ') {
1148         m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1149         m_TempCharList.pop_back();
1150       }
1151     }
1152   }
1153   if (bIsBidiAndMirrorInverse)
1154     SwapTempTextBuf(iCharListStartAppend, iBufStartAppend);
1155 }
1156 
GetTextObjectWritingMode(const CPDF_TextObject * pTextObj) const1157 CPDF_TextPage::TextOrientation CPDF_TextPage::GetTextObjectWritingMode(
1158     const CPDF_TextObject* pTextObj) const {
1159   size_t nChars = pTextObj->CountChars();
1160   if (nChars <= 1)
1161     return m_TextlineDir;
1162 
1163   CPDF_TextObject::Item first = pTextObj->GetCharInfo(0);
1164   CPDF_TextObject::Item last = pTextObj->GetCharInfo(nChars - 1);
1165   CFX_Matrix textMatrix = pTextObj->GetTextMatrix();
1166   first.m_Origin = textMatrix.Transform(first.m_Origin);
1167   last.m_Origin = textMatrix.Transform(last.m_Origin);
1168 
1169   static constexpr float kEpsilon = 0.0001f;
1170   float dX = fabs(last.m_Origin.x - first.m_Origin.x);
1171   float dY = fabs(last.m_Origin.y - first.m_Origin.y);
1172   if (dX <= kEpsilon && dY <= kEpsilon)
1173     return TextOrientation::kUnknown;
1174 
1175   static constexpr float kThreshold = 0.0872f;
1176   CFX_VectorF v(dX, dY);
1177   v.Normalize();
1178   bool bXUnderThreshold = v.x <= kThreshold;
1179   if (v.y <= kThreshold)
1180     return bXUnderThreshold ? m_TextlineDir : TextOrientation::kHorizontal;
1181   return bXUnderThreshold ? TextOrientation::kVertical : m_TextlineDir;
1182 }
1183 
IsHyphen(wchar_t curChar) const1184 bool CPDF_TextPage::IsHyphen(wchar_t curChar) const {
1185   WideStringView curText = m_TempTextBuf.AsStringView();
1186   if (curText.IsEmpty())
1187     curText = m_TextBuf.AsStringView();
1188 
1189   if (curText.IsEmpty())
1190     return false;
1191 
1192   auto iter = curText.rbegin();
1193   for (; (iter + 1) != curText.rend() && *iter == 0x20; ++iter) {
1194     // Do nothing
1195   }
1196 
1197   if (!IsHyphenCode(*iter))
1198     return false;
1199 
1200   if ((iter + 1) != curText.rend()) {
1201     iter++;
1202     if (FXSYS_iswalpha(*iter) && FXSYS_iswalnum(curChar))
1203       return true;
1204   }
1205 
1206   const CharInfo* pPrevCharInfo = GetPrevCharInfo();
1207   return pPrevCharInfo &&
1208          pPrevCharInfo->m_CharType == CPDF_TextPage::CharType::kPiece &&
1209          IsHyphenCode(pPrevCharInfo->m_Unicode);
1210 }
1211 
GetPrevCharInfo() const1212 const CPDF_TextPage::CharInfo* CPDF_TextPage::GetPrevCharInfo() const {
1213   if (!m_TempCharList.empty())
1214     return &m_TempCharList.back();
1215   return !m_CharList.empty() ? &m_CharList.back() : nullptr;
1216 }
1217 
ProcessInsertObject(const CPDF_TextObject * pObj,const CFX_Matrix & formMatrix)1218 CPDF_TextPage::GenerateCharacter CPDF_TextPage::ProcessInsertObject(
1219     const CPDF_TextObject* pObj,
1220     const CFX_Matrix& formMatrix) {
1221   FindPreviousTextObject();
1222   TextOrientation WritingMode = GetTextObjectWritingMode(pObj);
1223   if (WritingMode == TextOrientation::kUnknown)
1224     WritingMode = GetTextObjectWritingMode(m_pPrevTextObj);
1225 
1226   size_t nItem = m_pPrevTextObj->CountItems();
1227   if (nItem == 0)
1228     return GenerateCharacter::kNone;
1229 
1230   CPDF_TextObject::Item PrevItem = m_pPrevTextObj->GetItemInfo(nItem - 1);
1231   CPDF_TextObject::Item item = pObj->GetItemInfo(0);
1232   const CFX_FloatRect& this_rect = pObj->GetRect();
1233   const CFX_FloatRect& prev_rect = m_pPrevTextObj->GetRect();
1234   WideString wstrItem = pObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
1235   if (wstrItem.IsEmpty())
1236     wstrItem += static_cast<wchar_t>(item.m_CharCode);
1237 
1238   wchar_t curChar = wstrItem[0];
1239   if (WritingMode == TextOrientation::kHorizontal) {
1240     if (EndHorizontalLine(this_rect, prev_rect)) {
1241       return IsHyphen(curChar) ? GenerateCharacter::kHyphen
1242                                : GenerateCharacter::kLineBreak;
1243     }
1244   } else if (WritingMode == TextOrientation::kVertical) {
1245     if (EndVerticalLine(this_rect, prev_rect, m_CurlineRect,
1246                         pObj->GetFontSize(), m_pPrevTextObj->GetFontSize())) {
1247       return IsHyphen(curChar) ? GenerateCharacter::kHyphen
1248                                : GenerateCharacter::kLineBreak;
1249     }
1250   }
1251 
1252   float last_pos = PrevItem.m_Origin.x;
1253   int nLastWidth =
1254       GetCharWidth(PrevItem.m_CharCode, m_pPrevTextObj->GetFont().Get());
1255   float last_width = nLastWidth * m_pPrevTextObj->GetFontSize() / 1000;
1256   last_width = fabs(last_width);
1257   int nThisWidth = GetCharWidth(item.m_CharCode, pObj->GetFont().Get());
1258   float this_width = fabs(nThisWidth * pObj->GetFontSize() / 1000);
1259   float threshold = std::max(last_width, this_width) / 4;
1260 
1261   CFX_Matrix prev_matrix = m_pPrevTextObj->GetTextMatrix() * m_PrevMatrix;
1262   CFX_Matrix prev_reverse = prev_matrix.GetInverse();
1263 
1264   CFX_PointF pos = prev_reverse.Transform(formMatrix.Transform(pObj->GetPos()));
1265   if (last_width < this_width)
1266     threshold = prev_reverse.TransformDistance(threshold);
1267 
1268   bool bNewline = false;
1269   if (WritingMode == TextOrientation::kHorizontal) {
1270     CFX_FloatRect rect = m_pPrevTextObj->GetRect();
1271     float rect_height = rect.Height();
1272     rect.Normalize();
1273     if ((rect.IsEmpty() && rect_height > 5) ||
1274         ((pos.y > threshold * 2 || pos.y < threshold * -3) &&
1275          (fabs(pos.y) >= 1 || fabs(pos.y) > fabs(pos.x)))) {
1276       bNewline = true;
1277       if (nItem > 1) {
1278         CPDF_TextObject::Item tempItem = m_pPrevTextObj->GetItemInfo(0);
1279         CFX_Matrix m = m_pPrevTextObj->GetTextMatrix();
1280         if (PrevItem.m_Origin.x > tempItem.m_Origin.x &&
1281             m_DisplayMatrix.a > 0.9 && m_DisplayMatrix.b < 0.1 &&
1282             m_DisplayMatrix.c < 0.1 && m_DisplayMatrix.d < -0.9 && m.b < 0.1 &&
1283             m.c < 0.1) {
1284           CFX_FloatRect re(0, m_pPrevTextObj->GetRect().bottom, 1000,
1285                            m_pPrevTextObj->GetRect().top);
1286           if (re.Contains(pObj->GetPos())) {
1287             bNewline = false;
1288           } else {
1289             if (CFX_FloatRect(0, pObj->GetRect().bottom, 1000,
1290                               pObj->GetRect().top)
1291                     .Contains(m_pPrevTextObj->GetPos())) {
1292               bNewline = false;
1293             }
1294           }
1295         }
1296       }
1297     }
1298   }
1299   if (bNewline) {
1300     return IsHyphen(curChar) ? GenerateCharacter::kHyphen
1301                              : GenerateCharacter::kLineBreak;
1302   }
1303 
1304   if (pObj->CountChars() == 1 && IsHyphenCode(curChar) && IsHyphen(curChar))
1305     return GenerateCharacter::kHyphen;
1306 
1307   if (curChar == L' ')
1308     return GenerateCharacter::kNone;
1309 
1310   WideString PrevStr =
1311       m_pPrevTextObj->GetFont()->UnicodeFromCharCode(PrevItem.m_CharCode);
1312   wchar_t preChar = PrevStr.Back();
1313   if (preChar == L' ')
1314     return GenerateCharacter::kNone;
1315 
1316   CFX_Matrix matrix = pObj->GetTextMatrix() * formMatrix;
1317   float threshold2 = std::max(nLastWidth, nThisWidth);
1318   threshold2 = NormalizeThreshold(threshold2, 400, 700, 800);
1319   if (nLastWidth >= nThisWidth) {
1320     threshold2 *= fabs(m_pPrevTextObj->GetFontSize());
1321   } else {
1322     threshold2 *= fabs(pObj->GetFontSize());
1323     threshold2 = matrix.TransformDistance(threshold2);
1324     threshold2 = prev_reverse.TransformDistance(threshold2);
1325   }
1326   threshold2 /= 1000;
1327   if ((threshold2 < 1.4881 && threshold2 > 1.4879) ||
1328       (threshold2 < 1.39001 && threshold2 > 1.38999)) {
1329     threshold2 *= 1.5;
1330   }
1331   return GenerateSpace(pos, last_pos, this_width, last_width, threshold2)
1332              ? GenerateCharacter::kSpace
1333              : GenerateCharacter::kNone;
1334 }
1335 
IsSameTextObject(CPDF_TextObject * pTextObj1,CPDF_TextObject * pTextObj2) const1336 bool CPDF_TextPage::IsSameTextObject(CPDF_TextObject* pTextObj1,
1337                                      CPDF_TextObject* pTextObj2) const {
1338   if (!pTextObj1 || !pTextObj2)
1339     return false;
1340 
1341   CFX_FloatRect rcPreObj = pTextObj2->GetRect();
1342   const CFX_FloatRect& rcCurObj = pTextObj1->GetRect();
1343   if (rcPreObj.IsEmpty() && rcCurObj.IsEmpty()) {
1344     float dbXdif = fabs(rcPreObj.left - rcCurObj.left);
1345     size_t nCount = m_CharList.size();
1346     if (nCount >= 2) {
1347       float dbSpace = m_CharList[nCount - 2].m_CharBox.Width();
1348       if (dbXdif > dbSpace)
1349         return false;
1350     }
1351   }
1352   if (!rcPreObj.IsEmpty() || !rcCurObj.IsEmpty()) {
1353     rcPreObj.Intersect(rcCurObj);
1354     if (rcPreObj.IsEmpty())
1355       return false;
1356     if (fabs(rcPreObj.Width() - rcCurObj.Width()) > rcCurObj.Width() / 2) {
1357       return false;
1358     }
1359     if (pTextObj2->GetFontSize() != pTextObj1->GetFontSize())
1360       return false;
1361   }
1362 
1363   size_t nPreCount = pTextObj2->CountItems();
1364   if (nPreCount != pTextObj1->CountItems())
1365     return false;
1366 
1367   // If both objects have no items, consider them same.
1368   if (nPreCount == 0)
1369     return true;
1370 
1371   CPDF_TextObject::Item itemPer;
1372   CPDF_TextObject::Item itemCur;
1373   for (size_t i = 0; i < nPreCount; ++i) {
1374     itemPer = pTextObj2->GetItemInfo(i);
1375     itemCur = pTextObj1->GetItemInfo(i);
1376     if (itemCur.m_CharCode != itemPer.m_CharCode)
1377       return false;
1378   }
1379 
1380   CFX_PointF diff = pTextObj1->GetPos() - pTextObj2->GetPos();
1381   float font_size = pTextObj2->GetFontSize();
1382   float char_size =
1383       GetCharWidth(itemPer.m_CharCode, pTextObj2->GetFont().Get());
1384   float max_pre_size =
1385       std::max(std::max(rcPreObj.Height(), rcPreObj.Width()), font_size);
1386   return fabs(diff.x) <= 0.9 * char_size * font_size / 1000 &&
1387          fabs(diff.y) <= max_pre_size / 8;
1388 }
1389 
IsSameAsPreTextObject(CPDF_TextObject * pTextObj,const CPDF_PageObjectHolder * pObjList,CPDF_PageObjectHolder::const_iterator iter) const1390 bool CPDF_TextPage::IsSameAsPreTextObject(
1391     CPDF_TextObject* pTextObj,
1392     const CPDF_PageObjectHolder* pObjList,
1393     CPDF_PageObjectHolder::const_iterator iter) const {
1394   int i = 0;
1395   while (i < 5 && iter != pObjList->begin()) {
1396     --iter;
1397     CPDF_PageObject* pOtherObj = iter->get();
1398     if (pOtherObj == pTextObj || !pOtherObj->IsText())
1399       continue;
1400     if (IsSameTextObject(pOtherObj->AsText(), pTextObj))
1401       return true;
1402     ++i;
1403   }
1404   return false;
1405 }
1406 
GenerateCharInfo(wchar_t unicode)1407 absl::optional<CPDF_TextPage::CharInfo> CPDF_TextPage::GenerateCharInfo(
1408     wchar_t unicode) {
1409   const CharInfo* pPrevCharInfo = GetPrevCharInfo();
1410   if (!pPrevCharInfo)
1411     return absl::nullopt;
1412 
1413   CharInfo info;
1414   info.m_Index = m_TextBuf.GetLength();
1415   info.m_CharCode = CPDF_Font::kInvalidCharCode;
1416   info.m_Unicode = unicode;
1417   info.m_CharType = CPDF_TextPage::CharType::kGenerated;
1418 
1419   int preWidth = 0;
1420   if (pPrevCharInfo->m_pTextObj &&
1421       pPrevCharInfo->m_CharCode != CPDF_Font::kInvalidCharCode) {
1422     preWidth = GetCharWidth(pPrevCharInfo->m_CharCode,
1423                             pPrevCharInfo->m_pTextObj->GetFont().Get());
1424   }
1425 
1426   float fFontSize = pPrevCharInfo->m_pTextObj
1427                         ? pPrevCharInfo->m_pTextObj->GetFontSize()
1428                         : pPrevCharInfo->m_CharBox.Height();
1429   if (!fFontSize)
1430     fFontSize = kDefaultFontSize;
1431 
1432   info.m_Origin =
1433       CFX_PointF(pPrevCharInfo->m_Origin.x + preWidth * (fFontSize) / 1000,
1434                  pPrevCharInfo->m_Origin.y);
1435   info.m_CharBox = CFX_FloatRect(info.m_Origin.x, info.m_Origin.y,
1436                                  info.m_Origin.x, info.m_Origin.y);
1437   return info;
1438 }
1439