1 // Copyright 2014 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "core/fpdftext/cpdf_textpage.h"
8
9 #include <math.h>
10 #include <stdint.h>
11
12 #include <algorithm>
13 #include <utility>
14 #include <vector>
15
16 #include "core/fpdfapi/font/cpdf_cidfont.h"
17 #include "core/fpdfapi/font/cpdf_font.h"
18 #include "core/fpdfapi/page/cpdf_form.h"
19 #include "core/fpdfapi/page/cpdf_formobject.h"
20 #include "core/fpdfapi/page/cpdf_page.h"
21 #include "core/fpdfapi/page/cpdf_pageobject.h"
22 #include "core/fpdfapi/page/cpdf_textobject.h"
23 #include "core/fpdfapi/parser/cpdf_dictionary.h"
24 #include "core/fpdfapi/parser/cpdf_string.h"
25 #include "core/fpdftext/unicodenormalizationdata.h"
26 #include "core/fxcrt/data_vector.h"
27 #include "core/fxcrt/fx_bidi.h"
28 #include "core/fxcrt/fx_extension.h"
29 #include "core/fxcrt/fx_unicode.h"
30 #include "core/fxcrt/stl_util.h"
31 #include "third_party/base/check.h"
32 #include "third_party/base/check_op.h"
33
34 namespace {
35
36 constexpr float kDefaultFontSize = 1.0f;
37 constexpr float kSizeEpsilon = 0.01f;
38
39 const uint16_t* const kUnicodeDataNormalizationMaps[] = {
40 kUnicodeDataNormalizationMap2, kUnicodeDataNormalizationMap3,
41 kUnicodeDataNormalizationMap4};
42
NormalizeThreshold(float threshold,int t1,int t2,int t3)43 float NormalizeThreshold(float threshold, int t1, int t2, int t3) {
44 DCHECK(t1 < t2);
45 DCHECK(t2 < t3);
46 if (threshold < t1)
47 return threshold / 2.0f;
48 if (threshold < t2)
49 return threshold / 4.0f;
50 if (threshold < t3)
51 return threshold / 5.0f;
52 return threshold / 6.0f;
53 }
54
CalculateBaseSpace(const CPDF_TextObject * pTextObj,const CFX_Matrix & matrix)55 float CalculateBaseSpace(const CPDF_TextObject* pTextObj,
56 const CFX_Matrix& matrix) {
57 const size_t nItems = pTextObj->CountItems();
58 if (!pTextObj->m_TextState.GetCharSpace() || nItems < 3)
59 return 0.0f;
60
61 bool bAllChar = true;
62 float spacing =
63 matrix.TransformDistance(pTextObj->m_TextState.GetCharSpace());
64 float baseSpace = spacing;
65 for (size_t i = 0; i < nItems; ++i) {
66 CPDF_TextObject::Item item = pTextObj->GetItemInfo(i);
67 if (item.m_CharCode == 0xffffffff) {
68 float fontsize_h = pTextObj->m_TextState.GetFontSizeH();
69 float kerning = -fontsize_h * item.m_Origin.x / 1000;
70 baseSpace = std::min(baseSpace, kerning + spacing);
71 bAllChar = false;
72 }
73 }
74 if (baseSpace < 0.0 || (nItems == 3 && !bAllChar))
75 return 0.0f;
76
77 return baseSpace;
78 }
79
GetUnicodeNormalization(wchar_t wch)80 DataVector<wchar_t> GetUnicodeNormalization(wchar_t wch) {
81 wch = wch & 0xFFFF;
82 wchar_t wFind = kUnicodeDataNormalization[wch];
83 if (!wFind)
84 return DataVector<wchar_t>(1, wch);
85
86 if (wFind >= 0x8000) {
87 return DataVector<wchar_t>(1,
88 kUnicodeDataNormalizationMap1[wFind - 0x8000]);
89 }
90
91 wch = wFind & 0x0FFF;
92 wFind >>= 12;
93 const uint16_t* pMap = kUnicodeDataNormalizationMaps[wFind - 2] + wch;
94 if (wFind == 4)
95 wFind = static_cast<wchar_t>(*pMap++);
96
97 return DataVector<wchar_t>(pMap, pMap + wFind);
98 }
99
// Returns the fraction (0..1) of entries in mask[start, end) that are set.
// An empty or inverted range yields 0. The caller must ensure
// 0 <= start and end <= mask.size().
float MaskPercentFilled(const std::vector<bool>& mask,
                        int32_t start,
                        int32_t end) {
  if (start >= end)
    return 0;

  const auto first = mask.begin() + start;
  const auto last = mask.begin() + end;
  const float filled = static_cast<float>(std::count(first, last, true));
  return filled / (end - start);
}
109
IsControlChar(const CPDF_TextPage::CharInfo & char_info)110 bool IsControlChar(const CPDF_TextPage::CharInfo& char_info) {
111 switch (char_info.m_Unicode) {
112 case 0x2:
113 case 0x3:
114 case 0x93:
115 case 0x94:
116 case 0x96:
117 case 0x97:
118 case 0x98:
119 case 0xfffe:
120 return char_info.m_CharType != CPDF_TextPage::CharType::kHyphen;
121 default:
122 return false;
123 }
124 }
125
// Returns true if |c| is U+002D (HYPHEN-MINUS) or U+00AD (SOFT HYPHEN).
bool IsHyphenCode(wchar_t c) {
  switch (c) {
    case 0x2D:
    case 0xAD:
      return true;
    default:
      return false;
  }
}
129
IsRectIntersect(const CFX_FloatRect & rect1,const CFX_FloatRect & rect2)130 bool IsRectIntersect(const CFX_FloatRect& rect1, const CFX_FloatRect& rect2) {
131 CFX_FloatRect rect = rect1;
132 rect.Intersect(rect2);
133 return !rect.IsEmpty();
134 }
135
IsRightToLeft(const CPDF_TextObject & text_obj,const CPDF_Font & font)136 bool IsRightToLeft(const CPDF_TextObject& text_obj, const CPDF_Font& font) {
137 const size_t nItems = text_obj.CountItems();
138 WideString str;
139 str.Reserve(nItems);
140 for (size_t i = 0; i < nItems; ++i) {
141 CPDF_TextObject::Item item = text_obj.GetItemInfo(i);
142 if (item.m_CharCode == 0xffffffff)
143 continue;
144 WideString wstrItem = font.UnicodeFromCharCode(item.m_CharCode);
145 wchar_t wChar = !wstrItem.IsEmpty() ? wstrItem[0] : 0;
146 if (wChar == 0)
147 wChar = item.m_CharCode;
148 if (wChar)
149 str += wChar;
150 }
151 return CFX_BidiString(str).OverallDirection() ==
152 CFX_BidiChar::Direction::kRight;
153 }
154
GetCharWidth(uint32_t charCode,CPDF_Font * pFont)155 int GetCharWidth(uint32_t charCode, CPDF_Font* pFont) {
156 if (charCode == CPDF_Font::kInvalidCharCode)
157 return 0;
158
159 int w = pFont->GetCharWidthF(charCode);
160 if (w > 0)
161 return w;
162
163 ByteString str;
164 pFont->AppendChar(&str, charCode);
165 w = pFont->GetStringWidth(str.AsStringView());
166 if (w > 0)
167 return w;
168
169 FX_RECT rect = pFont->GetCharBBox(charCode);
170 if (!rect.Valid())
171 return 0;
172
173 return std::max(rect.Width(), 0);
174 }
175
GenerateSpace(const CFX_PointF & pos,float last_pos,float this_width,float last_width,float threshold)176 bool GenerateSpace(const CFX_PointF& pos,
177 float last_pos,
178 float this_width,
179 float last_width,
180 float threshold) {
181 if (fabs(last_pos + last_width - pos.x) <= threshold)
182 return false;
183
184 float threshold_pos = threshold + last_width;
185 float pos_difference = pos.x - last_pos;
186 if (fabs(pos_difference) > threshold_pos)
187 return true;
188 if (pos.x < 0 && -threshold_pos > pos_difference)
189 return true;
190 return pos_difference > this_width + last_width;
191 }
192
EndHorizontalLine(const CFX_FloatRect & this_rect,const CFX_FloatRect & prev_rect)193 bool EndHorizontalLine(const CFX_FloatRect& this_rect,
194 const CFX_FloatRect& prev_rect) {
195 if (this_rect.Height() <= 4.5 || prev_rect.Height() <= 4.5)
196 return false;
197
198 float top = std::min(this_rect.top, prev_rect.top);
199 float bottom = std::max(this_rect.bottom, prev_rect.bottom);
200 return bottom >= top;
201 }
202
EndVerticalLine(const CFX_FloatRect & this_rect,const CFX_FloatRect & prev_rect,const CFX_FloatRect & curline_rect,float this_fontsize,float prev_fontsize)203 bool EndVerticalLine(const CFX_FloatRect& this_rect,
204 const CFX_FloatRect& prev_rect,
205 const CFX_FloatRect& curline_rect,
206 float this_fontsize,
207 float prev_fontsize) {
208 if (this_rect.Width() <= this_fontsize * 0.1f ||
209 prev_rect.Width() <= prev_fontsize * 0.1f) {
210 return false;
211 }
212
213 float left = std::max(this_rect.left, curline_rect.left);
214 float right = std::min(this_rect.right, curline_rect.right);
215 return right <= left;
216 }
217
GetPageMatrix(const CPDF_Page * pPage)218 CFX_Matrix GetPageMatrix(const CPDF_Page* pPage) {
219 const FX_RECT rect(0, 0, static_cast<int>(pPage->GetPageWidth()),
220 static_cast<int>(pPage->GetPageHeight()));
221 return pPage->GetDisplayMatrix(rect, 0);
222 }
223
GetFontSize(const CPDF_TextObject * text_object)224 float GetFontSize(const CPDF_TextObject* text_object) {
225 bool has_font = text_object && text_object->GetFont();
226 return has_font ? text_object->GetFontSize() : kDefaultFontSize;
227 }
228
GetLooseBounds(const CPDF_TextPage::CharInfo & charinfo)229 CFX_FloatRect GetLooseBounds(const CPDF_TextPage::CharInfo& charinfo) {
230 float font_size = GetFontSize(charinfo.m_pTextObj);
231 if (charinfo.m_pTextObj && !FXSYS_IsFloatZero(font_size)) {
232 bool is_vert_writing = charinfo.m_pTextObj->GetFont()->IsVertWriting();
233 if (is_vert_writing && charinfo.m_pTextObj->GetFont()->IsCIDFont()) {
234 CPDF_CIDFont* pCIDFont = charinfo.m_pTextObj->GetFont()->AsCIDFont();
235 uint16_t cid = pCIDFont->CIDFromCharCode(charinfo.m_CharCode);
236
237 CFX_Point16 vertical_origin = pCIDFont->GetVertOrigin(cid);
238 double offsetx = (vertical_origin.x - 500) * font_size / 1000.0;
239 double offsety = vertical_origin.y * font_size / 1000.0;
240 int16_t vert_width = pCIDFont->GetVertWidth(cid);
241 double height = vert_width * font_size / 1000.0;
242
243 float left = charinfo.m_Origin.x + offsetx;
244 float right = left + font_size;
245 float bottom = charinfo.m_Origin.y + offsety;
246 float top = bottom + height;
247 return CFX_FloatRect(left, bottom, right, top);
248 }
249
250 int ascent = charinfo.m_pTextObj->GetFont()->GetTypeAscent();
251 int descent = charinfo.m_pTextObj->GetFont()->GetTypeDescent();
252 if (ascent != descent) {
253 float width = charinfo.m_Matrix.a *
254 charinfo.m_pTextObj->GetCharWidth(charinfo.m_CharCode);
255 float font_scale = charinfo.m_Matrix.a * font_size / (ascent - descent);
256
257 float left = charinfo.m_Origin.x;
258 float right = charinfo.m_Origin.x + (is_vert_writing ? -width : width);
259 float bottom = charinfo.m_Origin.y + descent * font_scale;
260 float top = charinfo.m_Origin.y + ascent * font_scale;
261 return CFX_FloatRect(left, bottom, right, top);
262 }
263 }
264
265 // Fallback to the tight bounds in empty text scenarios, or bad font metrics
266 return charinfo.m_CharBox;
267 }
268
269 } // namespace
270
// Compiler-generated default constructor, copy constructor and destructor for
// TransformedTextObject, defined out-of-line in this translation unit.
CPDF_TextPage::TransformedTextObject::TransformedTextObject() = default;

CPDF_TextPage::TransformedTextObject::TransformedTextObject(
    const TransformedTextObject& that) = default;

CPDF_TextPage::TransformedTextObject::~TransformedTextObject() = default;
277
// Compiler-generated default constructor, copy constructor and destructor for
// CharInfo, defined out-of-line in this translation unit.
CPDF_TextPage::CharInfo::CharInfo() = default;

CPDF_TextPage::CharInfo::CharInfo(const CharInfo&) = default;

CPDF_TextPage::CharInfo::~CharInfo() = default;
283
// Builds the text page for |pPage|. The display matrix is computed from the
// page up front, and all text extraction is performed eagerly via Init().
// |rtl| is stored; CloseTempLine() uses it to force the overall bidi direction
// of each line to right-to-left.
CPDF_TextPage::CPDF_TextPage(const CPDF_Page* pPage, bool rtl)
    : m_pPage(pPage), m_rtl(rtl), m_DisplayMatrix(GetPageMatrix(pPage)) {
  Init();
}

CPDF_TextPage::~CPDF_TextPage() = default;
290
Init()291 void CPDF_TextPage::Init() {
292 m_TextBuf.SetAllocStep(10240);
293 ProcessObject();
294
295 const int nCount = CountChars();
296 if (nCount)
297 m_CharIndices.push_back({0, 0});
298
299 bool skipped = false;
300 for (int i = 0; i < nCount; ++i) {
301 const CharInfo& charinfo = m_CharList[i];
302 if (charinfo.m_CharType == CPDF_TextPage::CharType::kGenerated ||
303 (charinfo.m_Unicode != 0 && !IsControlChar(charinfo)) ||
304 (charinfo.m_Unicode == 0 && charinfo.m_CharCode != 0)) {
305 m_CharIndices.back().count++;
306 skipped = true;
307 } else {
308 if (skipped) {
309 m_CharIndices.push_back({i + 1, 0});
310 skipped = false;
311 } else {
312 m_CharIndices.back().index = i + 1;
313 }
314 }
315 }
316 }
317
// Returns the number of entries in m_CharList as an int.
int CPDF_TextPage::CountChars() const {
  return fxcrt::CollectionSize<int>(m_CharList);
}
321
CharIndexFromTextIndex(int text_index) const322 int CPDF_TextPage::CharIndexFromTextIndex(int text_index) const {
323 int count = 0;
324 for (const auto& info : m_CharIndices) {
325 count += info.count;
326 if (count > text_index)
327 return text_index - count + info.count + info.index;
328 }
329 return -1;
330 }
331
TextIndexFromCharIndex(int char_index) const332 int CPDF_TextPage::TextIndexFromCharIndex(int char_index) const {
333 int count = 0;
334 for (const auto& info : m_CharIndices) {
335 int text_index = char_index - info.index;
336 if (text_index < info.count)
337 return text_index >= 0 ? text_index + count : -1;
338
339 count += info.count;
340 }
341 return -1;
342 }
343
GetRectArray(int start,int count) const344 std::vector<CFX_FloatRect> CPDF_TextPage::GetRectArray(int start,
345 int count) const {
346 std::vector<CFX_FloatRect> rects;
347 if (start < 0 || count == 0)
348 return rects;
349
350 const int number_of_chars = CountChars();
351 if (start >= number_of_chars)
352 return rects;
353
354 if (count < 0 || start + count > number_of_chars)
355 count = number_of_chars - start;
356 DCHECK(count > 0);
357
358 const CPDF_TextObject* text_object = nullptr;
359 CFX_FloatRect rect;
360 int pos = start;
361 bool is_new_rect = true;
362 while (count--) {
363 const CharInfo& charinfo = m_CharList[pos++];
364 if (charinfo.m_CharType == CPDF_TextPage::CharType::kGenerated)
365 continue;
366 if (charinfo.m_CharBox.Width() < kSizeEpsilon ||
367 charinfo.m_CharBox.Height() < kSizeEpsilon) {
368 continue;
369 }
370 if (!text_object)
371 text_object = charinfo.m_pTextObj;
372 if (text_object != charinfo.m_pTextObj) {
373 rects.push_back(rect);
374 text_object = charinfo.m_pTextObj;
375 is_new_rect = true;
376 }
377 if (is_new_rect) {
378 is_new_rect = false;
379 rect = charinfo.m_CharBox;
380 rect.Normalize();
381 continue;
382 }
383 rect.Union(charinfo.m_CharBox);
384 }
385 rects.push_back(rect);
386 return rects;
387 }
388
GetIndexAtPos(const CFX_PointF & point,const CFX_SizeF & tolerance) const389 int CPDF_TextPage::GetIndexAtPos(const CFX_PointF& point,
390 const CFX_SizeF& tolerance) const {
391 int pos;
392 int NearPos = -1;
393 double xdif = 5000;
394 double ydif = 5000;
395 const int nCount = CountChars();
396 for (pos = 0; pos < nCount; ++pos) {
397 const CFX_FloatRect& orig_charrect = m_CharList[pos].m_CharBox;
398 if (orig_charrect.Contains(point))
399 break;
400
401 if (tolerance.width <= 0 && tolerance.height <= 0)
402 continue;
403
404 CFX_FloatRect charrect = orig_charrect;
405 charrect.Normalize();
406 CFX_FloatRect char_rect_ext(charrect.left - tolerance.width / 2,
407 charrect.bottom - tolerance.height / 2,
408 charrect.right + tolerance.width / 2,
409 charrect.top + tolerance.height / 2);
410 if (!char_rect_ext.Contains(point))
411 continue;
412
413 double curXdif =
414 std::min(fabs(point.x - charrect.left), fabs(point.x - charrect.right));
415 double curYdif =
416 std::min(fabs(point.y - charrect.bottom), fabs(point.y - charrect.top));
417 if (curYdif + curXdif < xdif + ydif) {
418 ydif = curYdif;
419 xdif = curXdif;
420 NearPos = pos;
421 }
422 }
423 return pos < nCount ? pos : NearPos;
424 }
425
GetTextByPredicate(const std::function<bool (const CharInfo &)> & predicate) const426 WideString CPDF_TextPage::GetTextByPredicate(
427 const std::function<bool(const CharInfo&)>& predicate) const {
428 float posy = 0;
429 bool IsContainPreChar = false;
430 bool IsAddLineFeed = false;
431 WideString strText;
432 for (const auto& charinfo : m_CharList) {
433 if (predicate(charinfo)) {
434 if (fabs(posy - charinfo.m_Origin.y) > 0 && !IsContainPreChar &&
435 IsAddLineFeed) {
436 posy = charinfo.m_Origin.y;
437 if (!strText.IsEmpty())
438 strText += L"\r\n";
439 }
440 IsContainPreChar = true;
441 IsAddLineFeed = false;
442 if (charinfo.m_Unicode)
443 strText += charinfo.m_Unicode;
444 } else if (charinfo.m_Unicode == L' ') {
445 if (IsContainPreChar) {
446 strText += L' ';
447 IsContainPreChar = false;
448 IsAddLineFeed = false;
449 }
450 } else {
451 IsContainPreChar = false;
452 IsAddLineFeed = true;
453 }
454 }
455 return strText;
456 }
457
GetTextByRect(const CFX_FloatRect & rect) const458 WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const {
459 return GetTextByPredicate([&rect](const CharInfo& charinfo) {
460 return IsRectIntersect(rect, charinfo.m_CharBox);
461 });
462 }
463
GetTextByObject(const CPDF_TextObject * pTextObj) const464 WideString CPDF_TextPage::GetTextByObject(
465 const CPDF_TextObject* pTextObj) const {
466 return GetTextByPredicate([pTextObj](const CharInfo& charinfo) {
467 return charinfo.m_pTextObj == pTextObj;
468 });
469 }
470
// Returns the character record at |index|. |index| must be a valid position in
// m_CharList; this is enforced with CHECK.
const CPDF_TextPage::CharInfo& CPDF_TextPage::GetCharInfo(size_t index) const {
  CHECK(index < m_CharList.size());
  return m_CharList[index];
}
475
GetCharFontSize(size_t index) const476 float CPDF_TextPage::GetCharFontSize(size_t index) const {
477 CHECK(index < m_CharList.size());
478 return GetFontSize(m_CharList[index].m_pTextObj);
479 }
480
GetCharLooseBounds(size_t index) const481 CFX_FloatRect CPDF_TextPage::GetCharLooseBounds(size_t index) const {
482 return GetLooseBounds(GetCharInfo(index));
483 }
484
GetPageText(int start,int count) const485 WideString CPDF_TextPage::GetPageText(int start, int count) const {
486 if (start < 0 || start >= CountChars() || count <= 0 || m_CharList.empty() ||
487 m_TextBuf.IsEmpty()) {
488 return WideString();
489 }
490
491 const int count_chars = CountChars();
492 int text_start = TextIndexFromCharIndex(start);
493
494 // If the character at |start| is a non-printing character, then
495 // TextIndexFromCharIndex will return -1, so scan ahead to the first printing
496 // character.
497 while (text_start < 0) {
498 if (start >= count_chars)
499 return WideString();
500 start++;
501 text_start = TextIndexFromCharIndex(start);
502 }
503
504 count = std::min(count, count_chars - start);
505
506 int last = start + count - 1;
507 int text_last = TextIndexFromCharIndex(last);
508
509 // If the character at |last| is a non-printing character, then
510 // TextIndexFromCharIndex will return -1, so scan back to the last printing
511 // character.
512 while (text_last < 0) {
513 if (last < text_start)
514 return WideString();
515
516 last--;
517 text_last = TextIndexFromCharIndex(last);
518 }
519
520 if (text_last < text_start)
521 return WideString();
522
523 int text_count = text_last - text_start + 1;
524
525 return WideString(m_TextBuf.AsStringView().Substr(text_start, text_count));
526 }
527
CountRects(int start,int nCount)528 int CPDF_TextPage::CountRects(int start, int nCount) {
529 if (start < 0)
530 return -1;
531
532 m_SelRects = GetRectArray(start, nCount);
533 return fxcrt::CollectionSize<int>(m_SelRects);
534 }
535
GetRect(int rectIndex,CFX_FloatRect * pRect) const536 bool CPDF_TextPage::GetRect(int rectIndex, CFX_FloatRect* pRect) const {
537 if (!fxcrt::IndexInBounds(m_SelRects, rectIndex))
538 return false;
539
540 *pRect = m_SelRects[rectIndex];
541 return true;
542 }
543
FindTextlineFlowOrientation() const544 CPDF_TextPage::TextOrientation CPDF_TextPage::FindTextlineFlowOrientation()
545 const {
546 DCHECK_NE(m_pPage->GetPageObjectCount(), 0u);
547
548 const int32_t nPageWidth = static_cast<int32_t>(m_pPage->GetPageWidth());
549 const int32_t nPageHeight = static_cast<int32_t>(m_pPage->GetPageHeight());
550 if (nPageWidth <= 0 || nPageHeight <= 0)
551 return TextOrientation::kUnknown;
552
553 std::vector<bool> nHorizontalMask(nPageWidth);
554 std::vector<bool> nVerticalMask(nPageHeight);
555 float fLineHeight = 0.0f;
556 int32_t nStartH = nPageWidth;
557 int32_t nEndH = 0;
558 int32_t nStartV = nPageHeight;
559 int32_t nEndV = 0;
560 for (const auto& pPageObj : *m_pPage) {
561 if (!pPageObj->IsText())
562 continue;
563
564 int32_t minH = static_cast<int32_t>(
565 std::clamp<float>(pPageObj->GetRect().left, 0.0f, nPageWidth));
566 int32_t maxH = static_cast<int32_t>(
567 std::clamp<float>(pPageObj->GetRect().right, 0.0f, nPageWidth));
568 int32_t minV = static_cast<int32_t>(
569 std::clamp<float>(pPageObj->GetRect().bottom, 0.0f, nPageHeight));
570 int32_t maxV = static_cast<int32_t>(
571 std::clamp<float>(pPageObj->GetRect().top, 0.0f, nPageHeight));
572 if (minH >= maxH || minV >= maxV)
573 continue;
574
575 for (int32_t i = minH; i < maxH; ++i)
576 nHorizontalMask[i] = true;
577 for (int32_t i = minV; i < maxV; ++i)
578 nVerticalMask[i] = true;
579
580 nStartH = std::min(nStartH, minH);
581 nEndH = std::max(nEndH, maxH);
582 nStartV = std::min(nStartV, minV);
583 nEndV = std::max(nEndV, maxV);
584
585 if (fLineHeight <= 0.0f)
586 fLineHeight = pPageObj->GetRect().Height();
587 }
588 const int32_t nDoubleLineHeight = 2 * fLineHeight;
589 if ((nEndV - nStartV) < nDoubleLineHeight)
590 return TextOrientation::kHorizontal;
591 if ((nEndH - nStartH) < nDoubleLineHeight)
592 return TextOrientation::kVertical;
593
594 const float nSumH = MaskPercentFilled(nHorizontalMask, nStartH, nEndH);
595 if (nSumH > 0.8f)
596 return TextOrientation::kHorizontal;
597
598 const float nSumV = MaskPercentFilled(nVerticalMask, nStartV, nEndV);
599 if (nSumH > nSumV)
600 return TextOrientation::kHorizontal;
601 if (nSumH < nSumV)
602 return TextOrientation::kVertical;
603 return TextOrientation::kUnknown;
604 }
605
AppendGeneratedCharacter(wchar_t unicode,const CFX_Matrix & formMatrix)606 void CPDF_TextPage::AppendGeneratedCharacter(wchar_t unicode,
607 const CFX_Matrix& formMatrix) {
608 absl::optional<CharInfo> pGenerateChar = GenerateCharInfo(unicode);
609 if (!pGenerateChar.has_value())
610 return;
611
612 m_TextBuf.AppendChar(unicode);
613 if (!formMatrix.IsIdentity())
614 pGenerateChar->m_Matrix = formMatrix;
615 m_CharList.push_back(pGenerateChar.value());
616 }
617
ProcessObject()618 void CPDF_TextPage::ProcessObject() {
619 if (m_pPage->GetPageObjectCount() == 0)
620 return;
621
622 m_TextlineDir = FindTextlineFlowOrientation();
623 for (auto it = m_pPage->begin(); it != m_pPage->end(); ++it) {
624 CPDF_PageObject* pObj = it->get();
625 if (!pObj)
626 continue;
627
628 CFX_Matrix matrix;
629 if (pObj->IsText())
630 ProcessTextObject(pObj->AsText(), matrix, m_pPage, it);
631 else if (pObj->IsForm())
632 ProcessFormObject(pObj->AsForm(), matrix);
633 }
634 for (const auto& obj : mTextObjects)
635 ProcessTextObject(obj);
636
637 mTextObjects.clear();
638 CloseTempLine();
639 }
640
ProcessFormObject(CPDF_FormObject * pFormObj,const CFX_Matrix & formMatrix)641 void CPDF_TextPage::ProcessFormObject(CPDF_FormObject* pFormObj,
642 const CFX_Matrix& formMatrix) {
643 CFX_Matrix curFormMatrix = pFormObj->form_matrix() * formMatrix;
644 const CPDF_PageObjectHolder* pHolder = pFormObj->form();
645 for (auto it = pHolder->begin(); it != pHolder->end(); ++it) {
646 CPDF_PageObject* pPageObj = it->get();
647 if (!pPageObj)
648 continue;
649
650 if (pPageObj->IsText())
651 ProcessTextObject(pPageObj->AsText(), curFormMatrix, pHolder, it);
652 else if (pPageObj->IsForm())
653 ProcessFormObject(pPageObj->AsForm(), curFormMatrix);
654 }
655 }
656
AddCharInfoByLRDirection(wchar_t wChar,const CharInfo & info)657 void CPDF_TextPage::AddCharInfoByLRDirection(wchar_t wChar,
658 const CharInfo& info) {
659 CharInfo info2 = info;
660 if (IsControlChar(info2)) {
661 info2.m_Index = -1;
662 m_CharList.push_back(info2);
663 return;
664 }
665 info2.m_Index = m_TextBuf.GetLength();
666 DataVector<wchar_t> normalized;
667 if (wChar >= 0xFB00 && wChar <= 0xFB06)
668 normalized = GetUnicodeNormalization(wChar);
669 if (normalized.empty()) {
670 m_TextBuf.AppendChar(wChar);
671 m_CharList.push_back(info2);
672 return;
673 }
674 for (wchar_t normalized_char : normalized) {
675 info2.m_Unicode = normalized_char;
676 info2.m_CharType = CPDF_TextPage::CharType::kPiece;
677 m_TextBuf.AppendChar(info2.m_Unicode);
678 m_CharList.push_back(info2);
679 }
680 }
681
AddCharInfoByRLDirection(wchar_t wChar,const CharInfo & info)682 void CPDF_TextPage::AddCharInfoByRLDirection(wchar_t wChar,
683 const CharInfo& info) {
684 CharInfo info2 = info;
685 if (IsControlChar(info2)) {
686 info2.m_Index = -1;
687 m_CharList.push_back(info2);
688 return;
689 }
690 info2.m_Index = m_TextBuf.GetLength();
691 wChar = pdfium::unicode::GetMirrorChar(wChar);
692 DataVector<wchar_t> normalized = GetUnicodeNormalization(wChar);
693 if (normalized.empty()) {
694 info2.m_Unicode = wChar;
695 m_TextBuf.AppendChar(info2.m_Unicode);
696 m_CharList.push_back(info2);
697 return;
698 }
699 for (wchar_t normalized_char : normalized) {
700 info2.m_Unicode = normalized_char;
701 info2.m_CharType = CPDF_TextPage::CharType::kPiece;
702 m_TextBuf.AppendChar(info2.m_Unicode);
703 m_CharList.push_back(info2);
704 }
705 }
706
CloseTempLine()707 void CPDF_TextPage::CloseTempLine() {
708 if (m_TempCharList.empty())
709 return;
710
711 WideString str = m_TempTextBuf.MakeString();
712 bool bPrevSpace = false;
713 for (size_t i = 0; i < str.GetLength(); ++i) {
714 if (str[i] != ' ') {
715 bPrevSpace = false;
716 continue;
717 }
718 if (bPrevSpace) {
719 m_TempTextBuf.Delete(i, 1);
720 m_TempCharList.erase(m_TempCharList.begin() + i);
721 str.Delete(i);
722 --i;
723 }
724 bPrevSpace = true;
725 }
726 CFX_BidiString bidi(str);
727 if (m_rtl)
728 bidi.SetOverallDirectionRight();
729 CFX_BidiChar::Direction eCurrentDirection = bidi.OverallDirection();
730 for (const auto& segment : bidi) {
731 if (segment.direction == CFX_BidiChar::Direction::kRight ||
732 (segment.direction == CFX_BidiChar::Direction::kNeutral &&
733 eCurrentDirection == CFX_BidiChar::Direction::kRight)) {
734 eCurrentDirection = CFX_BidiChar::Direction::kRight;
735 for (int m = segment.start + segment.count; m > segment.start; --m)
736 AddCharInfoByRLDirection(str[m - 1], m_TempCharList[m - 1]);
737 } else {
738 if (segment.direction != CFX_BidiChar::Direction::kLeftWeak) {
739 eCurrentDirection = CFX_BidiChar::Direction::kLeft;
740 }
741 for (int m = segment.start; m < segment.start + segment.count; ++m)
742 AddCharInfoByLRDirection(str[m], m_TempCharList[m]);
743 }
744 }
745 m_TempCharList.clear();
746 m_TempTextBuf.Delete(0, m_TempTextBuf.GetLength());
747 }
748
ProcessTextObject(CPDF_TextObject * pTextObj,const CFX_Matrix & formMatrix,const CPDF_PageObjectHolder * pObjList,CPDF_PageObjectHolder::const_iterator ObjPos)749 void CPDF_TextPage::ProcessTextObject(
750 CPDF_TextObject* pTextObj,
751 const CFX_Matrix& formMatrix,
752 const CPDF_PageObjectHolder* pObjList,
753 CPDF_PageObjectHolder::const_iterator ObjPos) {
754 if (fabs(pTextObj->GetRect().Width()) < kSizeEpsilon)
755 return;
756
757 size_t count = mTextObjects.size();
758 TransformedTextObject new_obj;
759 new_obj.m_pTextObj = pTextObj;
760 new_obj.m_formMatrix = formMatrix;
761 if (count == 0) {
762 mTextObjects.push_back(new_obj);
763 return;
764 }
765 if (IsSameAsPreTextObject(pTextObj, pObjList, ObjPos))
766 return;
767
768 TransformedTextObject prev_obj = mTextObjects[count - 1];
769 size_t nItem = prev_obj.m_pTextObj->CountItems();
770 if (nItem == 0)
771 return;
772
773 CPDF_TextObject::Item item = prev_obj.m_pTextObj->GetItemInfo(nItem - 1);
774 float prev_width =
775 GetCharWidth(item.m_CharCode, prev_obj.m_pTextObj->GetFont().Get()) *
776 prev_obj.m_pTextObj->GetFontSize() / 1000;
777
778 CFX_Matrix prev_matrix =
779 prev_obj.m_pTextObj->GetTextMatrix() * prev_obj.m_formMatrix;
780 prev_width = prev_matrix.TransformDistance(fabs(prev_width));
781 item = pTextObj->GetItemInfo(0);
782 float this_width = GetCharWidth(item.m_CharCode, pTextObj->GetFont().Get()) *
783 pTextObj->GetFontSize() / 1000;
784 this_width = fabs(this_width);
785
786 CFX_Matrix this_matrix = pTextObj->GetTextMatrix() * formMatrix;
787 this_width = this_matrix.TransformDistance(fabs(this_width));
788
789 float threshold = std::max(prev_width, this_width) / 4;
790 CFX_PointF prev_pos = m_DisplayMatrix.Transform(
791 prev_obj.m_formMatrix.Transform(prev_obj.m_pTextObj->GetPos()));
792 CFX_PointF this_pos =
793 m_DisplayMatrix.Transform(formMatrix.Transform(pTextObj->GetPos()));
794 if (fabs(this_pos.y - prev_pos.y) > threshold * 2) {
795 for (size_t i = 0; i < count; ++i)
796 ProcessTextObject(mTextObjects[i]);
797 mTextObjects.clear();
798 mTextObjects.push_back(new_obj);
799 return;
800 }
801
802 for (size_t i = count; i > 0; --i) {
803 TransformedTextObject prev_text_obj = mTextObjects[i - 1];
804 CFX_PointF new_prev_pos =
805 m_DisplayMatrix.Transform(prev_text_obj.m_formMatrix.Transform(
806 prev_text_obj.m_pTextObj->GetPos()));
807 if (this_pos.x >= new_prev_pos.x) {
808 mTextObjects.insert(mTextObjects.begin() + i, new_obj);
809 return;
810 }
811 }
812 mTextObjects.insert(mTextObjects.begin(), new_obj);
813 }
814
PreMarkedContent(const CPDF_TextObject * pTextObj)815 CPDF_TextPage::MarkedContentState CPDF_TextPage::PreMarkedContent(
816 const CPDF_TextObject* pTextObj) {
817 const CPDF_ContentMarks* pMarks = pTextObj->GetContentMarks();
818 const size_t nContentMarks = pMarks->CountItems();
819 if (nContentMarks == 0)
820 return MarkedContentState::kPass;
821
822 WideString actText;
823 bool bExist = false;
824 RetainPtr<const CPDF_Dictionary> pDict;
825 for (size_t i = 0; i < nContentMarks; ++i) {
826 const CPDF_ContentMarkItem* item = pMarks->GetItem(i);
827 pDict = item->GetParam();
828 if (!pDict)
829 continue;
830 RetainPtr<const CPDF_String> temp = pDict->GetStringFor("ActualText");
831 if (temp) {
832 bExist = true;
833 actText = temp->GetUnicodeText();
834 }
835 }
836 if (!bExist)
837 return MarkedContentState::kPass;
838
839 if (m_pPrevTextObj) {
840 const CPDF_ContentMarks* pPrevMarks = m_pPrevTextObj->GetContentMarks();
841 if (pPrevMarks->CountItems() == nContentMarks &&
842 pPrevMarks->GetItem(nContentMarks - 1)->GetParam() == pDict) {
843 return MarkedContentState::kDone;
844 }
845 }
846
847 if (actText.IsEmpty())
848 return MarkedContentState::kPass;
849
850 RetainPtr<CPDF_Font> pFont = pTextObj->GetFont();
851 bExist = false;
852 for (size_t i = 0; i < actText.GetLength(); ++i) {
853 if (pFont->CharCodeFromUnicode(actText[i]) != CPDF_Font::kInvalidCharCode) {
854 bExist = true;
855 break;
856 }
857 }
858 if (!bExist)
859 return MarkedContentState::kPass;
860
861 bExist = false;
862 for (size_t i = 0; i < actText.GetLength(); ++i) {
863 wchar_t wChar = actText[i];
864 if ((wChar > 0x80 && wChar < 0xFFFD) || (wChar <= 0x80 && isprint(wChar))) {
865 bExist = true;
866 break;
867 }
868 }
869 if (!bExist)
870 return MarkedContentState::kDone;
871
872 return MarkedContentState::kDelay;
873 }
874
ProcessMarkedContent(const TransformedTextObject & obj)875 void CPDF_TextPage::ProcessMarkedContent(const TransformedTextObject& obj) {
876 const CPDF_TextObject* pTextObj = obj.m_pTextObj;
877 const CPDF_ContentMarks* pMarks = pTextObj->GetContentMarks();
878 const size_t nContentMarks = pMarks->CountItems();
879 WideString actText;
880 for (size_t n = 0; n < nContentMarks; ++n) {
881 const CPDF_ContentMarkItem* item = pMarks->GetItem(n);
882 RetainPtr<const CPDF_Dictionary> pDict = item->GetParam();
883 if (pDict)
884 actText = pDict->GetUnicodeTextFor("ActualText");
885 }
886 if (actText.IsEmpty())
887 return;
888
889 RetainPtr<CPDF_Font> pFont = pTextObj->GetFont();
890 CFX_Matrix matrix = pTextObj->GetTextMatrix() * obj.m_formMatrix;
891
892 for (size_t k = 0; k < actText.GetLength(); ++k) {
893 wchar_t wChar = actText[k];
894 if (wChar <= 0x80 && !isprint(wChar))
895 wChar = 0x20;
896 if (wChar >= 0xFFFD)
897 continue;
898
899 CharInfo charinfo;
900 charinfo.m_Origin = pTextObj->GetPos();
901 charinfo.m_Index = m_TextBuf.GetLength();
902 charinfo.m_Unicode = wChar;
903 charinfo.m_CharCode = pFont->CharCodeFromUnicode(wChar);
904 charinfo.m_CharType = CPDF_TextPage::CharType::kPiece;
905 charinfo.m_pTextObj = pTextObj;
906 charinfo.m_CharBox = pTextObj->GetRect();
907 charinfo.m_Matrix = matrix;
908 m_TempTextBuf.AppendChar(wChar);
909 m_TempCharList.push_back(charinfo);
910 }
911 }
912
FindPreviousTextObject()913 void CPDF_TextPage::FindPreviousTextObject() {
914 const CharInfo* pPrevCharInfo = GetPrevCharInfo();
915 if (!pPrevCharInfo)
916 return;
917
918 if (pPrevCharInfo->m_pTextObj)
919 m_pPrevTextObj = pPrevCharInfo->m_pTextObj;
920 }
921
SwapTempTextBuf(size_t iCharListStartAppend,size_t iBufStartAppend)922 void CPDF_TextPage::SwapTempTextBuf(size_t iCharListStartAppend,
923 size_t iBufStartAppend) {
924 DCHECK(!m_TempCharList.empty());
925 if (iCharListStartAppend < m_TempCharList.size()) {
926 auto fwd = m_TempCharList.begin() + iCharListStartAppend;
927 auto rev = m_TempCharList.end() - 1;
928 for (; fwd < rev; ++fwd, --rev) {
929 std::swap(*fwd, *rev);
930 std::swap(fwd->m_Index, rev->m_Index);
931 }
932 }
933 pdfium::span<wchar_t> temp_span = m_TempTextBuf.GetWideSpan();
934 DCHECK(!temp_span.empty());
935 if (iBufStartAppend < temp_span.size()) {
936 std::reverse(temp_span.begin() + iBufStartAppend, temp_span.end());
937 }
938 }
939
ProcessTextObject(const TransformedTextObject & obj)940 void CPDF_TextPage::ProcessTextObject(const TransformedTextObject& obj) {
941 const CPDF_TextObject* pTextObj = obj.m_pTextObj;
942 if (fabs(pTextObj->GetRect().Width()) < kSizeEpsilon)
943 return;
944
945 CFX_Matrix form_matrix = obj.m_formMatrix;
946 RetainPtr<CPDF_Font> pFont = pTextObj->GetFont();
947 CFX_Matrix matrix = pTextObj->GetTextMatrix() * form_matrix;
948 MarkedContentState ePreMKC = PreMarkedContent(obj.m_pTextObj);
949 if (ePreMKC == MarkedContentState::kDone) {
950 m_pPrevTextObj = pTextObj;
951 m_PrevMatrix = form_matrix;
952 return;
953 }
954 GenerateCharacter result = GenerateCharacter::kNone;
955 if (m_pPrevTextObj) {
956 result = ProcessInsertObject(pTextObj, form_matrix);
957 if (result == GenerateCharacter::kLineBreak)
958 m_CurlineRect = pTextObj->GetRect();
959 else
960 m_CurlineRect.Union(obj.m_pTextObj->GetRect());
961
962 switch (result) {
963 case GenerateCharacter::kNone:
964 break;
965 case GenerateCharacter::kSpace: {
966 absl::optional<CharInfo> pGenerateChar = GenerateCharInfo(L' ');
967 if (pGenerateChar.has_value()) {
968 if (!form_matrix.IsIdentity())
969 pGenerateChar->m_Matrix = form_matrix;
970 m_TempTextBuf.AppendChar(L' ');
971 m_TempCharList.push_back(pGenerateChar.value());
972 }
973 break;
974 }
975 case GenerateCharacter::kLineBreak:
976 CloseTempLine();
977 if (m_TextBuf.GetSize()) {
978 AppendGeneratedCharacter(L'\r', form_matrix);
979 AppendGeneratedCharacter(L'\n', form_matrix);
980 }
981 break;
982 case GenerateCharacter::kHyphen:
983 if (pTextObj->CountChars() == 1) {
984 CPDF_TextObject::Item item = pTextObj->GetCharInfo(0);
985 WideString wstrItem =
986 pTextObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
987 if (wstrItem.IsEmpty())
988 wstrItem += (wchar_t)item.m_CharCode;
989 wchar_t curChar = wstrItem[0];
990 if (IsHyphenCode(curChar))
991 return;
992 }
993 while (m_TempTextBuf.GetSize() > 0 &&
994 m_TempTextBuf.AsStringView().Back() == 0x20) {
995 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
996 m_TempCharList.pop_back();
997 }
998 CharInfo* charinfo = &m_TempCharList.back();
999 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1000 charinfo->m_Unicode = 0x2;
1001 charinfo->m_CharType = CPDF_TextPage::CharType::kHyphen;
1002 m_TempTextBuf.AppendChar(0xfffe);
1003 break;
1004 }
1005 } else {
1006 m_CurlineRect = pTextObj->GetRect();
1007 }
1008
1009 if (ePreMKC == MarkedContentState::kDelay) {
1010 ProcessMarkedContent(obj);
1011 m_pPrevTextObj = pTextObj;
1012 m_PrevMatrix = form_matrix;
1013 return;
1014 }
1015 m_pPrevTextObj = pTextObj;
1016 m_PrevMatrix = form_matrix;
1017 float baseSpace = CalculateBaseSpace(pTextObj, matrix);
1018
1019 const bool bR2L = IsRightToLeft(*pTextObj, *pFont);
1020 const bool bIsBidiAndMirrorInverse =
1021 bR2L && (matrix.a * matrix.d - matrix.b * matrix.c) < 0;
1022 const size_t iBufStartAppend = m_TempTextBuf.GetLength();
1023 const size_t iCharListStartAppend = m_TempCharList.size();
1024
1025 float spacing = 0;
1026 const size_t nItems = pTextObj->CountItems();
1027 for (size_t i = 0; i < nItems; ++i) {
1028 CharInfo charinfo;
1029 CPDF_TextObject::Item item = pTextObj->GetItemInfo(i);
1030 if (item.m_CharCode == 0xffffffff) {
1031 WideString str = m_TempTextBuf.MakeString();
1032 if (str.IsEmpty())
1033 str = m_TextBuf.AsStringView();
1034 if (str.IsEmpty() || str.Back() == L' ')
1035 continue;
1036
1037 float fontsize_h = pTextObj->m_TextState.GetFontSizeH();
1038 spacing = -fontsize_h * item.m_Origin.x / 1000;
1039 continue;
1040 }
1041 float charSpace = pTextObj->m_TextState.GetCharSpace();
1042 if (charSpace > 0.001)
1043 spacing += matrix.TransformDistance(charSpace);
1044 else if (charSpace < -0.001)
1045 spacing -= matrix.TransformDistance(fabs(charSpace));
1046 spacing -= baseSpace;
1047 if (spacing && i > 0) {
1048 float fontsize_h = pTextObj->m_TextState.GetFontSizeH();
1049 uint32_t space_charcode = pFont->CharCodeFromUnicode(' ');
1050 float threshold = 0;
1051 if (space_charcode != CPDF_Font::kInvalidCharCode)
1052 threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
1053 if (threshold > fontsize_h / 3)
1054 threshold = 0;
1055 else
1056 threshold /= 2;
1057 if (threshold == 0) {
1058 threshold = GetCharWidth(item.m_CharCode, pFont.Get());
1059 threshold = NormalizeThreshold(threshold, 300, 500, 700);
1060 threshold = fontsize_h * threshold / 1000;
1061 }
1062 if (threshold && (spacing && spacing >= threshold)) {
1063 charinfo.m_Unicode = L' ';
1064 charinfo.m_CharType = CPDF_TextPage::CharType::kGenerated;
1065 charinfo.m_pTextObj = pTextObj;
1066 charinfo.m_Index = m_TextBuf.GetLength();
1067 m_TempTextBuf.AppendChar(L' ');
1068 charinfo.m_CharCode = CPDF_Font::kInvalidCharCode;
1069 charinfo.m_Matrix = form_matrix;
1070 charinfo.m_Origin = matrix.Transform(item.m_Origin);
1071 charinfo.m_CharBox =
1072 CFX_FloatRect(charinfo.m_Origin.x, charinfo.m_Origin.y,
1073 charinfo.m_Origin.x, charinfo.m_Origin.y);
1074 m_TempCharList.push_back(charinfo);
1075 }
1076 if (item.m_CharCode == CPDF_Font::kInvalidCharCode)
1077 continue;
1078 }
1079 spacing = 0;
1080 WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode);
1081 bool bNoUnicode = false;
1082 if (wstrItem.IsEmpty() && item.m_CharCode) {
1083 wstrItem += static_cast<wchar_t>(item.m_CharCode);
1084 bNoUnicode = true;
1085 }
1086 charinfo.m_Index = -1;
1087 charinfo.m_CharCode = item.m_CharCode;
1088 charinfo.m_CharType = bNoUnicode ? CPDF_TextPage::CharType::kNotUnicode
1089 : CPDF_TextPage::CharType::kNormal;
1090 charinfo.m_pTextObj = pTextObj;
1091 charinfo.m_Origin = matrix.Transform(item.m_Origin);
1092
1093 const FX_RECT rect =
1094 charinfo.m_pTextObj->GetFont()->GetCharBBox(charinfo.m_CharCode);
1095 const float fFontSize = pTextObj->GetFontSize() / 1000;
1096 charinfo.m_CharBox.top = rect.top * fFontSize + item.m_Origin.y;
1097 charinfo.m_CharBox.left = rect.left * fFontSize + item.m_Origin.x;
1098 charinfo.m_CharBox.right = rect.right * fFontSize + item.m_Origin.x;
1099 charinfo.m_CharBox.bottom = rect.bottom * fFontSize + item.m_Origin.y;
1100 if (fabsf(charinfo.m_CharBox.top - charinfo.m_CharBox.bottom) <
1101 kSizeEpsilon) {
1102 charinfo.m_CharBox.top = charinfo.m_CharBox.bottom + fFontSize;
1103 }
1104 if (fabsf(charinfo.m_CharBox.right - charinfo.m_CharBox.left) <
1105 kSizeEpsilon) {
1106 charinfo.m_CharBox.right =
1107 charinfo.m_CharBox.left + pTextObj->GetCharWidth(charinfo.m_CharCode);
1108 }
1109 charinfo.m_CharBox = matrix.TransformRect(charinfo.m_CharBox);
1110 charinfo.m_Matrix = matrix;
1111 if (wstrItem.IsEmpty()) {
1112 charinfo.m_Unicode = 0;
1113 m_TempCharList.push_back(charinfo);
1114 m_TempTextBuf.AppendChar(0xfffe);
1115 continue;
1116 }
1117 size_t nTotal = wstrItem.GetLength();
1118 bool bDel = false;
1119 const int count = std::min(fxcrt::CollectionSize<int>(m_TempCharList), 7);
1120 constexpr float kTextCharRatioGapDelta = 0.07f;
1121 float threshold = charinfo.m_Matrix.TransformXDistance(
1122 kTextCharRatioGapDelta * pTextObj->GetFontSize());
1123 for (int n = fxcrt::CollectionSize<int>(m_TempCharList);
1124 n > fxcrt::CollectionSize<int>(m_TempCharList) - count; --n) {
1125 const CharInfo& charinfo1 = m_TempCharList[n - 1];
1126 CFX_PointF diff = charinfo1.m_Origin - charinfo.m_Origin;
1127 if (charinfo1.m_CharCode == charinfo.m_CharCode &&
1128 charinfo1.m_pTextObj->GetFont() == charinfo.m_pTextObj->GetFont() &&
1129 fabs(diff.x) < threshold && fabs(diff.y) < threshold) {
1130 bDel = true;
1131 break;
1132 }
1133 }
1134 if (!bDel) {
1135 for (size_t nIndex = 0; nIndex < nTotal; ++nIndex) {
1136 charinfo.m_Unicode = wstrItem[nIndex];
1137 if (charinfo.m_Unicode) {
1138 charinfo.m_Index = m_TextBuf.GetLength();
1139 m_TempTextBuf.AppendChar(charinfo.m_Unicode);
1140 } else {
1141 m_TempTextBuf.AppendChar(0xfffe);
1142 }
1143 m_TempCharList.push_back(charinfo);
1144 }
1145 } else if (i == 0) {
1146 WideString str = m_TempTextBuf.MakeString();
1147 if (!str.IsEmpty() && str.Back() == L' ') {
1148 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1149 m_TempCharList.pop_back();
1150 }
1151 }
1152 }
1153 if (bIsBidiAndMirrorInverse)
1154 SwapTempTextBuf(iCharListStartAppend, iBufStartAppend);
1155 }
1156
GetTextObjectWritingMode(const CPDF_TextObject * pTextObj) const1157 CPDF_TextPage::TextOrientation CPDF_TextPage::GetTextObjectWritingMode(
1158 const CPDF_TextObject* pTextObj) const {
1159 size_t nChars = pTextObj->CountChars();
1160 if (nChars <= 1)
1161 return m_TextlineDir;
1162
1163 CPDF_TextObject::Item first = pTextObj->GetCharInfo(0);
1164 CPDF_TextObject::Item last = pTextObj->GetCharInfo(nChars - 1);
1165 CFX_Matrix textMatrix = pTextObj->GetTextMatrix();
1166 first.m_Origin = textMatrix.Transform(first.m_Origin);
1167 last.m_Origin = textMatrix.Transform(last.m_Origin);
1168
1169 static constexpr float kEpsilon = 0.0001f;
1170 float dX = fabs(last.m_Origin.x - first.m_Origin.x);
1171 float dY = fabs(last.m_Origin.y - first.m_Origin.y);
1172 if (dX <= kEpsilon && dY <= kEpsilon)
1173 return TextOrientation::kUnknown;
1174
1175 static constexpr float kThreshold = 0.0872f;
1176 CFX_VectorF v(dX, dY);
1177 v.Normalize();
1178 bool bXUnderThreshold = v.x <= kThreshold;
1179 if (v.y <= kThreshold)
1180 return bXUnderThreshold ? m_TextlineDir : TextOrientation::kHorizontal;
1181 return bXUnderThreshold ? TextOrientation::kVertical : m_TextlineDir;
1182 }
1183
IsHyphen(wchar_t curChar) const1184 bool CPDF_TextPage::IsHyphen(wchar_t curChar) const {
1185 WideStringView curText = m_TempTextBuf.AsStringView();
1186 if (curText.IsEmpty())
1187 curText = m_TextBuf.AsStringView();
1188
1189 if (curText.IsEmpty())
1190 return false;
1191
1192 auto iter = curText.rbegin();
1193 for (; (iter + 1) != curText.rend() && *iter == 0x20; ++iter) {
1194 // Do nothing
1195 }
1196
1197 if (!IsHyphenCode(*iter))
1198 return false;
1199
1200 if ((iter + 1) != curText.rend()) {
1201 iter++;
1202 if (FXSYS_iswalpha(*iter) && FXSYS_iswalnum(curChar))
1203 return true;
1204 }
1205
1206 const CharInfo* pPrevCharInfo = GetPrevCharInfo();
1207 return pPrevCharInfo &&
1208 pPrevCharInfo->m_CharType == CPDF_TextPage::CharType::kPiece &&
1209 IsHyphenCode(pPrevCharInfo->m_Unicode);
1210 }
1211
GetPrevCharInfo() const1212 const CPDF_TextPage::CharInfo* CPDF_TextPage::GetPrevCharInfo() const {
1213 if (!m_TempCharList.empty())
1214 return &m_TempCharList.back();
1215 return !m_CharList.empty() ? &m_CharList.back() : nullptr;
1216 }
1217
ProcessInsertObject(const CPDF_TextObject * pObj,const CFX_Matrix & formMatrix)1218 CPDF_TextPage::GenerateCharacter CPDF_TextPage::ProcessInsertObject(
1219 const CPDF_TextObject* pObj,
1220 const CFX_Matrix& formMatrix) {
1221 FindPreviousTextObject();
1222 TextOrientation WritingMode = GetTextObjectWritingMode(pObj);
1223 if (WritingMode == TextOrientation::kUnknown)
1224 WritingMode = GetTextObjectWritingMode(m_pPrevTextObj);
1225
1226 size_t nItem = m_pPrevTextObj->CountItems();
1227 if (nItem == 0)
1228 return GenerateCharacter::kNone;
1229
1230 CPDF_TextObject::Item PrevItem = m_pPrevTextObj->GetItemInfo(nItem - 1);
1231 CPDF_TextObject::Item item = pObj->GetItemInfo(0);
1232 const CFX_FloatRect& this_rect = pObj->GetRect();
1233 const CFX_FloatRect& prev_rect = m_pPrevTextObj->GetRect();
1234 WideString wstrItem = pObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
1235 if (wstrItem.IsEmpty())
1236 wstrItem += static_cast<wchar_t>(item.m_CharCode);
1237
1238 wchar_t curChar = wstrItem[0];
1239 if (WritingMode == TextOrientation::kHorizontal) {
1240 if (EndHorizontalLine(this_rect, prev_rect)) {
1241 return IsHyphen(curChar) ? GenerateCharacter::kHyphen
1242 : GenerateCharacter::kLineBreak;
1243 }
1244 } else if (WritingMode == TextOrientation::kVertical) {
1245 if (EndVerticalLine(this_rect, prev_rect, m_CurlineRect,
1246 pObj->GetFontSize(), m_pPrevTextObj->GetFontSize())) {
1247 return IsHyphen(curChar) ? GenerateCharacter::kHyphen
1248 : GenerateCharacter::kLineBreak;
1249 }
1250 }
1251
1252 float last_pos = PrevItem.m_Origin.x;
1253 int nLastWidth =
1254 GetCharWidth(PrevItem.m_CharCode, m_pPrevTextObj->GetFont().Get());
1255 float last_width = nLastWidth * m_pPrevTextObj->GetFontSize() / 1000;
1256 last_width = fabs(last_width);
1257 int nThisWidth = GetCharWidth(item.m_CharCode, pObj->GetFont().Get());
1258 float this_width = fabs(nThisWidth * pObj->GetFontSize() / 1000);
1259 float threshold = std::max(last_width, this_width) / 4;
1260
1261 CFX_Matrix prev_matrix = m_pPrevTextObj->GetTextMatrix() * m_PrevMatrix;
1262 CFX_Matrix prev_reverse = prev_matrix.GetInverse();
1263
1264 CFX_PointF pos = prev_reverse.Transform(formMatrix.Transform(pObj->GetPos()));
1265 if (last_width < this_width)
1266 threshold = prev_reverse.TransformDistance(threshold);
1267
1268 bool bNewline = false;
1269 if (WritingMode == TextOrientation::kHorizontal) {
1270 CFX_FloatRect rect = m_pPrevTextObj->GetRect();
1271 float rect_height = rect.Height();
1272 rect.Normalize();
1273 if ((rect.IsEmpty() && rect_height > 5) ||
1274 ((pos.y > threshold * 2 || pos.y < threshold * -3) &&
1275 (fabs(pos.y) >= 1 || fabs(pos.y) > fabs(pos.x)))) {
1276 bNewline = true;
1277 if (nItem > 1) {
1278 CPDF_TextObject::Item tempItem = m_pPrevTextObj->GetItemInfo(0);
1279 CFX_Matrix m = m_pPrevTextObj->GetTextMatrix();
1280 if (PrevItem.m_Origin.x > tempItem.m_Origin.x &&
1281 m_DisplayMatrix.a > 0.9 && m_DisplayMatrix.b < 0.1 &&
1282 m_DisplayMatrix.c < 0.1 && m_DisplayMatrix.d < -0.9 && m.b < 0.1 &&
1283 m.c < 0.1) {
1284 CFX_FloatRect re(0, m_pPrevTextObj->GetRect().bottom, 1000,
1285 m_pPrevTextObj->GetRect().top);
1286 if (re.Contains(pObj->GetPos())) {
1287 bNewline = false;
1288 } else {
1289 if (CFX_FloatRect(0, pObj->GetRect().bottom, 1000,
1290 pObj->GetRect().top)
1291 .Contains(m_pPrevTextObj->GetPos())) {
1292 bNewline = false;
1293 }
1294 }
1295 }
1296 }
1297 }
1298 }
1299 if (bNewline) {
1300 return IsHyphen(curChar) ? GenerateCharacter::kHyphen
1301 : GenerateCharacter::kLineBreak;
1302 }
1303
1304 if (pObj->CountChars() == 1 && IsHyphenCode(curChar) && IsHyphen(curChar))
1305 return GenerateCharacter::kHyphen;
1306
1307 if (curChar == L' ')
1308 return GenerateCharacter::kNone;
1309
1310 WideString PrevStr =
1311 m_pPrevTextObj->GetFont()->UnicodeFromCharCode(PrevItem.m_CharCode);
1312 wchar_t preChar = PrevStr.Back();
1313 if (preChar == L' ')
1314 return GenerateCharacter::kNone;
1315
1316 CFX_Matrix matrix = pObj->GetTextMatrix() * formMatrix;
1317 float threshold2 = std::max(nLastWidth, nThisWidth);
1318 threshold2 = NormalizeThreshold(threshold2, 400, 700, 800);
1319 if (nLastWidth >= nThisWidth) {
1320 threshold2 *= fabs(m_pPrevTextObj->GetFontSize());
1321 } else {
1322 threshold2 *= fabs(pObj->GetFontSize());
1323 threshold2 = matrix.TransformDistance(threshold2);
1324 threshold2 = prev_reverse.TransformDistance(threshold2);
1325 }
1326 threshold2 /= 1000;
1327 if ((threshold2 < 1.4881 && threshold2 > 1.4879) ||
1328 (threshold2 < 1.39001 && threshold2 > 1.38999)) {
1329 threshold2 *= 1.5;
1330 }
1331 return GenerateSpace(pos, last_pos, this_width, last_width, threshold2)
1332 ? GenerateCharacter::kSpace
1333 : GenerateCharacter::kNone;
1334 }
1335
IsSameTextObject(CPDF_TextObject * pTextObj1,CPDF_TextObject * pTextObj2) const1336 bool CPDF_TextPage::IsSameTextObject(CPDF_TextObject* pTextObj1,
1337 CPDF_TextObject* pTextObj2) const {
1338 if (!pTextObj1 || !pTextObj2)
1339 return false;
1340
1341 CFX_FloatRect rcPreObj = pTextObj2->GetRect();
1342 const CFX_FloatRect& rcCurObj = pTextObj1->GetRect();
1343 if (rcPreObj.IsEmpty() && rcCurObj.IsEmpty()) {
1344 float dbXdif = fabs(rcPreObj.left - rcCurObj.left);
1345 size_t nCount = m_CharList.size();
1346 if (nCount >= 2) {
1347 float dbSpace = m_CharList[nCount - 2].m_CharBox.Width();
1348 if (dbXdif > dbSpace)
1349 return false;
1350 }
1351 }
1352 if (!rcPreObj.IsEmpty() || !rcCurObj.IsEmpty()) {
1353 rcPreObj.Intersect(rcCurObj);
1354 if (rcPreObj.IsEmpty())
1355 return false;
1356 if (fabs(rcPreObj.Width() - rcCurObj.Width()) > rcCurObj.Width() / 2) {
1357 return false;
1358 }
1359 if (pTextObj2->GetFontSize() != pTextObj1->GetFontSize())
1360 return false;
1361 }
1362
1363 size_t nPreCount = pTextObj2->CountItems();
1364 if (nPreCount != pTextObj1->CountItems())
1365 return false;
1366
1367 // If both objects have no items, consider them same.
1368 if (nPreCount == 0)
1369 return true;
1370
1371 CPDF_TextObject::Item itemPer;
1372 CPDF_TextObject::Item itemCur;
1373 for (size_t i = 0; i < nPreCount; ++i) {
1374 itemPer = pTextObj2->GetItemInfo(i);
1375 itemCur = pTextObj1->GetItemInfo(i);
1376 if (itemCur.m_CharCode != itemPer.m_CharCode)
1377 return false;
1378 }
1379
1380 CFX_PointF diff = pTextObj1->GetPos() - pTextObj2->GetPos();
1381 float font_size = pTextObj2->GetFontSize();
1382 float char_size =
1383 GetCharWidth(itemPer.m_CharCode, pTextObj2->GetFont().Get());
1384 float max_pre_size =
1385 std::max(std::max(rcPreObj.Height(), rcPreObj.Width()), font_size);
1386 return fabs(diff.x) <= 0.9 * char_size * font_size / 1000 &&
1387 fabs(diff.y) <= max_pre_size / 8;
1388 }
1389
IsSameAsPreTextObject(CPDF_TextObject * pTextObj,const CPDF_PageObjectHolder * pObjList,CPDF_PageObjectHolder::const_iterator iter) const1390 bool CPDF_TextPage::IsSameAsPreTextObject(
1391 CPDF_TextObject* pTextObj,
1392 const CPDF_PageObjectHolder* pObjList,
1393 CPDF_PageObjectHolder::const_iterator iter) const {
1394 int i = 0;
1395 while (i < 5 && iter != pObjList->begin()) {
1396 --iter;
1397 CPDF_PageObject* pOtherObj = iter->get();
1398 if (pOtherObj == pTextObj || !pOtherObj->IsText())
1399 continue;
1400 if (IsSameTextObject(pOtherObj->AsText(), pTextObj))
1401 return true;
1402 ++i;
1403 }
1404 return false;
1405 }
1406
GenerateCharInfo(wchar_t unicode)1407 absl::optional<CPDF_TextPage::CharInfo> CPDF_TextPage::GenerateCharInfo(
1408 wchar_t unicode) {
1409 const CharInfo* pPrevCharInfo = GetPrevCharInfo();
1410 if (!pPrevCharInfo)
1411 return absl::nullopt;
1412
1413 CharInfo info;
1414 info.m_Index = m_TextBuf.GetLength();
1415 info.m_CharCode = CPDF_Font::kInvalidCharCode;
1416 info.m_Unicode = unicode;
1417 info.m_CharType = CPDF_TextPage::CharType::kGenerated;
1418
1419 int preWidth = 0;
1420 if (pPrevCharInfo->m_pTextObj &&
1421 pPrevCharInfo->m_CharCode != CPDF_Font::kInvalidCharCode) {
1422 preWidth = GetCharWidth(pPrevCharInfo->m_CharCode,
1423 pPrevCharInfo->m_pTextObj->GetFont().Get());
1424 }
1425
1426 float fFontSize = pPrevCharInfo->m_pTextObj
1427 ? pPrevCharInfo->m_pTextObj->GetFontSize()
1428 : pPrevCharInfo->m_CharBox.Height();
1429 if (!fFontSize)
1430 fFontSize = kDefaultFontSize;
1431
1432 info.m_Origin =
1433 CFX_PointF(pPrevCharInfo->m_Origin.x + preWidth * (fFontSize) / 1000,
1434 pPrevCharInfo->m_Origin.y);
1435 info.m_CharBox = CFX_FloatRect(info.m_Origin.x, info.m_Origin.y,
1436 info.m_Origin.x, info.m_Origin.y);
1437 return info;
1438 }
1439