xref: /aosp_15_r20/external/pdfium/core/fpdftext/cpdf_textpage.h (revision 3ac0a46f773bac49fa9476ec2b1cf3f8da5ec3a4)
1 // Copyright 2016 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #ifndef CORE_FPDFTEXT_CPDF_TEXTPAGE_H_
8 #define CORE_FPDFTEXT_CPDF_TEXTPAGE_H_
9 
10 #include <stdint.h>
11 
12 #include <deque>
13 #include <functional>
14 #include <vector>
15 
16 #include "core/fpdfapi/page/cpdf_pageobjectholder.h"
17 #include "core/fxcrt/data_vector.h"
18 #include "core/fxcrt/fx_coordinates.h"
19 #include "core/fxcrt/fx_memory_wrappers.h"
20 #include "core/fxcrt/unowned_ptr.h"
21 #include "core/fxcrt/widestring.h"
22 #include "core/fxcrt/widetext_buffer.h"
23 #include "third_party/abseil-cpp/absl/types/optional.h"
24 
// Forward declarations. Within this header these types are only used behind
// raw pointers or as UnownedPtr<> template arguments, so they are declared
// here rather than pulled in through their own headers.
25 class CPDF_FormObject;
26 class CPDF_Page;
27 class CPDF_TextObject;
28 
// Plain (index, count) pair of ints. CPDF_TextPage stores a
// DataVector<TextPageCharSegment> in its |m_CharIndices| member.
// NOTE(review): presumably each entry describes a run of |count| characters
// starting at character index |index| -- confirm against the implementation.
29 struct TextPageCharSegment {
30   int index;
31   int count;
32 };
33 
// NOTE(review): this macro is not defined in this file (likely provided by
// core/fxcrt/fx_memory_wrappers.h, included above). Presumably it whitelists
// TextPageCharSegment as an element type for data-partition containers such as
// the DataVector<TextPageCharSegment> member of CPDF_TextPage -- confirm.
34 FX_DATA_PARTITION_EXCEPTION(TextPageCharSegment);
35 
// CPDF_TextPage is constructed from a CPDF_Page plus an |rtl| flag and exposes
// the page's characters (as CharInfo records) and its text through queries
// keyed by index, rectangle, point, or text object. Only declarations are
// visible in this header; any behavior that is not evident from a signature is
// flagged below as an assumption to verify against the implementation file.
36 class CPDF_TextPage {
37  public:
     // One-byte classification tag stored in every CharInfo.
     // NOTE(review): the meaning of each value is defined by the code that
     // assigns it, which is not in this header. Presumably kGenerated marks
     // characters synthesized during extraction (cf. GenerateCharInfo() and
     // AppendGeneratedCharacter() below) -- confirm.
38   enum class CharType : uint8_t {
39     kNormal,
40     kGenerated,
41     kNotUnicode,
42     kHyphen,
43     kPiece,
44   };
45 
     // Per-character record held in |m_CharList| / |m_TempCharList| and
     // returned by GetCharInfo(). The default constructor, copy constructor
     // and destructor are declared here without inline bodies.
46   class CharInfo {
47    public:
48     CharInfo();
49     CharInfo(const CharInfo&);
50     ~CharInfo();
51 
       // All scalar fields default to zero / CharType::kNormal.
       // |m_pTextObj| is a non-owning pointer, so the referenced text object
       // must be kept alive by someone else.
       // NOTE(review): the exact meaning of |m_Index|, and the coordinate
       // space of |m_Origin|, |m_CharBox| and |m_Matrix|, are not visible in
       // this header -- confirm before relying on them.
52     int m_Index = 0;
53     uint32_t m_CharCode = 0;
54     wchar_t m_Unicode = 0;
55     CharType m_CharType = CharType::kNormal;
56     CFX_PointF m_Origin;
57     CFX_FloatRect m_CharBox;
58     UnownedPtr<const CPDF_TextObject> m_pTextObj;
59     CFX_Matrix m_Matrix;
60   };
61 
     // |pPage| is the page to extract from and |rtl| selects right-to-left
     // handling. The class keeps a non-owning |m_pPage| and a const |m_rtl|;
     // presumably they are initialized from these arguments, in which case
     // the page must outlive this object -- confirm.
62   CPDF_TextPage(const CPDF_Page* pPage, bool rtl);
63   ~CPDF_TextPage();
64 
     // Conversions between a "text index" and a "char index".
     // NOTE(review): presumably text indices address the extracted text
     // buffer and char indices address |m_CharList|, mapped through
     // |m_CharIndices|; the result for an out-of-range argument is not
     // visible here -- confirm.
65   int CharIndexFromTextIndex(int text_index) const;
66   int TextIndexFromCharIndex(int char_index) const;
     // size() is the number of entries in |m_CharList|. CountChars() is the
     // int-typed count that GetAllPageText() passes to GetPageText().
     // NOTE(review): whether the two always agree is not visible here.
size()67   size_t size() const { return m_CharList.size(); }
68   int CountChars() const;
69 
70   // These methods CHECK() to make sure |index| is within bounds.
71   const CharInfo& GetCharInfo(size_t index) const;
72   float GetCharFontSize(size_t index) const;
73   CFX_FloatRect GetCharLooseBounds(size_t index) const;
74 
     // Geometry and content queries: rectangles for a (start, count) range,
     // the character index at |point| within |tolerance|, and the text
     // selected by a rectangle or by a specific text object.
     // NOTE(review): the "not found" result of GetIndexAtPos() is not
     // visible in this header -- confirm.
75   std::vector<CFX_FloatRect> GetRectArray(int start, int count) const;
76   int GetIndexAtPos(const CFX_PointF& point, const CFX_SizeF& tolerance) const;
77   WideString GetTextByRect(const CFX_FloatRect& rect) const;
78   WideString GetTextByObject(const CPDF_TextObject* pTextObj) const;
79 
80   // Returns string with the text from |m_TextBuf| that are covered by the input
81   // range. |start| and |count| are in terms of the |m_CharIndices|, so the
82   // range will be converted into appropriate indices.
83   WideString GetPageText(int start, int count) const;
     // Convenience wrapper: the full range [0, CountChars()).
GetAllPageText()84   WideString GetAllPageText() const { return GetPageText(0, CountChars()); }
85 
     // CountRects() is the only non-const public query.
     // NOTE(review): presumably it computes the rectangles for the range
     // into |m_SelRects| and returns how many there are, and GetRect() then
     // copies entry |rectIndex| into |*pRect|, returning false on a bad
     // index -- confirm.
86   int CountRects(int start, int nCount);
87   bool GetRect(int rectIndex, CFX_FloatRect* pRect) const;
88 
89  private:
     // Writing direction of a text object or text line; kUnknown is the
     // initial value of |m_TextlineDir|.
90   enum class TextOrientation {
91     kUnknown,
92     kHorizontal,
93     kVertical,
94   };
95 
     // Result type of ProcessInsertObject().
     // NOTE(review): presumably names the separator character, if any, to
     // synthesize before the next text object -- confirm.
96   enum class GenerateCharacter {
97     kNone,
98     kSpace,
99     kLineBreak,
100     kHyphen,
101   };
102 
      // Result type of PreMarkedContent(); kPass is explicitly 0.
103   enum class MarkedContentState { kPass = 0, kDone, kDelay };
104 
      // Pairs a non-owning text object pointer with a form matrix. Instances
      // are stored in |mTextObjects| and passed to ProcessTextObject() and
      // ProcessMarkedContent().
105   struct TransformedTextObject {
106     TransformedTextObject();
107     TransformedTextObject(const TransformedTextObject& that);
108     ~TransformedTextObject();
109 
110     UnownedPtr<const CPDF_TextObject> m_pTextObj;
111     CFX_Matrix m_formMatrix;
112   };
113 
      // Extraction helpers; all are declared here and defined outside this
      // header. Note the two ProcessTextObject() overloads: one takes an
      // already-paired TransformedTextObject, the other takes the raw text
      // object together with its form matrix, owning object list and
      // position in that list.
114   void Init();
115   bool IsHyphen(wchar_t curChar) const;
116   void ProcessObject();
117   void ProcessFormObject(CPDF_FormObject* pFormObj,
118                          const CFX_Matrix& formMatrix);
119   void ProcessTextObject(const TransformedTextObject& obj);
120   void ProcessTextObject(CPDF_TextObject* pTextObj,
121                          const CFX_Matrix& formMatrix,
122                          const CPDF_PageObjectHolder* pObjList,
123                          CPDF_PageObjectHolder::const_iterator ObjPos);
124   GenerateCharacter ProcessInsertObject(const CPDF_TextObject* pObj,
125                                         const CFX_Matrix& formMatrix);
      // GetPrevCharInfo() returns a pointer, so it can report "no previous
      // character"; GenerateCharInfo() likewise may return no value.
126   const CharInfo* GetPrevCharInfo() const;
127   absl::optional<CharInfo> GenerateCharInfo(wchar_t unicode);
128   bool IsSameAsPreTextObject(CPDF_TextObject* pTextObj,
129                              const CPDF_PageObjectHolder* pObjList,
130                              CPDF_PageObjectHolder::const_iterator iter) const;
131   bool IsSameTextObject(CPDF_TextObject* pTextObj1,
132                         CPDF_TextObject* pTextObj2) const;
133   void CloseTempLine();
134   MarkedContentState PreMarkedContent(const CPDF_TextObject* pTextObj);
135   void ProcessMarkedContent(const TransformedTextObject& obj);
136   void FindPreviousTextObject();
137   void AddCharInfoByLRDirection(wchar_t wChar, const CharInfo& info);
138   void AddCharInfoByRLDirection(wchar_t wChar, const CharInfo& info);
139   TextOrientation GetTextObjectWritingMode(
140       const CPDF_TextObject* pTextObj) const;
141   TextOrientation FindTextlineFlowOrientation() const;
142   void AppendGeneratedCharacter(wchar_t unicode, const CFX_Matrix& formMatrix);
143   void SwapTempTextBuf(size_t iCharListStartAppend, size_t iBufStartAppend);
      // Builds a string from the characters for which |predicate| returns
      // true. NOTE(review): presumably the shared implementation behind
      // GetTextByRect() and GetTextByObject() -- confirm.
144   WideString GetTextByPredicate(
145       const std::function<bool(const CharInfo&)>& predicate) const;
146 
      // Data members. |m_pPage|, |m_rtl| and |m_DisplayMatrix| are const and
      // therefore fixed at construction. |m_pPage| and |m_pPrevTextObj| are
      // non-owning.
      // NOTE(review): the m_Temp* containers presumably hold the line being
      // assembled until CloseTempLine() moves it into |m_CharList| and
      // |m_TextBuf| -- confirm.
      // NOTE(review): |mTextObjects| lacks the "m_" prefix used by every
      // other member; renaming it would also require touching the
      // implementation file.
147   UnownedPtr<const CPDF_Page> const m_pPage;
148   DataVector<TextPageCharSegment> m_CharIndices;
149   std::deque<CharInfo> m_CharList;
150   std::deque<CharInfo> m_TempCharList;
151   WideTextBuffer m_TextBuf;
152   WideTextBuffer m_TempTextBuf;
153   UnownedPtr<const CPDF_TextObject> m_pPrevTextObj;
154   CFX_Matrix m_PrevMatrix;
155   const bool m_rtl;
156   const CFX_Matrix m_DisplayMatrix;
157   std::vector<CFX_FloatRect> m_SelRects;
158   std::vector<TransformedTextObject> mTextObjects;
159   TextOrientation m_TextlineDir = TextOrientation::kUnknown;
160   CFX_FloatRect m_CurlineRect;
161 };
162 
163 #endif  // CORE_FPDFTEXT_CPDF_TEXTPAGE_H_
164