1 // Copyright 2016 The PDFium Authors 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #ifndef CORE_FPDFTEXT_CPDF_TEXTPAGE_H_ 8 #define CORE_FPDFTEXT_CPDF_TEXTPAGE_H_ 9 10 #include <stdint.h> 11 12 #include <deque> 13 #include <functional> 14 #include <vector> 15 16 #include "core/fpdfapi/page/cpdf_pageobjectholder.h" 17 #include "core/fxcrt/data_vector.h" 18 #include "core/fxcrt/fx_coordinates.h" 19 #include "core/fxcrt/fx_memory_wrappers.h" 20 #include "core/fxcrt/unowned_ptr.h" 21 #include "core/fxcrt/widestring.h" 22 #include "core/fxcrt/widetext_buffer.h" 23 #include "third_party/abseil-cpp/absl/types/optional.h" 24 25 class CPDF_FormObject; 26 class CPDF_Page; 27 class CPDF_TextObject; 28 29 struct TextPageCharSegment { 30 int index; 31 int count; 32 }; 33 34 FX_DATA_PARTITION_EXCEPTION(TextPageCharSegment); 35 36 class CPDF_TextPage { 37 public: 38 enum class CharType : uint8_t { 39 kNormal, 40 kGenerated, 41 kNotUnicode, 42 kHyphen, 43 kPiece, 44 }; 45 46 class CharInfo { 47 public: 48 CharInfo(); 49 CharInfo(const CharInfo&); 50 ~CharInfo(); 51 52 int m_Index = 0; 53 uint32_t m_CharCode = 0; 54 wchar_t m_Unicode = 0; 55 CharType m_CharType = CharType::kNormal; 56 CFX_PointF m_Origin; 57 CFX_FloatRect m_CharBox; 58 UnownedPtr<const CPDF_TextObject> m_pTextObj; 59 CFX_Matrix m_Matrix; 60 }; 61 62 CPDF_TextPage(const CPDF_Page* pPage, bool rtl); 63 ~CPDF_TextPage(); 64 65 int CharIndexFromTextIndex(int text_index) const; 66 int TextIndexFromCharIndex(int char_index) const; size()67 size_t size() const { return m_CharList.size(); } 68 int CountChars() const; 69 70 // These methods CHECK() to make sure |index| is within bounds. 71 const CharInfo& GetCharInfo(size_t index) const; 72 float GetCharFontSize(size_t index) const; 73 CFX_FloatRect GetCharLooseBounds(size_t index) const; 74 75 std::vector<CFX_FloatRect> GetRectArray(int start, int count) const; 76 int GetIndexAtPos(const CFX_PointF& point, const CFX_SizeF& tolerance) const; 77 WideString GetTextByRect(const CFX_FloatRect& rect) const; 78 WideString GetTextByObject(const CPDF_TextObject* pTextObj) const; 79 80 // Returns string with the text from |m_TextBuf| that are covered by the input 81 // range. |start| and |count| are in terms of the |m_CharIndices|, so the 82 // range will be converted into appropriate indices. 83 WideString GetPageText(int start, int count) const; GetAllPageText()84 WideString GetAllPageText() const { return GetPageText(0, CountChars()); } 85 86 int CountRects(int start, int nCount); 87 bool GetRect(int rectIndex, CFX_FloatRect* pRect) const; 88 89 private: 90 enum class TextOrientation { 91 kUnknown, 92 kHorizontal, 93 kVertical, 94 }; 95 96 enum class GenerateCharacter { 97 kNone, 98 kSpace, 99 kLineBreak, 100 kHyphen, 101 }; 102 103 enum class MarkedContentState { kPass = 0, kDone, kDelay }; 104 105 struct TransformedTextObject { 106 TransformedTextObject(); 107 TransformedTextObject(const TransformedTextObject& that); 108 ~TransformedTextObject(); 109 110 UnownedPtr<const CPDF_TextObject> m_pTextObj; 111 CFX_Matrix m_formMatrix; 112 }; 113 114 void Init(); 115 bool IsHyphen(wchar_t curChar) const; 116 void ProcessObject(); 117 void ProcessFormObject(CPDF_FormObject* pFormObj, 118 const CFX_Matrix& formMatrix); 119 void ProcessTextObject(const TransformedTextObject& obj); 120 void ProcessTextObject(CPDF_TextObject* pTextObj, 121 const CFX_Matrix& formMatrix, 122 const CPDF_PageObjectHolder* pObjList, 123 CPDF_PageObjectHolder::const_iterator ObjPos); 124 GenerateCharacter ProcessInsertObject(const CPDF_TextObject* pObj, 125 const CFX_Matrix& formMatrix); 126 const CharInfo* GetPrevCharInfo() const; 127 absl::optional<CharInfo> GenerateCharInfo(wchar_t unicode); 128 bool IsSameAsPreTextObject(CPDF_TextObject* pTextObj, 129 const CPDF_PageObjectHolder* pObjList, 130 CPDF_PageObjectHolder::const_iterator iter) const; 131 bool IsSameTextObject(CPDF_TextObject* pTextObj1, 132 CPDF_TextObject* pTextObj2) const; 133 void CloseTempLine(); 134 MarkedContentState PreMarkedContent(const CPDF_TextObject* pTextObj); 135 void ProcessMarkedContent(const TransformedTextObject& obj); 136 void FindPreviousTextObject(); 137 void AddCharInfoByLRDirection(wchar_t wChar, const CharInfo& info); 138 void AddCharInfoByRLDirection(wchar_t wChar, const CharInfo& info); 139 TextOrientation GetTextObjectWritingMode( 140 const CPDF_TextObject* pTextObj) const; 141 TextOrientation FindTextlineFlowOrientation() const; 142 void AppendGeneratedCharacter(wchar_t unicode, const CFX_Matrix& formMatrix); 143 void SwapTempTextBuf(size_t iCharListStartAppend, size_t iBufStartAppend); 144 WideString GetTextByPredicate( 145 const std::function<bool(const CharInfo&)>& predicate) const; 146 147 UnownedPtr<const CPDF_Page> const m_pPage; 148 DataVector<TextPageCharSegment> m_CharIndices; 149 std::deque<CharInfo> m_CharList; 150 std::deque<CharInfo> m_TempCharList; 151 WideTextBuffer m_TextBuf; 152 WideTextBuffer m_TempTextBuf; 153 UnownedPtr<const CPDF_TextObject> m_pPrevTextObj; 154 CFX_Matrix m_PrevMatrix; 155 const bool m_rtl; 156 const CFX_Matrix m_DisplayMatrix; 157 std::vector<CFX_FloatRect> m_SelRects; 158 std::vector<TransformedTextObject> mTextObjects; 159 TextOrientation m_TextlineDir = TextOrientation::kUnknown; 160 CFX_FloatRect m_CurlineRect; 161 }; 162 163 #endif // CORE_FPDFTEXT_CPDF_TEXTPAGE_H_ 164