xref: /aosp_15_r20/external/skia/src/pdf/SkPDFMakeToUnicodeCmap.cpp (revision c8dee2aa9b3f27cf6c858bd81872bdeb2c07ed17)
1 /*
2  * Copyright 2011 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "src/pdf/SkPDFMakeToUnicodeCmap.h"
9 
10 #include "include/core/SkStream.h"
11 #include "include/core/SkString.h"
12 #include "include/private/base/SkTo.h"
13 #include "src/base/SkUTF.h"
14 #include "src/pdf/SkPDFGlyphUse.h"
15 #include "src/pdf/SkPDFUtils.h"
16 
17 #include <algorithm>
18 #include <cstddef>
19 #include <vector>
20 
21 using namespace skia_private;
22 
append_tounicode_header(SkDynamicMemoryWStream * cmap,bool multibyte)23 static void append_tounicode_header(SkDynamicMemoryWStream* cmap,
24                                     bool multibyte) {
25     // 12 dict begin: 12 is an Adobe-suggested value. Shall not change.
26     // It's there to prevent old version Adobe Readers from malfunctioning.
27     const char* kHeader =
28         "/CIDInit /ProcSet findresource begin\n"
29         "12 dict begin\n"
30         "begincmap\n";
31     cmap->writeText(kHeader);
32 
33     // The /CIDSystemInfo must be consistent to the one in
34     // SkPDFFont::populateCIDFont().
35     // We can not pass over the system info object here because the format is
36     // different. This is not a reference object.
37     const char* kSysInfo =
38         "/CIDSystemInfo\n"
39         "<<  /Registry (Adobe)\n"
40         "/Ordering (UCS)\n"
41         "/Supplement 0\n"
42         ">> def\n";
43     cmap->writeText(kSysInfo);
44 
45     // The CMapName must be consistent to /CIDSystemInfo above.
46     // /CMapType 2 means ToUnicode.
47     // Codespace range just tells the PDF processor the valid range.
48     const char* kTypeInfoHeader =
49         "/CMapName /Adobe-Identity-UCS def\n"
50         "/CMapType 2 def\n"
51         "1 begincodespacerange\n";
52     cmap->writeText(kTypeInfoHeader);
53     if (multibyte) {
54         cmap->writeText("<0000> <FFFF>\n");
55     } else {
56         cmap->writeText("<00> <FF>\n");
57     }
58     cmap->writeText("endcodespacerange\n");
59 }
60 
append_cmap_footer(SkDynamicMemoryWStream * cmap)61 static void append_cmap_footer(SkDynamicMemoryWStream* cmap) {
62     const char kFooter[] =
63         "endcmap\n"
64         "CMapName currentdict /CMap defineresource pop\n"
65         "end\n"
66         "end";
67     cmap->writeText(kFooter);
68 }
69 
70 namespace {
71 struct BFChar {
72     SkGlyphID fGlyphId;
73     SkUnichar fUnicode;
74 };
75 
76 struct BFRange {
77     SkGlyphID fStart;
78     SkGlyphID fEnd;
79     SkUnichar fUnicode;
80 };
81 }  // namespace
82 
write_glyph(SkDynamicMemoryWStream * cmap,bool multiByte,SkGlyphID gid)83 static void write_glyph(SkDynamicMemoryWStream* cmap,
84                         bool multiByte,
85                         SkGlyphID gid) {
86     if (multiByte) {
87         SkPDFUtils::WriteUInt16BE(cmap, gid);
88     } else {
89         SkPDFUtils::WriteUInt8(cmap, SkToU8(gid));
90     }
91 }
92 
append_bfchar_section(const std::vector<BFChar> & bfchar,bool multiByte,SkDynamicMemoryWStream * cmap)93 static void append_bfchar_section(const std::vector<BFChar>& bfchar,
94                                   bool multiByte,
95                                   SkDynamicMemoryWStream* cmap) {
96     // PDF spec defines that every bf* list can have at most 100 entries.
97     for (size_t i = 0; i < bfchar.size(); i += 100) {
98         int count = SkToInt(bfchar.size() - i);
99         count = std::min(count, 100);
100         cmap->writeDecAsText(count);
101         cmap->writeText(" beginbfchar\n");
102         for (int j = 0; j < count; ++j) {
103             cmap->writeText("<");
104             write_glyph(cmap, multiByte, bfchar[i + j].fGlyphId);
105             cmap->writeText("> <");
106             SkPDFUtils::WriteUTF16beHex(cmap, bfchar[i + j].fUnicode);
107             cmap->writeText(">\n");
108         }
109         cmap->writeText("endbfchar\n");
110     }
111 }
112 
append_bfchar_section_ex(const THashMap<SkGlyphID,SkString> & glyphToUnicodeEx,bool multiByte,SkGlyphID firstGlyphID,SkGlyphID lastGlyphID,SkDynamicMemoryWStream * cmap)113 static void append_bfchar_section_ex(const THashMap<SkGlyphID, SkString>& glyphToUnicodeEx,
114                                      bool multiByte, SkGlyphID firstGlyphID, SkGlyphID lastGlyphID,
115                                      SkDynamicMemoryWStream* cmap) {
116     size_t glyphCount = 0;
117     glyphToUnicodeEx.foreach([&](const SkGlyphID& glyphId, const SkString& a) {
118         if (glyphId < firstGlyphID || lastGlyphID < glyphId) {
119             return;
120         }
121         ++glyphCount;
122     });
123 
124     int glyphOffset = 0;
125     if (!multiByte) {
126         glyphOffset = firstGlyphID - 1;
127     }
128     // PDF spec defines that every bf* list can have at most 100 entries.
129     size_t i = 0;
130     glyphToUnicodeEx.foreach([&](const SkGlyphID& glyphId, const SkString& a) {
131         if (glyphId < firstGlyphID || lastGlyphID < glyphId) {
132             return;
133         }
134         if (i % 100 == 0) {
135             size_t count = glyphCount - i;
136             count = std::min(count, SkToSizeT(100));
137             cmap->writeDecAsText(count);
138             cmap->writeText(" beginbfchar\n");
139         }
140 
141         cmap->writeText("<");
142         write_glyph(cmap, multiByte, glyphId - glyphOffset);
143         cmap->writeText("> <");
144         const char* textPtr = a.begin();
145         const char* textEnd = a.end();
146         while (textPtr < textEnd) {
147             SkUnichar unichar = SkUTF::NextUTF8(&textPtr, textEnd);
148             SkPDFUtils::WriteUTF16beHex(cmap, unichar);
149         }
150         cmap->writeText(">\n");
151 
152         if (i % 100 == 99 || i == glyphCount - 1) {
153             cmap->writeText("endbfchar\n");
154         }
155         ++i;
156     });
157 }
158 
append_bfrange_section(const std::vector<BFRange> & bfrange,bool multiByte,SkDynamicMemoryWStream * cmap)159 static void append_bfrange_section(const std::vector<BFRange>& bfrange,
160                                    bool multiByte,
161                                    SkDynamicMemoryWStream* cmap) {
162     // PDF spec defines that every bf* list can have at most 100 entries.
163     for (size_t i = 0; i < bfrange.size(); i += 100) {
164         int count = SkToInt(bfrange.size() - i);
165         count = std::min(count, 100);
166         cmap->writeDecAsText(count);
167         cmap->writeText(" beginbfrange\n");
168         for (int j = 0; j < count; ++j) {
169             cmap->writeText("<");
170             write_glyph(cmap, multiByte, bfrange[i + j].fStart);
171             cmap->writeText("> <");
172             write_glyph(cmap, multiByte, bfrange[i + j].fEnd);
173             cmap->writeText("> <");
174             SkPDFUtils::WriteUTF16beHex(cmap, bfrange[i + j].fUnicode);
175             cmap->writeText(">\n");
176         }
177         cmap->writeText("endbfrange\n");
178     }
179 }
180 
181 // Generate <bfchar> and <bfrange> table according to PDF spec 1.4 and Adobe
182 // Technote 5014.
183 // The function is not static so we can test it in unit tests.
184 //
185 // Current implementation guarantees bfchar and bfrange entries do not overlap.
186 //
187 // Current implementation does not attempt aggressive optimizations against
188 // following case because the specification is not clear.
189 //
190 // 4 beginbfchar          1 beginbfchar
191 // <0003> <0013>          <0020> <0014>
192 // <0005> <0015>    to    endbfchar
193 // <0007> <0017>          1 beginbfrange
194 // <0020> <0014>          <0003> <0007> <0013>
195 // endbfchar              endbfrange
196 //
197 // Adobe Technote 5014 said: "Code mappings (unlike codespace ranges) may
198 // overlap, but succeeding maps supersede preceding maps."
199 //
200 // In case of searching text in PDF, bfrange will have higher precedence so
201 // typing char id 0x0014 in search box will get glyph id 0x0004 first.  However,
202 // the spec does not mention how will this kind of conflict being resolved.
203 //
204 // For the worst case (having 65536 continuous unicode and we use every other
205 // one of them), the possible savings by aggressive optimization is 416KB
206 // pre-compressed and does not provide enough motivation for implementation.
SkPDFAppendCmapSections(const SkUnichar * glyphToUnicode,const THashMap<SkGlyphID,SkString> & glyphToUnicodeEx,const SkPDFGlyphUse * subset,SkDynamicMemoryWStream * cmap,bool multiByteGlyphs,SkGlyphID firstGlyphID,SkGlyphID lastGlyphID)207 void SkPDFAppendCmapSections(const SkUnichar* glyphToUnicode,
208                              const THashMap<SkGlyphID, SkString>& glyphToUnicodeEx,
209                              const SkPDFGlyphUse* subset,
210                              SkDynamicMemoryWStream* cmap,
211                              bool multiByteGlyphs,
212                              SkGlyphID firstGlyphID,
213                              SkGlyphID lastGlyphID) {
214     int glyphOffset = 0;
215     if (!multiByteGlyphs) {
216         glyphOffset = firstGlyphID - 1;
217     }
218 
219     std::vector<BFChar> bfcharEntries;
220     std::vector<BFRange> bfrangeEntries;
221 
222     BFRange currentRangeEntry = {0, 0, 0};
223     bool rangeEmpty = true;
224     const int limit = (int)lastGlyphID + 1 - glyphOffset;
225 
226     for (int i = firstGlyphID - glyphOffset; i < limit + 1; ++i) {
227         SkGlyphID gid = i + glyphOffset;
228         bool inSubset = i < limit && (subset == nullptr || subset->has(gid));
229         if (!rangeEmpty) {
230             // PDF spec requires bfrange not changing the higher byte,
231             // e.g. <1035> <10FF> <2222> is ok, but
232             //      <1035> <1100> <2222> is no good
233             bool inRange =
234                 i == currentRangeEntry.fEnd + 1 &&
235                 i >> 8 == currentRangeEntry.fStart >> 8 &&
236                 i < limit &&
237                 glyphToUnicode[gid] ==
238                     currentRangeEntry.fUnicode + i - currentRangeEntry.fStart;
239             if (!inSubset || !inRange) {
240                 if (currentRangeEntry.fEnd > currentRangeEntry.fStart) {
241                     bfrangeEntries.push_back(currentRangeEntry);
242                 } else {
243                     bfcharEntries.push_back({currentRangeEntry.fStart, currentRangeEntry.fUnicode});
244                 }
245                 rangeEmpty = true;
246             }
247         }
248         if (inSubset) {
249             currentRangeEntry.fEnd = i;
250             if (rangeEmpty) {
251               currentRangeEntry.fStart = i;
252               currentRangeEntry.fUnicode = glyphToUnicode[gid];
253               rangeEmpty = false;
254             }
255         }
256     }
257 
258     // The spec requires all bfchar entries for a font must come before bfrange
259     // entries.
260     append_bfchar_section(bfcharEntries, multiByteGlyphs, cmap);
261     append_bfchar_section_ex(glyphToUnicodeEx, multiByteGlyphs, firstGlyphID, lastGlyphID, cmap);
262     append_bfrange_section(bfrangeEntries, multiByteGlyphs, cmap);
263 }
264 
SkPDFMakeToUnicodeCmap(const SkUnichar * glyphToUnicode,const THashMap<SkGlyphID,SkString> & glyphToUnicodeEx,const SkPDFGlyphUse * subset,bool multiByteGlyphs,SkGlyphID firstGlyphID,SkGlyphID lastGlyphID)265 std::unique_ptr<SkStreamAsset> SkPDFMakeToUnicodeCmap(
266         const SkUnichar* glyphToUnicode,
267         const THashMap<SkGlyphID, SkString>& glyphToUnicodeEx,
268         const SkPDFGlyphUse* subset,
269         bool multiByteGlyphs,
270         SkGlyphID firstGlyphID,
271         SkGlyphID lastGlyphID) {
272     SkDynamicMemoryWStream cmap;
273     append_tounicode_header(&cmap, multiByteGlyphs);
274     SkPDFAppendCmapSections(glyphToUnicode, glyphToUnicodeEx, subset, &cmap, multiByteGlyphs,
275                             firstGlyphID, lastGlyphID);
276     append_cmap_footer(&cmap);
277     return cmap.detachAsStream();
278 }
279