1 /*
2 * Copyright 2011 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #include "src/pdf/SkPDFMakeToUnicodeCmap.h"
9
10 #include "include/core/SkStream.h"
11 #include "include/core/SkString.h"
12 #include "include/private/base/SkTo.h"
13 #include "src/base/SkUTF.h"
14 #include "src/pdf/SkPDFGlyphUse.h"
15 #include "src/pdf/SkPDFUtils.h"
16
17 #include <algorithm>
18 #include <cstddef>
19 #include <vector>
20
21 using namespace skia_private;
22
append_tounicode_header(SkDynamicMemoryWStream * cmap,bool multibyte)23 static void append_tounicode_header(SkDynamicMemoryWStream* cmap,
24 bool multibyte) {
25 // 12 dict begin: 12 is an Adobe-suggested value. Shall not change.
26 // It's there to prevent old version Adobe Readers from malfunctioning.
27 const char* kHeader =
28 "/CIDInit /ProcSet findresource begin\n"
29 "12 dict begin\n"
30 "begincmap\n";
31 cmap->writeText(kHeader);
32
33 // The /CIDSystemInfo must be consistent to the one in
34 // SkPDFFont::populateCIDFont().
35 // We can not pass over the system info object here because the format is
36 // different. This is not a reference object.
37 const char* kSysInfo =
38 "/CIDSystemInfo\n"
39 "<< /Registry (Adobe)\n"
40 "/Ordering (UCS)\n"
41 "/Supplement 0\n"
42 ">> def\n";
43 cmap->writeText(kSysInfo);
44
45 // The CMapName must be consistent to /CIDSystemInfo above.
46 // /CMapType 2 means ToUnicode.
47 // Codespace range just tells the PDF processor the valid range.
48 const char* kTypeInfoHeader =
49 "/CMapName /Adobe-Identity-UCS def\n"
50 "/CMapType 2 def\n"
51 "1 begincodespacerange\n";
52 cmap->writeText(kTypeInfoHeader);
53 if (multibyte) {
54 cmap->writeText("<0000> <FFFF>\n");
55 } else {
56 cmap->writeText("<00> <FF>\n");
57 }
58 cmap->writeText("endcodespacerange\n");
59 }
60
append_cmap_footer(SkDynamicMemoryWStream * cmap)61 static void append_cmap_footer(SkDynamicMemoryWStream* cmap) {
62 const char kFooter[] =
63 "endcmap\n"
64 "CMapName currentdict /CMap defineresource pop\n"
65 "end\n"
66 "end";
67 cmap->writeText(kFooter);
68 }
69
70 namespace {
71 struct BFChar {
72 SkGlyphID fGlyphId;
73 SkUnichar fUnicode;
74 };
75
76 struct BFRange {
77 SkGlyphID fStart;
78 SkGlyphID fEnd;
79 SkUnichar fUnicode;
80 };
81 } // namespace
82
write_glyph(SkDynamicMemoryWStream * cmap,bool multiByte,SkGlyphID gid)83 static void write_glyph(SkDynamicMemoryWStream* cmap,
84 bool multiByte,
85 SkGlyphID gid) {
86 if (multiByte) {
87 SkPDFUtils::WriteUInt16BE(cmap, gid);
88 } else {
89 SkPDFUtils::WriteUInt8(cmap, SkToU8(gid));
90 }
91 }
92
append_bfchar_section(const std::vector<BFChar> & bfchar,bool multiByte,SkDynamicMemoryWStream * cmap)93 static void append_bfchar_section(const std::vector<BFChar>& bfchar,
94 bool multiByte,
95 SkDynamicMemoryWStream* cmap) {
96 // PDF spec defines that every bf* list can have at most 100 entries.
97 for (size_t i = 0; i < bfchar.size(); i += 100) {
98 int count = SkToInt(bfchar.size() - i);
99 count = std::min(count, 100);
100 cmap->writeDecAsText(count);
101 cmap->writeText(" beginbfchar\n");
102 for (int j = 0; j < count; ++j) {
103 cmap->writeText("<");
104 write_glyph(cmap, multiByte, bfchar[i + j].fGlyphId);
105 cmap->writeText("> <");
106 SkPDFUtils::WriteUTF16beHex(cmap, bfchar[i + j].fUnicode);
107 cmap->writeText(">\n");
108 }
109 cmap->writeText("endbfchar\n");
110 }
111 }
112
append_bfchar_section_ex(const THashMap<SkGlyphID,SkString> & glyphToUnicodeEx,bool multiByte,SkGlyphID firstGlyphID,SkGlyphID lastGlyphID,SkDynamicMemoryWStream * cmap)113 static void append_bfchar_section_ex(const THashMap<SkGlyphID, SkString>& glyphToUnicodeEx,
114 bool multiByte, SkGlyphID firstGlyphID, SkGlyphID lastGlyphID,
115 SkDynamicMemoryWStream* cmap) {
116 size_t glyphCount = 0;
117 glyphToUnicodeEx.foreach([&](const SkGlyphID& glyphId, const SkString& a) {
118 if (glyphId < firstGlyphID || lastGlyphID < glyphId) {
119 return;
120 }
121 ++glyphCount;
122 });
123
124 int glyphOffset = 0;
125 if (!multiByte) {
126 glyphOffset = firstGlyphID - 1;
127 }
128 // PDF spec defines that every bf* list can have at most 100 entries.
129 size_t i = 0;
130 glyphToUnicodeEx.foreach([&](const SkGlyphID& glyphId, const SkString& a) {
131 if (glyphId < firstGlyphID || lastGlyphID < glyphId) {
132 return;
133 }
134 if (i % 100 == 0) {
135 size_t count = glyphCount - i;
136 count = std::min(count, SkToSizeT(100));
137 cmap->writeDecAsText(count);
138 cmap->writeText(" beginbfchar\n");
139 }
140
141 cmap->writeText("<");
142 write_glyph(cmap, multiByte, glyphId - glyphOffset);
143 cmap->writeText("> <");
144 const char* textPtr = a.begin();
145 const char* textEnd = a.end();
146 while (textPtr < textEnd) {
147 SkUnichar unichar = SkUTF::NextUTF8(&textPtr, textEnd);
148 SkPDFUtils::WriteUTF16beHex(cmap, unichar);
149 }
150 cmap->writeText(">\n");
151
152 if (i % 100 == 99 || i == glyphCount - 1) {
153 cmap->writeText("endbfchar\n");
154 }
155 ++i;
156 });
157 }
158
append_bfrange_section(const std::vector<BFRange> & bfrange,bool multiByte,SkDynamicMemoryWStream * cmap)159 static void append_bfrange_section(const std::vector<BFRange>& bfrange,
160 bool multiByte,
161 SkDynamicMemoryWStream* cmap) {
162 // PDF spec defines that every bf* list can have at most 100 entries.
163 for (size_t i = 0; i < bfrange.size(); i += 100) {
164 int count = SkToInt(bfrange.size() - i);
165 count = std::min(count, 100);
166 cmap->writeDecAsText(count);
167 cmap->writeText(" beginbfrange\n");
168 for (int j = 0; j < count; ++j) {
169 cmap->writeText("<");
170 write_glyph(cmap, multiByte, bfrange[i + j].fStart);
171 cmap->writeText("> <");
172 write_glyph(cmap, multiByte, bfrange[i + j].fEnd);
173 cmap->writeText("> <");
174 SkPDFUtils::WriteUTF16beHex(cmap, bfrange[i + j].fUnicode);
175 cmap->writeText(">\n");
176 }
177 cmap->writeText("endbfrange\n");
178 }
179 }
180
// Generate <bfchar> and <bfrange> tables according to PDF spec 1.4 and Adobe
// Technote 5014.
// The function is not static so that it can be exercised by unit tests.
//
// The current implementation guarantees that bfchar and bfrange entries do
// not overlap.
//
// The current implementation does not attempt aggressive optimizations for
// the following case, because the specification is not clear:
//
//   4 beginbfchar          1 beginbfchar
//   <0003> <0013>          <0020> <0014>
//   <0005> <0015>    to    endbfchar
//   <0007> <0017>          1 beginbfrange
//   <0020> <0014>          <0003> <0007> <0013>
//   endbfchar              endbfrange
//
// Adobe Technote 5014 says: "Code mappings (unlike codespace ranges) may
// overlap, but succeeding maps supersede preceding maps."
//
// When searching text in a PDF, bfrange will have higher precedence, so
// typing char id 0x0014 in the search box will get glyph id 0x0004 first.
// However, the spec does not say how this kind of conflict is resolved.
//
// For the worst case (65536 contiguous Unicode values of which every other
// one is used), the possible saving from aggressive optimization is 416KB
// before compression, which is not enough motivation to implement it.
SkPDFAppendCmapSections(const SkUnichar * glyphToUnicode,const THashMap<SkGlyphID,SkString> & glyphToUnicodeEx,const SkPDFGlyphUse * subset,SkDynamicMemoryWStream * cmap,bool multiByteGlyphs,SkGlyphID firstGlyphID,SkGlyphID lastGlyphID)207 void SkPDFAppendCmapSections(const SkUnichar* glyphToUnicode,
208 const THashMap<SkGlyphID, SkString>& glyphToUnicodeEx,
209 const SkPDFGlyphUse* subset,
210 SkDynamicMemoryWStream* cmap,
211 bool multiByteGlyphs,
212 SkGlyphID firstGlyphID,
213 SkGlyphID lastGlyphID) {
214 int glyphOffset = 0;
215 if (!multiByteGlyphs) {
216 glyphOffset = firstGlyphID - 1;
217 }
218
219 std::vector<BFChar> bfcharEntries;
220 std::vector<BFRange> bfrangeEntries;
221
222 BFRange currentRangeEntry = {0, 0, 0};
223 bool rangeEmpty = true;
224 const int limit = (int)lastGlyphID + 1 - glyphOffset;
225
226 for (int i = firstGlyphID - glyphOffset; i < limit + 1; ++i) {
227 SkGlyphID gid = i + glyphOffset;
228 bool inSubset = i < limit && (subset == nullptr || subset->has(gid));
229 if (!rangeEmpty) {
230 // PDF spec requires bfrange not changing the higher byte,
231 // e.g. <1035> <10FF> <2222> is ok, but
232 // <1035> <1100> <2222> is no good
233 bool inRange =
234 i == currentRangeEntry.fEnd + 1 &&
235 i >> 8 == currentRangeEntry.fStart >> 8 &&
236 i < limit &&
237 glyphToUnicode[gid] ==
238 currentRangeEntry.fUnicode + i - currentRangeEntry.fStart;
239 if (!inSubset || !inRange) {
240 if (currentRangeEntry.fEnd > currentRangeEntry.fStart) {
241 bfrangeEntries.push_back(currentRangeEntry);
242 } else {
243 bfcharEntries.push_back({currentRangeEntry.fStart, currentRangeEntry.fUnicode});
244 }
245 rangeEmpty = true;
246 }
247 }
248 if (inSubset) {
249 currentRangeEntry.fEnd = i;
250 if (rangeEmpty) {
251 currentRangeEntry.fStart = i;
252 currentRangeEntry.fUnicode = glyphToUnicode[gid];
253 rangeEmpty = false;
254 }
255 }
256 }
257
258 // The spec requires all bfchar entries for a font must come before bfrange
259 // entries.
260 append_bfchar_section(bfcharEntries, multiByteGlyphs, cmap);
261 append_bfchar_section_ex(glyphToUnicodeEx, multiByteGlyphs, firstGlyphID, lastGlyphID, cmap);
262 append_bfrange_section(bfrangeEntries, multiByteGlyphs, cmap);
263 }
264
SkPDFMakeToUnicodeCmap(const SkUnichar * glyphToUnicode,const THashMap<SkGlyphID,SkString> & glyphToUnicodeEx,const SkPDFGlyphUse * subset,bool multiByteGlyphs,SkGlyphID firstGlyphID,SkGlyphID lastGlyphID)265 std::unique_ptr<SkStreamAsset> SkPDFMakeToUnicodeCmap(
266 const SkUnichar* glyphToUnicode,
267 const THashMap<SkGlyphID, SkString>& glyphToUnicodeEx,
268 const SkPDFGlyphUse* subset,
269 bool multiByteGlyphs,
270 SkGlyphID firstGlyphID,
271 SkGlyphID lastGlyphID) {
272 SkDynamicMemoryWStream cmap;
273 append_tounicode_header(&cmap, multiByteGlyphs);
274 SkPDFAppendCmapSections(glyphToUnicode, glyphToUnicodeEx, subset, &cmap, multiByteGlyphs,
275 firstGlyphID, lastGlyphID);
276 append_cmap_footer(&cmap);
277 return cmap.detachAsStream();
278 }
279