1 //! Mapping of characters to nominal glyph identifiers.
2 //!
3 //! The functionality in this module provides a 1-to-1 mapping from Unicode
4 //! characters (or [Unicode variation sequences](http://unicode.org/faq/vs.html)) to
5 //! nominal or "default" internal glyph identifiers for a given font.
6 //! This is a necessary first step, but generally insufficient for proper layout of
7 //! [complex text](https://en.wikipedia.org/wiki/Complex_text_layout) or even
8 //! simple text containing diacritics and ligatures.
9 //!
10 //! Comprehensive mapping of characters to positioned glyphs requires a process called
11 //! shaping. For more detail, see: [Why do I need a shaping engine?](https://harfbuzz.github.io/why-do-i-need-a-shaping-engine.html)
12
13 use read_fonts::{
14 tables::cmap::{
15 self, Cmap, Cmap12, Cmap12Iter, Cmap14, Cmap14Iter, Cmap4, Cmap4Iter, CmapSubtable,
16 EncodingRecord, PlatformId,
17 },
18 types::GlyphId,
19 FontData, TableProvider,
20 };
21
22 pub use read_fonts::tables::cmap::MapVariant;
23
24 /// Mapping of characters to nominal glyph identifiers.
25 ///
26 /// The mappings are derived from the [cmap](https://learn.microsoft.com/en-us/typography/opentype/spec/cmap)
27 /// table.
28 ///
29 /// ## Selection strategy
30 ///
31 /// Fonts may contain multiple subtables in various formats supporting different encodings. The selection
32 /// strategy implemented here is designed to choose mappings that capture the broadest available Unicode
33 /// coverage:
34 ///
35 /// * Unicode characters: a symbol mapping subtable is selected if available. Otherwise, subtables supporting
36 /// the Unicode full repertoire or Basic Multilingual Plane (BMP) are preferred, in that order. Formats
37 /// [4](https://learn.microsoft.com/en-us/typography/opentype/spec/cmap#format-4-segment-mapping-to-delta-values)
38 /// and [12](https://learn.microsoft.com/en-us/typography/opentype/spec/cmap#format-12-segmented-coverage) are
39 /// supported.
40 ///
41 /// * Unicode variation sequences: these are provided by a format
42 /// [14](https://learn.microsoft.com/en-us/typography/opentype/spec/cmap#format-14-unicode-variation-sequences)
43 /// subtable.
44 ///
45 #[derive(Clone, Default)]
46 pub struct Charmap<'a> {
47 codepoint_subtable: Option<CodepointSubtable<'a>>,
48 variant_subtable: Option<Cmap14<'a>>,
49 }
50
51 impl<'a> Charmap<'a> {
52 /// Creates a new character map from the given font.
new(font: &impl TableProvider<'a>) -> Self53 pub fn new(font: &impl TableProvider<'a>) -> Self {
54 let Ok(cmap) = font.cmap() else {
55 return Default::default();
56 };
57 let selection = MappingSelection::new(&cmap);
58 Self {
59 codepoint_subtable: selection
60 .codepoint_subtable
61 .map(|subtable| CodepointSubtable {
62 subtable,
63 is_symbol: selection.mapping_index.codepoint_subtable_is_symbol,
64 }),
65 variant_subtable: selection.variant_subtable,
66 }
67 }
68
69 /// Returns true if a suitable Unicode character mapping is available.
has_map(&self) -> bool70 pub fn has_map(&self) -> bool {
71 self.codepoint_subtable.is_some()
72 }
73
74 /// Returns true if a symbol mapping was selected.
is_symbol(&self) -> bool75 pub fn is_symbol(&self) -> bool {
76 self.codepoint_subtable
77 .as_ref()
78 .map(|x| x.is_symbol)
79 .unwrap_or(false)
80 }
81
82 /// Returns true if a Unicode variation sequence mapping is available.
has_variant_map(&self) -> bool83 pub fn has_variant_map(&self) -> bool {
84 self.variant_subtable.is_some()
85 }
86
87 /// Maps a character to a nominal glyph identifier.
88 ///
89 /// Returns `None` if a mapping does not exist.
map(&self, ch: impl Into<u32>) -> Option<GlyphId>90 pub fn map(&self, ch: impl Into<u32>) -> Option<GlyphId> {
91 self.codepoint_subtable.as_ref()?.map(ch.into())
92 }
93
94 /// Returns an iterator over all mappings of codepoint to nominal glyph
95 /// identifiers in the character map.
mappings(&self) -> Mappings<'a>96 pub fn mappings(&self) -> Mappings<'a> {
97 self.codepoint_subtable
98 .as_ref()
99 .map(|subtable| {
100 Mappings(match &subtable.subtable {
101 SupportedSubtable::Format4(cmap4) => MappingsInner::Format4(cmap4.iter()),
102 SupportedSubtable::Format12(cmap12) => MappingsInner::Format12(cmap12.iter()),
103 })
104 })
105 .unwrap_or(Mappings(MappingsInner::None))
106 }
107
108 /// Maps a character and variation selector to a nominal glyph identifier.
109 ///
110 /// Returns `None` if a mapping does not exist.
map_variant(&self, ch: impl Into<u32>, selector: impl Into<u32>) -> Option<MapVariant>111 pub fn map_variant(&self, ch: impl Into<u32>, selector: impl Into<u32>) -> Option<MapVariant> {
112 self.variant_subtable.as_ref()?.map_variant(ch, selector)
113 }
114
115 /// Returns an iterator over all mappings of character and variation
116 /// selector to nominal glyph identifier in the character map.
variant_mappings(&self) -> VariantMappings<'a>117 pub fn variant_mappings(&self) -> VariantMappings<'a> {
118 VariantMappings(self.variant_subtable.clone().map(|cmap14| cmap14.iter()))
119 }
120 }
121
122 /// Cacheable indices of selected mapping tables for materializing a character
123 /// map.
124 ///
125 /// Since [`Charmap`] carries a lifetime, it is difficult to store in a cache.
126 /// This type serves as an acceleration structure that allows for construction
127 /// of a character map while skipping the search for the most suitable Unicode
128 /// mappings.
129 #[derive(Copy, Clone, Default, Debug)]
130 pub struct MappingIndex {
131 /// Index of Unicode or symbol mapping subtable.
132 codepoint_subtable: Option<u16>,
133 /// True if the above is a symbol mapping.
134 codepoint_subtable_is_symbol: bool,
135 /// Index of Unicode variation selector subtable.
136 variant_subtable: Option<u16>,
137 }
138
139 impl MappingIndex {
140 /// Finds the indices of the most suitable Unicode mapping tables in the
141 /// given font.
new<'a>(font: &impl TableProvider<'a>) -> Self142 pub fn new<'a>(font: &impl TableProvider<'a>) -> Self {
143 let Ok(cmap) = font.cmap() else {
144 return Default::default();
145 };
146 MappingSelection::new(&cmap).mapping_index
147 }
148
149 /// Creates a new character map for the given font using the tables referenced by
150 /// the precomputed indices.
151 ///
152 /// The font should be the same as the one used to construct this object.
charmap<'a>(&self, font: &impl TableProvider<'a>) -> Charmap<'a>153 pub fn charmap<'a>(&self, font: &impl TableProvider<'a>) -> Charmap<'a> {
154 let Ok(cmap) = font.cmap() else {
155 return Default::default();
156 };
157 let records = cmap.encoding_records();
158 let data = cmap.offset_data();
159 Charmap {
160 codepoint_subtable: self
161 .codepoint_subtable
162 .and_then(|index| get_subtable(data, records, index))
163 .and_then(SupportedSubtable::new)
164 .map(|subtable| CodepointSubtable {
165 subtable,
166 is_symbol: self.codepoint_subtable_is_symbol,
167 }),
168 variant_subtable: self
169 .variant_subtable
170 .and_then(|index| get_subtable(data, records, index))
171 .and_then(|subtable| match subtable {
172 CmapSubtable::Format14(cmap14) => Some(cmap14),
173 _ => None,
174 }),
175 }
176 }
177 }
178
179 /// Iterator over all mappings of character to nominal glyph identifier
180 /// in a character map.
181 ///
182 /// This is created with the [`Charmap::mappings`] method.
183 #[derive(Clone)]
184 pub struct Mappings<'a>(MappingsInner<'a>);
185
186 impl<'a> Iterator for Mappings<'a> {
187 type Item = (u32, GlyphId);
188
next(&mut self) -> Option<Self::Item>189 fn next(&mut self) -> Option<Self::Item> {
190 match &mut self.0 {
191 MappingsInner::None => None,
192 MappingsInner::Format4(iter) => iter.next(),
193 MappingsInner::Format12(iter) => iter.next(),
194 }
195 }
196 }
197
198 #[derive(Clone)]
199 enum MappingsInner<'a> {
200 None,
201 Format4(Cmap4Iter<'a>),
202 Format12(Cmap12Iter<'a>),
203 }
204
205 /// Iterator over all mappings of character and variation selector to
206 /// nominal glyph identifier in a character map.
207 ///
208 /// This is created with the [`Charmap::variant_mappings`] method.
209 #[derive(Clone)]
210 pub struct VariantMappings<'a>(Option<Cmap14Iter<'a>>);
211
212 impl<'a> Iterator for VariantMappings<'a> {
213 type Item = (u32, u32, MapVariant);
214
next(&mut self) -> Option<Self::Item>215 fn next(&mut self) -> Option<Self::Item> {
216 self.0.as_mut()?.next()
217 }
218 }
219
get_subtable<'a>( data: FontData<'a>, records: &[EncodingRecord], index: u16, ) -> Option<CmapSubtable<'a>>220 fn get_subtable<'a>(
221 data: FontData<'a>,
222 records: &[EncodingRecord],
223 index: u16,
224 ) -> Option<CmapSubtable<'a>> {
225 records
226 .get(index as usize)
227 .and_then(|record| record.subtable(data).ok())
228 }
229
230 #[derive(Clone)]
231 struct CodepointSubtable<'a> {
232 subtable: SupportedSubtable<'a>,
233 /// True if the subtable is a symbol mapping.
234 is_symbol: bool,
235 }
236
237 impl<'a> CodepointSubtable<'a> {
map(&self, codepoint: u32) -> Option<GlyphId>238 fn map(&self, codepoint: u32) -> Option<GlyphId> {
239 self.map_impl(codepoint).or_else(|| {
240 if self.is_symbol && codepoint <= 0x00FF {
241 // From HarfBuzz:
242 // For symbol-encoded OpenType fonts, we duplicate the
243 // U+F000..F0FF range at U+0000..U+00FF. That's what
244 // Windows seems to do, and that's hinted about at:
245 // https://docs.microsoft.com/en-us/typography/opentype/spec/recom
246 // under "Non-Standard (Symbol) Fonts".
247 // See <https://github.com/harfbuzz/harfbuzz/blob/453ded05392af38bba9f89587edce465e86ffa6b/src/hb-ot-cmap-table.hh#L1595>
248 self.map_impl(codepoint + 0xF000)
249 } else {
250 None
251 }
252 })
253 }
254
map_impl(&self, codepoint: u32) -> Option<GlyphId>255 fn map_impl(&self, codepoint: u32) -> Option<GlyphId> {
256 match &self.subtable {
257 SupportedSubtable::Format4(subtable) => subtable.map_codepoint(codepoint),
258 SupportedSubtable::Format12(subtable) => subtable.map_codepoint(codepoint),
259 }
260 }
261 }
262
263 #[derive(Clone)]
264 enum SupportedSubtable<'a> {
265 Format4(Cmap4<'a>),
266 Format12(Cmap12<'a>),
267 }
268
269 impl<'a> SupportedSubtable<'a> {
new(subtable: CmapSubtable<'a>) -> Option<Self>270 fn new(subtable: CmapSubtable<'a>) -> Option<Self> {
271 Some(match subtable {
272 CmapSubtable::Format4(cmap4) => Self::Format4(cmap4),
273 CmapSubtable::Format12(cmap12) => Self::Format12(cmap12),
274 _ => return None,
275 })
276 }
277
from_cmap_record(cmap: &Cmap<'a>, record: &cmap::EncodingRecord) -> Option<Self>278 fn from_cmap_record(cmap: &Cmap<'a>, record: &cmap::EncodingRecord) -> Option<Self> {
279 Self::new(record.subtable(cmap.offset_data()).ok()?)
280 }
281 }
282
/// The mapping kind of a cmap subtable.
///
/// The ordering is significant and determines the priority of subtable
/// selection (greater is better). The explicit discriminants match that
/// priority, and the derived comparison traits follow declaration order, so
/// `Symbol > UnicodeFull > UnicodeBmp > None`.
///
/// The order is total, so `Eq` and `Ord` are derived alongside the partial
/// traits. `Debug` is derived so values can appear in assertions and
/// diagnostics.
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
enum MappingKind {
    /// No subtable has been selected.
    None = 0,
    /// Unicode subtable covering only the Basic Multilingual Plane.
    UnicodeBmp = 1,
    /// Unicode subtable covering the full repertoire.
    UnicodeFull = 2,
    /// Symbol mapping subtable; preferred over every Unicode subtable.
    Symbol = 3,
}
294
295 /// The result of searching the cmap table for the "best" available
296 /// subtables.
297 ///
298 /// For `codepoint_subtable`, best means either symbol (which is preferred)
299 /// or a Unicode subtable with the greatest coverage.
300 ///
301 /// For `variant_subtable`, best means a format 14 subtable.
302 struct MappingSelection<'a> {
303 /// The mapping index accelerator that holds indices of the following
304 /// subtables.
305 mapping_index: MappingIndex,
306 /// Either a symbol subtable or the Unicode subtable with the
307 /// greatest coverage.
308 codepoint_subtable: Option<SupportedSubtable<'a>>,
309 /// Subtable that supports mapping Unicode variation sequences.
310 variant_subtable: Option<Cmap14<'a>>,
311 }
312
313 impl<'a> MappingSelection<'a> {
new(cmap: &Cmap<'a>) -> Self314 fn new(cmap: &Cmap<'a>) -> Self {
315 const ENCODING_MS_SYMBOL: u16 = 0;
316 const ENCODING_MS_UNICODE_CS: u16 = 1;
317 const ENCODING_APPLE_ID_UNICODE_32: u16 = 4;
318 const ENCODING_APPLE_ID_VARIANT_SELECTOR: u16 = 5;
319 const ENCODING_MS_ID_UCS_4: u16 = 10;
320 let mut mapping_index = MappingIndex::default();
321 let mut mapping_kind = MappingKind::None;
322 let mut codepoint_subtable = None;
323 let mut variant_subtable = None;
324 let mut maybe_choose_subtable = |kind, index, subtable| {
325 if kind > mapping_kind {
326 mapping_kind = kind;
327 mapping_index.codepoint_subtable_is_symbol = kind == MappingKind::Symbol;
328 mapping_index.codepoint_subtable = Some(index as u16);
329 codepoint_subtable = Some(subtable);
330 }
331 };
332 // This generally follows the same strategy as FreeType, searching the encoding
333 // records in reverse and prioritizing UCS-4 subtables over UCS-2.
334 // See <https://gitlab.freedesktop.org/freetype/freetype/-/blob/ac5babe87629107c43f627e2cd17c6cf4f2ecd43/src/base/ftobjs.c#L1370>
335 // The exception is that we prefer a symbol subtable over all others which matches the behavior
336 // of HarfBuzz.
337 // See <https://github.com/harfbuzz/harfbuzz/blob/453ded05392af38bba9f89587edce465e86ffa6b/src/hb-ot-cmap-table.hh#L1818>
338 for (i, record) in cmap.encoding_records().iter().enumerate().rev() {
339 match (record.platform_id(), record.encoding_id()) {
340 (PlatformId::Unicode, ENCODING_APPLE_ID_VARIANT_SELECTOR) => {
341 // Unicode variation sequences
342 if let Ok(CmapSubtable::Format14(subtable)) =
343 record.subtable(cmap.offset_data())
344 {
345 if variant_subtable.is_none() {
346 mapping_index.variant_subtable = Some(i as u16);
347 variant_subtable = Some(subtable);
348 }
349 }
350 }
351 (PlatformId::Windows, ENCODING_MS_SYMBOL) => {
352 // Symbol
353 if let Some(subtable) = SupportedSubtable::from_cmap_record(cmap, record) {
354 maybe_choose_subtable(MappingKind::Symbol, i, subtable);
355 }
356 }
357 (PlatformId::Windows, ENCODING_MS_ID_UCS_4)
358 | (PlatformId::Unicode, ENCODING_APPLE_ID_UNICODE_32) => {
359 // Unicode full repertoire
360 if let Some(subtable) = SupportedSubtable::from_cmap_record(cmap, record) {
361 maybe_choose_subtable(MappingKind::UnicodeFull, i, subtable);
362 }
363 }
364 (PlatformId::ISO, _)
365 | (PlatformId::Unicode, _)
366 | (PlatformId::Windows, ENCODING_MS_UNICODE_CS) => {
367 // Unicode BMP only
368 if let Some(subtable) = SupportedSubtable::from_cmap_record(cmap, record) {
369 maybe_choose_subtable(MappingKind::UnicodeBmp, i, subtable);
370 }
371 }
372 _ => {}
373 }
374 }
375 Self {
376 mapping_index,
377 codepoint_subtable,
378 variant_subtable,
379 }
380 }
381 }
382
#[cfg(test)]
mod tests {
    use super::*;
    use crate::MetadataProvider;
    use read_fonts::FontRef;

    #[test]
    fn choose_format_12_over_4() {
        let font = FontRef::new(font_test_data::CMAP12_FONT1).unwrap();
        let CodepointSubtable { subtable, .. } = font.charmap().codepoint_subtable.unwrap();
        assert!(matches!(subtable, SupportedSubtable::Format12(..)));
    }

    #[test]
    fn choose_format_4() {
        let font = FontRef::new(font_test_data::VAZIRMATN_VAR).unwrap();
        let CodepointSubtable { subtable, .. } = font.charmap().codepoint_subtable.unwrap();
        assert!(matches!(subtable, SupportedSubtable::Format4(..)));
    }

    #[test]
    fn choose_symbol() {
        let font = FontRef::new(font_test_data::CMAP4_SYMBOL_PUA).unwrap();
        let charmap = font.charmap();
        assert!(charmap.is_symbol());
        let CodepointSubtable { subtable, .. } = charmap.codepoint_subtable.unwrap();
        assert!(matches!(subtable, SupportedSubtable::Format4(..)));
    }

    #[test]
    fn map_format_4() {
        let font = FontRef::new(font_test_data::VAZIRMATN_VAR).unwrap();
        let charmap = font.charmap();
        let expected = [('A', Some(1)), ('À', Some(2)), ('`', Some(3)), ('B', None)];
        for (ch, glyph) in expected {
            assert_eq!(charmap.map(ch), glyph.map(GlyphId::new));
        }
    }

    #[test]
    fn map_format_12() {
        let font = FontRef::new(font_test_data::CMAP12_FONT1).unwrap();
        let charmap = font.charmap();
        assert_eq!(charmap.map(' '), None);
        let expected = [
            (0x101723_u32, 1),
            (0x101725, 3),
            (0x102523, 6),
            (0x102526, 9),
            (0x102527, 10),
        ];
        for (codepoint, glyph) in expected {
            assert_eq!(charmap.map(codepoint), Some(GlyphId::new(glyph)));
        }
    }

    #[test]
    fn map_symbol_pua() {
        let font = FontRef::new(font_test_data::CMAP4_SYMBOL_PUA).unwrap();
        let charmap = font.charmap();
        assert!(charmap.codepoint_subtable.as_ref().unwrap().is_symbol);
        // Offsets within the U+F000..F0FF range and the glyph each maps to.
        let expected = [(0x01_u32, 1), (0x02, 2), (0x03, 3), (0xFE, 4)];
        for (offset, glyph) in expected {
            assert_eq!(charmap.map(0xF000 + offset), Some(GlyphId::new(glyph)));
        }
        // The following don't exist in the cmap table and are remapped into the U+F000..F0FF range
        // due to the selection of a symbol mapping subtable.
        for (offset, glyph) in expected {
            assert_eq!(charmap.map(offset), Some(GlyphId::new(glyph)));
        }
    }

    #[test]
    fn map_variants() {
        use super::MapVariant::*;
        let font = FontRef::new(font_test_data::CMAP14_FONT1).unwrap();
        let charmap = font.charmap();
        let selector = '\u{e0100}';
        let expected = [
            ('a', None),
            ('\u{4e00}', Some(UseDefault)),
            ('\u{4e06}', Some(UseDefault)),
            ('\u{4e08}', Some(Variant(GlyphId::new(25)))),
            ('\u{4e09}', Some(Variant(GlyphId::new(26)))),
        ];
        for (ch, variant) in expected {
            assert_eq!(charmap.map_variant(ch, selector), variant);
        }
    }

    #[test]
    fn mappings() {
        let fonts = [
            font_test_data::VAZIRMATN_VAR,
            font_test_data::CMAP12_FONT1,
            font_test_data::SIMPLE_GLYF,
            font_test_data::CMAP4_SYMBOL_PUA,
        ];
        for font_data in fonts {
            let font = FontRef::new(font_data).unwrap();
            let charmap = font.charmap();
            // Every enumerated pair must agree with a direct lookup.
            charmap.mappings().for_each(|(codepoint, glyph_id)| {
                assert_eq!(charmap.map(codepoint), Some(glyph_id));
            });
        }
    }

    #[test]
    fn variant_mappings() {
        let font = FontRef::new(font_test_data::CMAP14_FONT1).unwrap();
        let charmap = font.charmap();
        // Every enumerated triple must agree with a direct lookup.
        charmap
            .variant_mappings()
            .for_each(|(codepoint, selector, variant)| {
                assert_eq!(charmap.map_variant(codepoint, selector), Some(variant));
            });
    }
}
503