xref: /aosp_15_r20/external/cronet/third_party/rust/chromium_crates_io/vendor/skrifa-0.15.5/src/charmap.rs (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 //! Mapping of characters to nominal glyph identifiers.
2 //!
3 //! The functionality in this module provides a 1-to-1 mapping from Unicode
4 //! characters (or [Unicode variation sequences](http://unicode.org/faq/vs.html)) to
5 //! nominal or "default" internal glyph identifiers for a given font.
6 //! This is a necessary first step, but generally insufficient for proper layout of
7 //! [complex text](https://en.wikipedia.org/wiki/Complex_text_layout) or even
8 //! simple text containing diacritics and ligatures.
9 //!
10 //! Comprehensive mapping of characters to positioned glyphs requires a process called
11 //! shaping. For more detail, see: [Why do I need a shaping engine?](https://harfbuzz.github.io/why-do-i-need-a-shaping-engine.html)
12 
13 use read_fonts::{
14     tables::cmap::{
15         self, Cmap, Cmap12, Cmap12Iter, Cmap14, Cmap14Iter, Cmap4, Cmap4Iter, CmapSubtable,
16         EncodingRecord, PlatformId,
17     },
18     types::GlyphId,
19     FontData, TableProvider,
20 };
21 
22 pub use read_fonts::tables::cmap::MapVariant;
23 
/// Mapping of characters to nominal glyph identifiers.
///
/// The mappings are derived from the [cmap](https://learn.microsoft.com/en-us/typography/opentype/spec/cmap)
/// table.
///
/// ## Selection strategy
///
/// Fonts may contain multiple subtables in various formats supporting different encodings. The selection
/// strategy implemented here is designed to choose mappings that capture the broadest available Unicode
/// coverage:
///
/// * Unicode characters: a symbol mapping subtable is selected if available. Otherwise, subtables supporting
///   the Unicode full repertoire or Basic Multilingual Plane (BMP) are preferred, in that order. Formats
///   [4](https://learn.microsoft.com/en-us/typography/opentype/spec/cmap#format-4-segment-mapping-to-delta-values)
///   and [12](https://learn.microsoft.com/en-us/typography/opentype/spec/cmap#format-12-segmented-coverage) are
///   supported.
///
/// * Unicode variation sequences: these are provided by a format
///   [14](https://learn.microsoft.com/en-us/typography/opentype/spec/cmap#format-14-unicode-variation-sequences)
///   subtable.
///
/// The `Default` value holds no subtables, so every lookup on it returns
/// `None` and every iterator it produces is empty.
#[derive(Clone, Default)]
pub struct Charmap<'a> {
    /// Selected symbol or Unicode subtable used for plain codepoint
    /// lookups, or `None` if no supported subtable was found.
    codepoint_subtable: Option<CodepointSubtable<'a>>,
    /// Selected format 14 subtable used for variation sequence lookups,
    /// or `None` if the font does not provide one.
    variant_subtable: Option<Cmap14<'a>>,
}
50 
51 impl<'a> Charmap<'a> {
52     /// Creates a new character map from the given font.
new(font: &impl TableProvider<'a>) -> Self53     pub fn new(font: &impl TableProvider<'a>) -> Self {
54         let Ok(cmap) = font.cmap() else {
55             return Default::default();
56         };
57         let selection = MappingSelection::new(&cmap);
58         Self {
59             codepoint_subtable: selection
60                 .codepoint_subtable
61                 .map(|subtable| CodepointSubtable {
62                     subtable,
63                     is_symbol: selection.mapping_index.codepoint_subtable_is_symbol,
64                 }),
65             variant_subtable: selection.variant_subtable,
66         }
67     }
68 
69     /// Returns true if a suitable Unicode character mapping is available.
has_map(&self) -> bool70     pub fn has_map(&self) -> bool {
71         self.codepoint_subtable.is_some()
72     }
73 
74     /// Returns true if a symbol mapping was selected.
is_symbol(&self) -> bool75     pub fn is_symbol(&self) -> bool {
76         self.codepoint_subtable
77             .as_ref()
78             .map(|x| x.is_symbol)
79             .unwrap_or(false)
80     }
81 
82     /// Returns true if a Unicode variation sequence mapping is available.
has_variant_map(&self) -> bool83     pub fn has_variant_map(&self) -> bool {
84         self.variant_subtable.is_some()
85     }
86 
87     /// Maps a character to a nominal glyph identifier.
88     ///
89     /// Returns `None` if a mapping does not exist.
map(&self, ch: impl Into<u32>) -> Option<GlyphId>90     pub fn map(&self, ch: impl Into<u32>) -> Option<GlyphId> {
91         self.codepoint_subtable.as_ref()?.map(ch.into())
92     }
93 
94     /// Returns an iterator over all mappings of codepoint to nominal glyph
95     /// identifiers in the character map.
mappings(&self) -> Mappings<'a>96     pub fn mappings(&self) -> Mappings<'a> {
97         self.codepoint_subtable
98             .as_ref()
99             .map(|subtable| {
100                 Mappings(match &subtable.subtable {
101                     SupportedSubtable::Format4(cmap4) => MappingsInner::Format4(cmap4.iter()),
102                     SupportedSubtable::Format12(cmap12) => MappingsInner::Format12(cmap12.iter()),
103                 })
104             })
105             .unwrap_or(Mappings(MappingsInner::None))
106     }
107 
108     /// Maps a character and variation selector to a nominal glyph identifier.
109     ///
110     /// Returns `None` if a mapping does not exist.
map_variant(&self, ch: impl Into<u32>, selector: impl Into<u32>) -> Option<MapVariant>111     pub fn map_variant(&self, ch: impl Into<u32>, selector: impl Into<u32>) -> Option<MapVariant> {
112         self.variant_subtable.as_ref()?.map_variant(ch, selector)
113     }
114 
115     /// Returns an iterator over all mappings of character and variation
116     /// selector to nominal glyph identifier in the character map.
variant_mappings(&self) -> VariantMappings<'a>117     pub fn variant_mappings(&self) -> VariantMappings<'a> {
118         VariantMappings(self.variant_subtable.clone().map(|cmap14| cmap14.iter()))
119     }
120 }
121 
/// Cacheable indices of selected mapping tables for materializing a character
/// map.
///
/// Since [`Charmap`] carries a lifetime, it is difficult to store in a cache.
/// This type serves as an acceleration structure that allows for construction
/// of a character map while skipping the search for the most suitable Unicode
/// mappings.
///
/// The stored values are positions in the `cmap` table's encoding record
/// array. The `Default` value selects no subtables.
#[derive(Copy, Clone, Default, Debug)]
pub struct MappingIndex {
    /// Index of Unicode or symbol mapping subtable.
    codepoint_subtable: Option<u16>,
    /// True if the above is a symbol mapping.
    codepoint_subtable_is_symbol: bool,
    /// Index of Unicode variation selector subtable.
    variant_subtable: Option<u16>,
}
138 
139 impl MappingIndex {
140     /// Finds the indices of the most suitable Unicode mapping tables in the
141     /// given font.
new<'a>(font: &impl TableProvider<'a>) -> Self142     pub fn new<'a>(font: &impl TableProvider<'a>) -> Self {
143         let Ok(cmap) = font.cmap() else {
144             return Default::default();
145         };
146         MappingSelection::new(&cmap).mapping_index
147     }
148 
149     /// Creates a new character map for the given font using the tables referenced by
150     /// the precomputed indices.
151     ///
152     /// The font should be the same as the one used to construct this object.
charmap<'a>(&self, font: &impl TableProvider<'a>) -> Charmap<'a>153     pub fn charmap<'a>(&self, font: &impl TableProvider<'a>) -> Charmap<'a> {
154         let Ok(cmap) = font.cmap() else {
155             return Default::default();
156         };
157         let records = cmap.encoding_records();
158         let data = cmap.offset_data();
159         Charmap {
160             codepoint_subtable: self
161                 .codepoint_subtable
162                 .and_then(|index| get_subtable(data, records, index))
163                 .and_then(SupportedSubtable::new)
164                 .map(|subtable| CodepointSubtable {
165                     subtable,
166                     is_symbol: self.codepoint_subtable_is_symbol,
167                 }),
168             variant_subtable: self
169                 .variant_subtable
170                 .and_then(|index| get_subtable(data, records, index))
171                 .and_then(|subtable| match subtable {
172                     CmapSubtable::Format14(cmap14) => Some(cmap14),
173                     _ => None,
174                 }),
175         }
176     }
177 }
178 
/// Iterator over all mappings of character to nominal glyph identifier
/// in a character map.
///
/// Each item is a `(codepoint, glyph identifier)` pair.
///
/// This is created with the [`Charmap::mappings`] method.
#[derive(Clone)]
pub struct Mappings<'a>(MappingsInner<'a>);
185 
186 impl<'a> Iterator for Mappings<'a> {
187     type Item = (u32, GlyphId);
188 
next(&mut self) -> Option<Self::Item>189     fn next(&mut self) -> Option<Self::Item> {
190         match &mut self.0 {
191             MappingsInner::None => None,
192             MappingsInner::Format4(iter) => iter.next(),
193             MappingsInner::Format12(iter) => iter.next(),
194         }
195     }
196 }
197 
/// Format-specific iterator state backing [`Mappings`].
#[derive(Clone)]
enum MappingsInner<'a> {
    /// No codepoint subtable is available; iteration yields nothing.
    None,
    /// Iterates the mappings of a format 4 subtable.
    Format4(Cmap4Iter<'a>),
    /// Iterates the mappings of a format 12 subtable.
    Format12(Cmap12Iter<'a>),
}
204 
/// Iterator over all mappings of character and variation selector to
/// nominal glyph identifier in a character map.
///
/// Each item is a `(codepoint, variation selector, mapping)` triple. The
/// wrapped iterator is `None` when the character map has no variation
/// sequence subtable, in which case nothing is yielded.
///
/// This is created with the [`Charmap::variant_mappings`] method.
#[derive(Clone)]
pub struct VariantMappings<'a>(Option<Cmap14Iter<'a>>);
211 
212 impl<'a> Iterator for VariantMappings<'a> {
213     type Item = (u32, u32, MapVariant);
214 
next(&mut self) -> Option<Self::Item>215     fn next(&mut self) -> Option<Self::Item> {
216         self.0.as_mut()?.next()
217     }
218 }
219 
get_subtable<'a>( data: FontData<'a>, records: &[EncodingRecord], index: u16, ) -> Option<CmapSubtable<'a>>220 fn get_subtable<'a>(
221     data: FontData<'a>,
222     records: &[EncodingRecord],
223     index: u16,
224 ) -> Option<CmapSubtable<'a>> {
225     records
226         .get(index as usize)
227         .and_then(|record| record.subtable(data).ok())
228 }
229 
/// A selected codepoint mapping subtable together with a flag recording
/// whether it was chosen as a symbol mapping.
#[derive(Clone)]
struct CodepointSubtable<'a> {
    /// The underlying format 4 or format 12 subtable.
    subtable: SupportedSubtable<'a>,
    /// True if the subtable is a symbol mapping.
    is_symbol: bool,
}
236 
237 impl<'a> CodepointSubtable<'a> {
map(&self, codepoint: u32) -> Option<GlyphId>238     fn map(&self, codepoint: u32) -> Option<GlyphId> {
239         self.map_impl(codepoint).or_else(|| {
240             if self.is_symbol && codepoint <= 0x00FF {
241                 // From HarfBuzz:
242                 // For symbol-encoded OpenType fonts, we duplicate the
243                 // U+F000..F0FF range at U+0000..U+00FF.  That's what
244                 // Windows seems to do, and that's hinted about at:
245                 // https://docs.microsoft.com/en-us/typography/opentype/spec/recom
246                 // under "Non-Standard (Symbol) Fonts".
247                 // See <https://github.com/harfbuzz/harfbuzz/blob/453ded05392af38bba9f89587edce465e86ffa6b/src/hb-ot-cmap-table.hh#L1595>
248                 self.map_impl(codepoint + 0xF000)
249             } else {
250                 None
251             }
252         })
253     }
254 
map_impl(&self, codepoint: u32) -> Option<GlyphId>255     fn map_impl(&self, codepoint: u32) -> Option<GlyphId> {
256         match &self.subtable {
257             SupportedSubtable::Format4(subtable) => subtable.map_codepoint(codepoint),
258             SupportedSubtable::Format12(subtable) => subtable.map_codepoint(codepoint),
259         }
260     }
261 }
262 
/// The `cmap` subtable formats that can be used for codepoint lookups.
#[derive(Clone)]
enum SupportedSubtable<'a> {
    /// A format 4 subtable.
    Format4(Cmap4<'a>),
    /// A format 12 subtable.
    Format12(Cmap12<'a>),
}
268 
269 impl<'a> SupportedSubtable<'a> {
new(subtable: CmapSubtable<'a>) -> Option<Self>270     fn new(subtable: CmapSubtable<'a>) -> Option<Self> {
271         Some(match subtable {
272             CmapSubtable::Format4(cmap4) => Self::Format4(cmap4),
273             CmapSubtable::Format12(cmap12) => Self::Format12(cmap12),
274             _ => return None,
275         })
276     }
277 
from_cmap_record(cmap: &Cmap<'a>, record: &cmap::EncodingRecord) -> Option<Self>278     fn from_cmap_record(cmap: &Cmap<'a>, record: &cmap::EncodingRecord) -> Option<Self> {
279         Self::new(record.subtable(cmap.offset_data()).ok()?)
280     }
281 }
282 
/// The mapping kind of a cmap subtable.
///
/// The ordering is significant and determines the priority of subtable
/// selection (greater is better). The ordering is total, so `Eq` and `Ord`
/// are derived alongside the partial variants; `Debug` is derived so values
/// can appear in assertion and diagnostic output.
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
enum MappingKind {
    /// No subtable has been selected yet; lowest priority.
    None = 0,
    /// Subtable covering the Unicode Basic Multilingual Plane only.
    UnicodeBmp = 1,
    /// Subtable covering the full Unicode repertoire.
    UnicodeFull = 2,
    /// Symbol mapping subtable; highest priority.
    Symbol = 3,
}
294 
/// The result of searching the cmap table for the "best" available
/// subtables.
///
/// For `codepoint_subtable`, best means either symbol (which is preferred)
/// or a Unicode subtable with the greatest coverage.
///
/// For `variant_subtable`, best means a format 14 subtable.
///
/// Both the parsed subtables and the encoding record indices they were
/// loaded from are kept, so a caller can use either.
struct MappingSelection<'a> {
    /// The mapping index accelerator that holds indices of the following
    /// subtables.
    mapping_index: MappingIndex,
    /// Either a symbol subtable or the Unicode subtable with the
    /// greatest coverage.
    codepoint_subtable: Option<SupportedSubtable<'a>>,
    /// Subtable that supports mapping Unicode variation sequences.
    variant_subtable: Option<Cmap14<'a>>,
}
312 
313 impl<'a> MappingSelection<'a> {
new(cmap: &Cmap<'a>) -> Self314     fn new(cmap: &Cmap<'a>) -> Self {
315         const ENCODING_MS_SYMBOL: u16 = 0;
316         const ENCODING_MS_UNICODE_CS: u16 = 1;
317         const ENCODING_APPLE_ID_UNICODE_32: u16 = 4;
318         const ENCODING_APPLE_ID_VARIANT_SELECTOR: u16 = 5;
319         const ENCODING_MS_ID_UCS_4: u16 = 10;
320         let mut mapping_index = MappingIndex::default();
321         let mut mapping_kind = MappingKind::None;
322         let mut codepoint_subtable = None;
323         let mut variant_subtable = None;
324         let mut maybe_choose_subtable = |kind, index, subtable| {
325             if kind > mapping_kind {
326                 mapping_kind = kind;
327                 mapping_index.codepoint_subtable_is_symbol = kind == MappingKind::Symbol;
328                 mapping_index.codepoint_subtable = Some(index as u16);
329                 codepoint_subtable = Some(subtable);
330             }
331         };
332         // This generally follows the same strategy as FreeType, searching the encoding
333         // records in reverse and prioritizing UCS-4 subtables over UCS-2.
334         // See <https://gitlab.freedesktop.org/freetype/freetype/-/blob/ac5babe87629107c43f627e2cd17c6cf4f2ecd43/src/base/ftobjs.c#L1370>
335         // The exception is that we prefer a symbol subtable over all others which matches the behavior
336         // of HarfBuzz.
337         // See <https://github.com/harfbuzz/harfbuzz/blob/453ded05392af38bba9f89587edce465e86ffa6b/src/hb-ot-cmap-table.hh#L1818>
338         for (i, record) in cmap.encoding_records().iter().enumerate().rev() {
339             match (record.platform_id(), record.encoding_id()) {
340                 (PlatformId::Unicode, ENCODING_APPLE_ID_VARIANT_SELECTOR) => {
341                     // Unicode variation sequences
342                     if let Ok(CmapSubtable::Format14(subtable)) =
343                         record.subtable(cmap.offset_data())
344                     {
345                         if variant_subtable.is_none() {
346                             mapping_index.variant_subtable = Some(i as u16);
347                             variant_subtable = Some(subtable);
348                         }
349                     }
350                 }
351                 (PlatformId::Windows, ENCODING_MS_SYMBOL) => {
352                     // Symbol
353                     if let Some(subtable) = SupportedSubtable::from_cmap_record(cmap, record) {
354                         maybe_choose_subtable(MappingKind::Symbol, i, subtable);
355                     }
356                 }
357                 (PlatformId::Windows, ENCODING_MS_ID_UCS_4)
358                 | (PlatformId::Unicode, ENCODING_APPLE_ID_UNICODE_32) => {
359                     // Unicode full repertoire
360                     if let Some(subtable) = SupportedSubtable::from_cmap_record(cmap, record) {
361                         maybe_choose_subtable(MappingKind::UnicodeFull, i, subtable);
362                     }
363                 }
364                 (PlatformId::ISO, _)
365                 | (PlatformId::Unicode, _)
366                 | (PlatformId::Windows, ENCODING_MS_UNICODE_CS) => {
367                     // Unicode BMP only
368                     if let Some(subtable) = SupportedSubtable::from_cmap_record(cmap, record) {
369                         maybe_choose_subtable(MappingKind::UnicodeBmp, i, subtable);
370                     }
371                 }
372                 _ => {}
373             }
374         }
375         Self {
376             mapping_index,
377             codepoint_subtable,
378             variant_subtable,
379         }
380     }
381 }
382 
#[cfg(test)]
mod tests {
    use super::*;
    use crate::MetadataProvider;
    use read_fonts::FontRef;

    /// A font with both format 4 and format 12 subtables selects format 12.
    #[test]
    fn choose_format_12_over_4() {
        let font = FontRef::new(font_test_data::CMAP12_FONT1).unwrap();
        let charmap = font.charmap();
        let selected = charmap.codepoint_subtable.unwrap();
        assert!(matches!(selected.subtable, SupportedSubtable::Format12(..)));
    }

    /// A font with only a format 4 subtable selects it.
    #[test]
    fn choose_format_4() {
        let font = FontRef::new(font_test_data::VAZIRMATN_VAR).unwrap();
        let charmap = font.charmap();
        let selected = charmap.codepoint_subtable.unwrap();
        assert!(matches!(selected.subtable, SupportedSubtable::Format4(..)));
    }

    /// A symbol subtable is selected and flagged as such.
    #[test]
    fn choose_symbol() {
        let font = FontRef::new(font_test_data::CMAP4_SYMBOL_PUA).unwrap();
        let charmap = font.charmap();
        assert!(charmap.is_symbol());
        let selected = charmap.codepoint_subtable.unwrap();
        assert!(matches!(selected.subtable, SupportedSubtable::Format4(..)));
    }

    #[test]
    fn map_format_4() {
        let font = FontRef::new(font_test_data::VAZIRMATN_VAR).unwrap();
        let charmap = font.charmap();
        let expectations = [
            ('A', Some(GlyphId::new(1))),
            ('À', Some(GlyphId::new(2))),
            ('`', Some(GlyphId::new(3))),
            ('B', None),
        ];
        for (ch, expected) in expectations {
            assert_eq!(charmap.map(ch), expected);
        }
    }

    #[test]
    fn map_format_12() {
        let font = FontRef::new(font_test_data::CMAP12_FONT1).unwrap();
        let charmap = font.charmap();
        assert_eq!(charmap.map(' '), None);
        let expectations = [
            (0x101723_u32, GlyphId::new(1)),
            (0x101725_u32, GlyphId::new(3)),
            (0x102523_u32, GlyphId::new(6)),
            (0x102526_u32, GlyphId::new(9)),
            (0x102527_u32, GlyphId::new(10)),
        ];
        for (codepoint, glyph_id) in expectations {
            assert_eq!(charmap.map(codepoint), Some(glyph_id));
        }
    }

    #[test]
    fn map_symbol_pua() {
        let font = FontRef::new(font_test_data::CMAP4_SYMBOL_PUA).unwrap();
        let charmap = font.charmap();
        assert!(charmap.codepoint_subtable.as_ref().unwrap().is_symbol);
        let expectations = [
            (0xF001_u32, GlyphId::new(1)),
            (0xF002_u32, GlyphId::new(2)),
            (0xF003_u32, GlyphId::new(3)),
            (0xF0FE_u32, GlyphId::new(4)),
            // The following don't exist in the cmap table and are remapped into the U+F000..F0FF range
            // due to the selection of a symbol mapping subtable.
            (0x1_u32, GlyphId::new(1)),
            (0x2_u32, GlyphId::new(2)),
            (0x3_u32, GlyphId::new(3)),
            (0xFE_u32, GlyphId::new(4)),
        ];
        for (codepoint, glyph_id) in expectations {
            assert_eq!(charmap.map(codepoint), Some(glyph_id));
        }
    }

    #[test]
    fn map_variants() {
        use super::MapVariant::*;
        let font = FontRef::new(font_test_data::CMAP14_FONT1).unwrap();
        let charmap = font.charmap();
        let selector = '\u{e0100}';
        let expectations = [
            ('a', None),
            ('\u{4e00}', Some(UseDefault)),
            ('\u{4e06}', Some(UseDefault)),
            ('\u{4e08}', Some(Variant(GlyphId::new(25)))),
            ('\u{4e09}', Some(Variant(GlyphId::new(26)))),
        ];
        for (ch, expected) in expectations {
            assert_eq!(charmap.map_variant(ch, selector), expected);
        }
    }

    /// Every pair produced by the iterator must agree with a direct lookup.
    #[test]
    fn mappings() {
        let fonts = [
            font_test_data::VAZIRMATN_VAR,
            font_test_data::CMAP12_FONT1,
            font_test_data::SIMPLE_GLYF,
            font_test_data::CMAP4_SYMBOL_PUA,
        ];
        for font_data in fonts {
            let font = FontRef::new(font_data).unwrap();
            let charmap = font.charmap();
            for (codepoint, glyph_id) in charmap.mappings() {
                assert_eq!(charmap.map(codepoint), Some(glyph_id));
            }
        }
    }

    /// Every triple produced by the iterator must agree with a direct lookup.
    #[test]
    fn variant_mappings() {
        let font = FontRef::new(font_test_data::CMAP14_FONT1).unwrap();
        let charmap = font.charmap();
        for (codepoint, selector, variant) in charmap.variant_mappings() {
            assert_eq!(charmap.map_variant(codepoint, selector), Some(variant));
        }
    }
}
503