xref: /aosp_15_r20/external/fonttools/Lib/fontTools/ttLib/tables/_c_m_a_p.py (revision e1fe3e4ad2793916b15cccdc4a7da52a7e1dd0e9)
1from fontTools.misc.textTools import bytesjoin, safeEval, readHex
2from fontTools.misc.encodingTools import getEncoding
3from fontTools.ttLib import getSearchRange
4from fontTools.unicode import Unicode
5from . import DefaultTable
6import sys
7import struct
8import array
9import logging
10
11
# Module-level logger; table__c_m_a_p.decompile() uses it to report subtables
# whose header claims a zero length.
log = logging.getLogger(__name__)
13
14
15def _make_map(font, chars, gids):
16    assert len(chars) == len(gids)
17    glyphNames = font.getGlyphNameMany(gids)
18    cmap = {}
19    for char, gid, name in zip(chars, gids, glyphNames):
20        if gid == 0:
21            continue
22        cmap[char] = name
23    return cmap
24
25
26class table__c_m_a_p(DefaultTable.DefaultTable):
27    """Character to Glyph Index Mapping Table
28
29    This class represents the `cmap <https://docs.microsoft.com/en-us/typography/opentype/spec/cmap>`_
30    table, which maps between input characters (in Unicode or other system encodings)
31    and glyphs within the font. The ``cmap`` table contains one or more subtables
    which determine the mapping of characters to glyphs across different platforms
33    and encoding systems.
34
35    ``table__c_m_a_p`` objects expose an accessor ``.tables`` which provides access
36    to the subtables, although it is normally easier to retrieve individual subtables
37    through the utility methods described below. To add new subtables to a font,
38    first determine the subtable format (if in doubt use format 4 for glyphs within
39    the BMP, format 12 for glyphs outside the BMP, and format 14 for Unicode Variation
40    Sequences) construct subtable objects with ``CmapSubtable.newSubtable(format)``,
41    and append them to the ``.tables`` list.
42
43    Within a subtable, the mapping of characters to glyphs is provided by the ``.cmap``
44    attribute.
45
46    Example::
47
48            cmap4_0_3 = CmapSubtable.newSubtable(4)
49            cmap4_0_3.platformID = 0
50            cmap4_0_3.platEncID = 3
51            cmap4_0_3.language = 0
52            cmap4_0_3.cmap = { 0xC1: "Aacute" }
53
54            cmap = newTable("cmap")
55            cmap.tableVersion = 0
56            cmap.tables = [cmap4_0_3]
57    """
58
59    def getcmap(self, platformID, platEncID):
60        """Returns the first subtable which matches the given platform and encoding.
61
62        Args:
63                platformID (int): The platform ID. Use 0 for Unicode, 1 for Macintosh
64                        (deprecated for new fonts), 2 for ISO (deprecated) and 3 for Windows.
65                encodingID (int): Encoding ID. Interpretation depends on the platform ID.
66                        See the OpenType specification for details.
67
68        Returns:
69                An object which is a subclass of :py:class:`CmapSubtable` if a matching
70                subtable is found within the font, or ``None`` otherwise.
71        """
72
73        for subtable in self.tables:
74            if subtable.platformID == platformID and subtable.platEncID == platEncID:
75                return subtable
76        return None  # not found
77
78    def getBestCmap(
79        self,
80        cmapPreferences=(
81            (3, 10),
82            (0, 6),
83            (0, 4),
84            (3, 1),
85            (0, 3),
86            (0, 2),
87            (0, 1),
88            (0, 0),
89        ),
90    ):
91        """Returns the 'best' Unicode cmap dictionary available in the font
92        or ``None``, if no Unicode cmap subtable is available.
93
94        By default it will search for the following (platformID, platEncID)
95        pairs in order::
96
97                        (3, 10), # Windows Unicode full repertoire
98                        (0, 6),  # Unicode full repertoire (format 13 subtable)
99                        (0, 4),  # Unicode 2.0 full repertoire
100                        (3, 1),  # Windows Unicode BMP
101                        (0, 3),  # Unicode 2.0 BMP
102                        (0, 2),  # Unicode ISO/IEC 10646
103                        (0, 1),  # Unicode 1.1
104                        (0, 0)   # Unicode 1.0
105
106        This particular order matches what HarfBuzz uses to choose what
107        subtable to use by default. This order prefers the largest-repertoire
108        subtable, and among those, prefers the Windows-platform over the
109        Unicode-platform as the former has wider support.
110
111        This order can be customized via the ``cmapPreferences`` argument.
112        """
113        for platformID, platEncID in cmapPreferences:
114            cmapSubtable = self.getcmap(platformID, platEncID)
115            if cmapSubtable is not None:
116                return cmapSubtable.cmap
117        return None  # None of the requested cmap subtables were found
118
119    def buildReversed(self):
120        """Builds a reverse mapping dictionary
121
122        Iterates over all Unicode cmap tables and returns a dictionary mapping
123        glyphs to sets of codepoints, such as::
124
125                {
126                        'one': {0x31}
127                        'A': {0x41,0x391}
128                }
129
130        The values are sets of Unicode codepoints because
131        some fonts map different codepoints to the same glyph.
132        For example, ``U+0041 LATIN CAPITAL LETTER A`` and ``U+0391
133        GREEK CAPITAL LETTER ALPHA`` are sometimes the same glyph.
134        """
135        result = {}
136        for subtable in self.tables:
137            if subtable.isUnicode():
138                for codepoint, name in subtable.cmap.items():
139                    result.setdefault(name, set()).add(codepoint)
140        return result
141
142    def decompile(self, data, ttFont):
143        tableVersion, numSubTables = struct.unpack(">HH", data[:4])
144        self.tableVersion = int(tableVersion)
145        self.tables = tables = []
146        seenOffsets = {}
147        for i in range(numSubTables):
148            platformID, platEncID, offset = struct.unpack(
149                ">HHl", data[4 + i * 8 : 4 + (i + 1) * 8]
150            )
151            platformID, platEncID = int(platformID), int(platEncID)
152            format, length = struct.unpack(">HH", data[offset : offset + 4])
153            if format in [8, 10, 12, 13]:
154                format, reserved, length = struct.unpack(
155                    ">HHL", data[offset : offset + 8]
156                )
157            elif format in [14]:
158                format, length = struct.unpack(">HL", data[offset : offset + 6])
159
160            if not length:
161                log.error(
162                    "cmap subtable is reported as having zero length: platformID %s, "
163                    "platEncID %s, format %s offset %s. Skipping table.",
164                    platformID,
165                    platEncID,
166                    format,
167                    offset,
168                )
169                continue
170            table = CmapSubtable.newSubtable(format)
171            table.platformID = platformID
172            table.platEncID = platEncID
173            # Note that by default we decompile only the subtable header info;
174            # any other data gets decompiled only when an attribute of the
175            # subtable is referenced.
176            table.decompileHeader(data[offset : offset + int(length)], ttFont)
177            if offset in seenOffsets:
178                table.data = None  # Mark as decompiled
179                table.cmap = tables[seenOffsets[offset]].cmap
180            else:
181                seenOffsets[offset] = i
182            tables.append(table)
183        if ttFont.lazy is False:  # Be lazy for None and True
184            self.ensureDecompiled()
185
186    def ensureDecompiled(self, recurse=False):
187        # The recurse argument is unused, but part of the signature of
188        # ensureDecompiled across the library.
189        for st in self.tables:
190            st.ensureDecompiled()
191
192    def compile(self, ttFont):
193        self.tables.sort()  # sort according to the spec; see CmapSubtable.__lt__()
194        numSubTables = len(self.tables)
195        totalOffset = 4 + 8 * numSubTables
196        data = struct.pack(">HH", self.tableVersion, numSubTables)
197        tableData = b""
198        seen = (
199            {}
200        )  # Some tables are the same object reference. Don't compile them twice.
201        done = (
202            {}
203        )  # Some tables are different objects, but compile to the same data chunk
204        for table in self.tables:
205            offset = seen.get(id(table.cmap))
206            if offset is None:
207                chunk = table.compile(ttFont)
208                offset = done.get(chunk)
209                if offset is None:
210                    offset = seen[id(table.cmap)] = done[chunk] = totalOffset + len(
211                        tableData
212                    )
213                    tableData = tableData + chunk
214            data = data + struct.pack(">HHl", table.platformID, table.platEncID, offset)
215        return data + tableData
216
217    def toXML(self, writer, ttFont):
218        writer.simpletag("tableVersion", version=self.tableVersion)
219        writer.newline()
220        for table in self.tables:
221            table.toXML(writer, ttFont)
222
223    def fromXML(self, name, attrs, content, ttFont):
224        if name == "tableVersion":
225            self.tableVersion = safeEval(attrs["version"])
226            return
227        if name[:12] != "cmap_format_":
228            return
229        if not hasattr(self, "tables"):
230            self.tables = []
231        format = safeEval(name[12:])
232        table = CmapSubtable.newSubtable(format)
233        table.platformID = safeEval(attrs["platformID"])
234        table.platEncID = safeEval(attrs["platEncID"])
235        table.fromXML(name, attrs, content, ttFont)
236        self.tables.append(table)
237
238
239class CmapSubtable(object):
240    """Base class for all cmap subtable formats.
241
242    Subclasses which handle the individual subtable formats are named
243    ``cmap_format_0``, ``cmap_format_2`` etc. Use :py:meth:`getSubtableClass`
244    to retrieve the concrete subclass, or :py:meth:`newSubtable` to get a
245    new subtable object for a given format.
246
247    The object exposes a ``.cmap`` attribute, which contains a dictionary mapping
248    character codepoints to glyph names.
249    """
250
251    @staticmethod
252    def getSubtableClass(format):
253        """Return the subtable class for a format."""
254        return cmap_classes.get(format, cmap_format_unknown)
255
256    @staticmethod
257    def newSubtable(format):
258        """Return a new instance of a subtable for the given format
259        ."""
260        subtableClass = CmapSubtable.getSubtableClass(format)
261        return subtableClass(format)
262
263    def __init__(self, format):
264        self.format = format
265        self.data = None
266        self.ttFont = None
267        self.platformID = None  #: The platform ID of this subtable
268        self.platEncID = None  #: The encoding ID of this subtable (interpretation depends on ``platformID``)
269        self.language = (
270            None  #: The language ID of this subtable (Macintosh platform only)
271        )
272
273    def ensureDecompiled(self, recurse=False):
274        # The recurse argument is unused, but part of the signature of
275        # ensureDecompiled across the library.
276        if self.data is None:
277            return
278        self.decompile(None, None)  # use saved data.
279        self.data = None  # Once this table has been decompiled, make sure we don't
280        # just return the original data. Also avoids recursion when
281        # called with an attribute that the cmap subtable doesn't have.
282
283    def __getattr__(self, attr):
284        # allow lazy decompilation of subtables.
285        if attr[:2] == "__":  # don't handle requests for member functions like '__lt__'
286            raise AttributeError(attr)
287        if self.data is None:
288            raise AttributeError(attr)
289        self.ensureDecompiled()
290        return getattr(self, attr)
291
292    def decompileHeader(self, data, ttFont):
293        format, length, language = struct.unpack(">HHH", data[:6])
294        assert (
295            len(data) == length
296        ), "corrupt cmap table format %d (data length: %d, header length: %d)" % (
297            format,
298            len(data),
299            length,
300        )
301        self.format = int(format)
302        self.length = int(length)
303        self.language = int(language)
304        self.data = data[6:]
305        self.ttFont = ttFont
306
307    def toXML(self, writer, ttFont):
308        writer.begintag(
309            self.__class__.__name__,
310            [
311                ("platformID", self.platformID),
312                ("platEncID", self.platEncID),
313                ("language", self.language),
314            ],
315        )
316        writer.newline()
317        codes = sorted(self.cmap.items())
318        self._writeCodes(codes, writer)
319        writer.endtag(self.__class__.__name__)
320        writer.newline()
321
322    def getEncoding(self, default=None):
323        """Returns the Python encoding name for this cmap subtable based on its platformID,
324        platEncID, and language.  If encoding for these values is not known, by default
325        ``None`` is returned.  That can be overridden by passing a value to the ``default``
326        argument.
327
328        Note that if you want to choose a "preferred" cmap subtable, most of the time
329        ``self.isUnicode()`` is what you want as that one only returns true for the modern,
330        commonly used, Unicode-compatible triplets, not the legacy ones.
331        """
332        return getEncoding(self.platformID, self.platEncID, self.language, default)
333
334    def isUnicode(self):
335        """Returns true if the characters are interpreted as Unicode codepoints."""
336        return self.platformID == 0 or (
337            self.platformID == 3 and self.platEncID in [0, 1, 10]
338        )
339
340    def isSymbol(self):
341        """Returns true if the subtable is for the Symbol encoding (3,0)"""
342        return self.platformID == 3 and self.platEncID == 0
343
344    def _writeCodes(self, codes, writer):
345        isUnicode = self.isUnicode()
346        for code, name in codes:
347            writer.simpletag("map", code=hex(code), name=name)
348            if isUnicode:
349                writer.comment(Unicode[code])
350            writer.newline()
351
352    def __lt__(self, other):
353        if not isinstance(other, CmapSubtable):
354            return NotImplemented
355
356        # implemented so that list.sort() sorts according to the spec.
357        selfTuple = (
358            getattr(self, "platformID", None),
359            getattr(self, "platEncID", None),
360            getattr(self, "language", None),
361            self.__dict__,
362        )
363        otherTuple = (
364            getattr(other, "platformID", None),
365            getattr(other, "platEncID", None),
366            getattr(other, "language", None),
367            other.__dict__,
368        )
369        return selfTuple < otherTuple
370
371
372class cmap_format_0(CmapSubtable):
373    def decompile(self, data, ttFont):
374        # we usually get here indirectly from the subtable __getattr__ function, in which case both args must be None.
375        # If not, someone is calling the subtable decompile() directly, and must provide both args.
376        if data is not None and ttFont is not None:
377            self.decompileHeader(data, ttFont)
378        else:
379            assert (
380                data is None and ttFont is None
381            ), "Need both data and ttFont arguments"
382        data = (
383            self.data
384        )  # decompileHeader assigns the data after the header to self.data
385        assert 262 == self.length, "Format 0 cmap subtable not 262 bytes"
386        gids = array.array("B")
387        gids.frombytes(self.data)
388        charCodes = list(range(len(gids)))
389        self.cmap = _make_map(self.ttFont, charCodes, gids)
390
391    def compile(self, ttFont):
392        if self.data:
393            return struct.pack(">HHH", 0, 262, self.language) + self.data
394
395        cmap = self.cmap
396        assert set(cmap.keys()).issubset(range(256))
397        getGlyphID = ttFont.getGlyphID
398        valueList = [getGlyphID(cmap[i]) if i in cmap else 0 for i in range(256)]
399
400        gids = array.array("B", valueList)
401        data = struct.pack(">HHH", 0, 262, self.language) + gids.tobytes()
402        assert len(data) == 262
403        return data
404
405    def fromXML(self, name, attrs, content, ttFont):
406        self.language = safeEval(attrs["language"])
407        if not hasattr(self, "cmap"):
408            self.cmap = {}
409        cmap = self.cmap
410        for element in content:
411            if not isinstance(element, tuple):
412                continue
413            name, attrs, content = element
414            if name != "map":
415                continue
416            cmap[safeEval(attrs["code"])] = attrs["name"]
417
418
# struct format of one format 2 subheader, big-endian: firstCode (uint16),
# entryCount (uint16), idDelta (int16), idRangeOffset (uint16) -- 8 bytes.
subHeaderFormat = ">HHhH"
420
421
class SubHeader:
    """Plain record for one subheader of a format 2 cmap subtable."""

    def __init__(self):
        # firstCode / entryCount: start and size of the covered code subrange.
        # idDelta: value added to non-zero glyph indices.
        # idRangeOffset: byte offset to this subheader's glyph index subrange.
        self.firstCode = self.entryCount = self.idDelta = self.idRangeOffset = None
        # Glyph indices for the codes firstCode .. firstCode + entryCount - 1.
        self.glyphIndexArray = []
429
430
431class cmap_format_2(CmapSubtable):
432    def setIDDelta(self, subHeader):
433        subHeader.idDelta = 0
434        # find the minGI which is not zero.
435        minGI = subHeader.glyphIndexArray[0]
436        for gid in subHeader.glyphIndexArray:
437            if (gid != 0) and (gid < minGI):
438                minGI = gid
439        # The lowest gid in glyphIndexArray, after subtracting idDelta, must be 1.
440        # idDelta is a short, and must be between -32K and 32K. minGI can be between 1 and 64K.
441        # We would like to pick an idDelta such that the first glyphArray GID is 1,
442        # so that we are more likely to be able to combine glypharray GID subranges.
443        # This means that we have a problem when minGI is > 32K
444        # Since the final gi is reconstructed from the glyphArray GID by:
445        #    (short)finalGID = (gid + idDelta) % 0x10000),
446        # we can get from a glypharray GID of 1 to a final GID of 65K by subtracting 2, and casting the
447        # negative number to an unsigned short.
448
449        if minGI > 1:
450            if minGI > 0x7FFF:
451                subHeader.idDelta = -(0x10000 - minGI) - 1
452            else:
453                subHeader.idDelta = minGI - 1
454            idDelta = subHeader.idDelta
455            for i in range(subHeader.entryCount):
456                gid = subHeader.glyphIndexArray[i]
457                if gid > 0:
458                    subHeader.glyphIndexArray[i] = gid - idDelta
459
    def decompile(self, data, ttFont):
        """Decompile a format 2 subtable into ``self.cmap``.

        Format 2 handles character codes that are one or two bytes long. The
        data after the subtable header consists of 256 subheader keys, an
        array of subheaders and the glyph index arrays they refer to.

        Args:
                data: The complete subtable data including its header, or None
                        to use the data saved earlier by ``decompileHeader``.
                ttFont: The font the subtable belongs to, or None. ``data`` and
                        ``ttFont`` must either both be given or both be None.
        """
        # We usually get here indirectly from the subtable __getattr__ function, in which case both args must be None.
        # If not, someone is calling the subtable decompile() directly, and must provide both args.
        if data is not None and ttFont is not None:
            self.decompileHeader(data, ttFont)
        else:
            assert (
                data is None and ttFont is None
            ), "Need both data and ttFont arguments"

        data = (
            self.data
        )  # decompileHeader assigns the data after the header to self.data
        subHeaderKeys = []
        maxSubHeaderindex = 0
        # Get the key array, and determine the number of subHeaders.
        # The first 512 bytes are 256 big-endian uint16 keys, one per possible
        # first byte; each key is the byte offset of a subheader, so dividing
        # by the 8-byte subheader size gives its index.
        allKeys = array.array("H")
        allKeys.frombytes(data[:512])
        data = data[512:]
        if sys.byteorder != "big":
            allKeys.byteswap()
        subHeaderKeys = [key // 8 for key in allKeys]
        maxSubHeaderindex = max(subHeaderKeys)

        # Load subHeaders. ``pos`` is relative to the start of the subheader array.
        subHeaderList = []
        pos = 0
        for i in range(maxSubHeaderindex + 1):
            subHeader = SubHeader()
            (
                subHeader.firstCode,
                subHeader.entryCount,
                subHeader.idDelta,
                subHeader.idRangeOffset,
            ) = struct.unpack(subHeaderFormat, data[pos : pos + 8])
            pos += 8
            # idRangeOffset is counted from the idRangeOffset word itself, which
            # is the last 2 bytes of the subheader just read, hence the "- 2".
            giDataPos = pos + subHeader.idRangeOffset - 2
            giList = array.array("H")
            giList.frombytes(data[giDataPos : giDataPos + subHeader.entryCount * 2])
            if sys.byteorder != "big":
                giList.byteswap()
            subHeader.glyphIndexArray = giList
            subHeaderList.append(subHeader)
        # How this gets processed.
        # Charcodes may be one or two bytes.
        # The first byte of a charcode is mapped through the subHeaderKeys, to select
        # a subHeader. For any subheader but 0, the next byte is then mapped through the
        # selected subheader. If subheader Index 0 is selected, then the byte itself is
        # mapped through the subheader, and there is no second byte.
        # Then assume that the subsequent byte is the first byte of the next charcode, and repeat.
        #
        # Each subheader references a range in the glyphIndexArray whose length is entryCount.
        # The range in glyphIndexArray referenced by a subheader may overlap with the range in glyphIndexArray
        # referenced by another subheader.
        # The only subheader that will be referenced by more than one first-byte value is the subheader
        # that maps the entire range of glyphID values to glyphIndex 0, e.g. notdef:
        #    {firstChar 0, EntryCount 0, idDelta 0, idRangeOffset xx}
        # A byte being mapped through a subheader is treated as an index into a mapping of array index to font glyphIndex.
        # A subheader specifies a subrange within (0...256) by the
        # firstChar and EntryCount values. If the byte value is outside the subrange, then the glyphIndex is zero
        # (e.g. glyph not in font).
        # If the byte index is in the subrange, then an offset index is calculated as (byteIndex - firstChar).
        # The index to glyphIndex mapping is a subrange of the glyphIndexArray. You find the start of the subrange by
        # counting idRangeOffset bytes from the idRangeOffset word. The first value in this subrange is the
        # glyphIndex for the index firstChar. The offset index should then be used in this array to get the glyphIndex.
        # Example for Logocut-Medium
        # first byte of charcode = 129; selects subheader 1.
        # subheader 1 = {firstChar 64, EntryCount 108, idDelta 42, idRangeOffset 0252}
        # second byte of charCode = 66
        # the index offset = 66-64 = 2.
        # The subrange of the glyphIndexArray starting at 0x0252 bytes from the idRangeOffset word is:
        # [glyphIndexArray index], [subrange array index] = glyphIndex
        # [256], [0]=1  from charcode [129, 64]
        # [257], [1]=2  from charcode [129, 65]
        # [258], [2]=3  from charcode [129, 66]
        # [259], [3]=4  from charcode [129, 67]
        # So, the glyphIndex = 3 from the array. Then if idDelta is not zero and the glyph ID is not zero,
        # add it to the glyphID to get the final glyphIndex
        # value. In this case the final glyph index = 3 + 42 -> 45 for the final glyphIndex. Whew!

        # The raw data is consumed; the mapping is built in ``cmap`` as
        # {charCode: glyph index} and converted to glyph names at the end.
        self.data = b""
        cmap = {}
        notdefGI = 0
        for firstByte in range(256):
            subHeadindex = subHeaderKeys[firstByte]
            subHeader = subHeaderList[subHeadindex]
            if subHeadindex == 0:
                # Subheader 0: firstByte is a complete one-byte character code.
                if (firstByte < subHeader.firstCode) or (
                    firstByte >= subHeader.firstCode + subHeader.entryCount
                ):
                    continue  # gi is notdef.
                else:
                    charCode = firstByte
                    offsetIndex = firstByte - subHeader.firstCode
                    gi = subHeader.glyphIndexArray[offsetIndex]
                    if gi != 0:
                        gi = (gi + subHeader.idDelta) % 0x10000
                    else:
                        continue  # gi is notdef.
                cmap[charCode] = gi
            else:
                # Any other subheader: firstByte is the high byte of two-byte
                # character codes; the subheader covers entryCount consecutive
                # low bytes starting at firstCode.
                if subHeader.entryCount:
                    charCodeOffset = firstByte * 256 + subHeader.firstCode
                    for offsetIndex in range(subHeader.entryCount):
                        charCode = charCodeOffset + offsetIndex
                        gi = subHeader.glyphIndexArray[offsetIndex]
                        if gi != 0:
                            gi = (gi + subHeader.idDelta) % 0x10000
                        else:
                            continue
                        cmap[charCode] = gi
                # If not subHeader.entryCount, then all char codes with this first byte are
                # mapped to .notdef. We can skip this subtable, and leave the glyphs un-encoded, which is the
                # same as mapping it to .notdef.

        # Convert the glyph indices collected above into glyph names.
        gids = list(cmap.values())
        charCodes = list(cmap.keys())
        self.cmap = _make_map(self.ttFont, charCodes, gids)
578
579    def compile(self, ttFont):
580        if self.data:
581            return (
582                struct.pack(">HHH", self.format, self.length, self.language) + self.data
583            )
584        kEmptyTwoCharCodeRange = -1
585        notdefGI = 0
586
587        items = sorted(self.cmap.items())
588        charCodes = [item[0] for item in items]
589        names = [item[1] for item in items]
590        nameMap = ttFont.getReverseGlyphMap()
591        try:
592            gids = [nameMap[name] for name in names]
593        except KeyError:
594            nameMap = ttFont.getReverseGlyphMap(rebuild=True)
595            try:
596                gids = [nameMap[name] for name in names]
597            except KeyError:
598                # allow virtual GIDs in format 2 tables
599                gids = []
600                for name in names:
601                    try:
602                        gid = nameMap[name]
603                    except KeyError:
604                        try:
605                            if name[:3] == "gid":
606                                gid = int(name[3:])
607                            else:
608                                gid = ttFont.getGlyphID(name)
609                        except:
610                            raise KeyError(name)
611
612                    gids.append(gid)
613
614        # Process the (char code to gid) item list in char code order.
615        # By definition, all one byte char codes map to subheader 0.
616        # For all the two byte char codes, we assume that the first byte maps maps to the empty subhead (with an entry count of 0,
617        # which defines all char codes in its range to map to notdef) unless proven otherwise.
618        # Note that since the char code items are processed in char code order, all the char codes with the
619        # same first byte are in sequential order.
620
621        subHeaderKeys = [
622            kEmptyTwoCharCodeRange for x in range(256)
623        ]  # list of indices into subHeaderList.
624        subHeaderList = []
625
626        # We force this subheader entry 0 to exist in the subHeaderList in the case where some one comes up
627        # with a cmap where all the one byte char codes map to notdef,
628        # with the result that the subhead 0 would not get created just by processing the item list.
629        charCode = charCodes[0]
630        if charCode > 255:
631            subHeader = SubHeader()
632            subHeader.firstCode = 0
633            subHeader.entryCount = 0
634            subHeader.idDelta = 0
635            subHeader.idRangeOffset = 0
636            subHeaderList.append(subHeader)
637
638        lastFirstByte = -1
639        items = zip(charCodes, gids)
640        for charCode, gid in items:
641            if gid == 0:
642                continue
643            firstbyte = charCode >> 8
644            secondByte = charCode & 0x00FF
645
646            if (
647                firstbyte != lastFirstByte
648            ):  # Need to update the current subhead, and start a new one.
649                if lastFirstByte > -1:
650                    # fix GI's and iDelta of current subheader.
651                    self.setIDDelta(subHeader)
652
653                    # If it was sunheader 0 for one-byte charCodes, then we need to set the subHeaderKeys value to zero
654                    # for the indices matching the char codes.
655                    if lastFirstByte == 0:
656                        for index in range(subHeader.entryCount):
657                            charCode = subHeader.firstCode + index
658                            subHeaderKeys[charCode] = 0
659
660                    assert subHeader.entryCount == len(
661                        subHeader.glyphIndexArray
662                    ), "Error - subhead entry count does not match len of glyphID subrange."
663                # init new subheader
664                subHeader = SubHeader()
665                subHeader.firstCode = secondByte
666                subHeader.entryCount = 1
667                subHeader.glyphIndexArray.append(gid)
668                subHeaderList.append(subHeader)
669                subHeaderKeys[firstbyte] = len(subHeaderList) - 1
670                lastFirstByte = firstbyte
671            else:
672                # need to fill in with notdefs all the code points between the last charCode and the current charCode.
673                codeDiff = secondByte - (subHeader.firstCode + subHeader.entryCount)
674                for i in range(codeDiff):
675                    subHeader.glyphIndexArray.append(notdefGI)
676                subHeader.glyphIndexArray.append(gid)
677                subHeader.entryCount = subHeader.entryCount + codeDiff + 1
678
679        # fix GI's and iDelta of last subheader that we we added to the subheader array.
680        self.setIDDelta(subHeader)
681
682        # Now we add a final subheader for the subHeaderKeys which maps to empty two byte charcode ranges.
683        subHeader = SubHeader()
684        subHeader.firstCode = 0
685        subHeader.entryCount = 0
686        subHeader.idDelta = 0
687        subHeader.idRangeOffset = 2
688        subHeaderList.append(subHeader)
689        emptySubheadIndex = len(subHeaderList) - 1
690        for index in range(256):
691            if subHeaderKeys[index] == kEmptyTwoCharCodeRange:
692                subHeaderKeys[index] = emptySubheadIndex
693        # Since this is the last subheader, the GlyphIndex Array starts two bytes after the start of the
694        # idRangeOffset word of this subHeader. We can safely point to the first entry in the GlyphIndexArray,
695        # since the first subrange of the GlyphIndexArray is for subHeader 0, which always starts with
696        # charcode 0 and GID 0.
697
698        idRangeOffset = (
699            len(subHeaderList) - 1
700        ) * 8 + 2  # offset to beginning of glyphIDArray from first subheader idRangeOffset.
701        subheadRangeLen = (
702            len(subHeaderList) - 1
703        )  # skip last special empty-set subheader; we've already hardocodes its idRangeOffset to 2.
704        for index in range(subheadRangeLen):
705            subHeader = subHeaderList[index]
706            subHeader.idRangeOffset = 0
707            for j in range(index):
708                prevSubhead = subHeaderList[j]
709                if (
710                    prevSubhead.glyphIndexArray == subHeader.glyphIndexArray
711                ):  # use the glyphIndexArray subarray
712                    subHeader.idRangeOffset = (
713                        prevSubhead.idRangeOffset - (index - j) * 8
714                    )
715                    subHeader.glyphIndexArray = []
716                    break
717            if subHeader.idRangeOffset == 0:  # didn't find one.
718                subHeader.idRangeOffset = idRangeOffset
719                idRangeOffset = (
720                    idRangeOffset - 8
721                ) + subHeader.entryCount * 2  # one less subheader, one more subArray.
722            else:
723                idRangeOffset = idRangeOffset - 8  # one less subheader
724
725        # Now we can write out the data!
726        length = (
727            6 + 512 + 8 * len(subHeaderList)
728        )  # header, 256 subHeaderKeys, and subheader array.
729        for subhead in subHeaderList[:-1]:
730            length = (
731                length + len(subhead.glyphIndexArray) * 2
732            )  # We can't use subhead.entryCount, as some of the subhead may share subArrays.
733        dataList = [struct.pack(">HHH", 2, length, self.language)]
734        for index in subHeaderKeys:
735            dataList.append(struct.pack(">H", index * 8))
736        for subhead in subHeaderList:
737            dataList.append(
738                struct.pack(
739                    subHeaderFormat,
740                    subhead.firstCode,
741                    subhead.entryCount,
742                    subhead.idDelta,
743                    subhead.idRangeOffset,
744                )
745            )
746        for subhead in subHeaderList[:-1]:
747            for gi in subhead.glyphIndexArray:
748                dataList.append(struct.pack(">H", gi))
749        data = bytesjoin(dataList)
750        assert len(data) == length, (
751            "Error: cmap format 2 is not same length as calculated! actual: "
752            + str(len(data))
753            + " calc : "
754            + str(length)
755        )
756        return data
757
758    def fromXML(self, name, attrs, content, ttFont):
759        self.language = safeEval(attrs["language"])
760        if not hasattr(self, "cmap"):
761            self.cmap = {}
762        cmap = self.cmap
763
764        for element in content:
765            if not isinstance(element, tuple):
766                continue
767            name, attrs, content = element
768            if name != "map":
769                continue
770            cmap[safeEval(attrs["code"])] = attrs["name"]
771
772
# struct format of the fixed-size leading part of a format 4 subtable: seven
# big-endian uint16 values. cmap_format_4.compile() packs them in this order:
# format, length, language, segCountX2, searchRange, entrySelector, rangeShift.
cmap_format_4_format = ">7H"

# Those seven fields are followed by these variable-length arrays:
# uint16  endCode[segCount]          # Ending character code for each segment, last = 0xFFFF.
# uint16  reservedPad                # This value should be zero
# uint16  startCode[segCount]        # Starting character code for each segment
# uint16  idDelta[segCount]          # Delta for all character codes in segment
# uint16  idRangeOffset[segCount]    # Offset in bytes to glyph indexArray, or 0
# uint16  glyphIndexArray[variable]  # Glyph index array
781
782
def splitRange(startCode, endCode, cmap):
    """Split one run of consecutive character codes into cmap4 segments.

    *cmap* maps every code in ``startCode..endCode`` (inclusive) to a glyph ID.
    The range is cut into sub-ranges whose glyph IDs are consecutive (these can
    be stored with an idDelta only) and the stretches in between. A cut is only
    made when it is expected to shrink the table: a new segment costs 8 bytes,
    while leaving a code in a non-consecutive segment costs 2 bytes per code.
    This is a heuristic, not a proven optimum.

    Returns ``(start, end)``: ``end`` holds the last code of every resulting
    segment, ``start`` the first code of every segment except the first one
    (the caller already knows it), so ``len(start) + 1 == len(end)``.
    """
    if startCode == endCode:
        return [], [endCode]

    # Pass 1: collect maximal runs of codes whose glyph IDs increase by one.
    runs = []
    runStart = None  # first code of the run in progress, None when outside a run
    prevCode = startCode
    prevGID = cmap[startCode]
    for code in range(startCode + 1, endCode + 1):
        gid = cmap[code]
        if gid == prevGID + 1:
            if runStart is None:
                runStart = prevCode
        elif runStart is not None:
            runs.append((runStart, prevCode))
            runStart = None
        prevCode, prevGID = code, gid

    if runStart is not None:
        runs.append((runStart, prevCode))
    assert prevCode == endCode

    # Pass 2: drop the runs that would only make the data bigger.
    kept = []
    for first, last in runs:
        if first == startCode and last == endCode:
            break  # the whole range is one run; nothing to split
        # A run touching either edge adds one segment, an inner run adds two.
        touchesEdge = first == startCode or last == endCode
        threshold = 4 if touchesEdge else 8
        if last - first + 1 > threshold:
            kept.append((first, last))

    if not kept:
        return [], [endCode]

    # Pass 3: interleave the kept runs with the non-consecutive stretches
    # before, between and after them so the whole range is covered.
    segments = []
    nextCode = startCode
    for first, last in kept:
        if first != nextCode:
            segments.append((nextCode, first - 1))
        segments.append((first, last))
        nextCode = last + 1
    if nextCode != endCode + 1:
        segments.append((nextCode, endCode))

    # The first start code is implied by the caller, so it is left out.
    start = [first for first, _ in segments][1:]
    end = [last for _, last in segments]

    assert len(start) + 1 == len(end)
    return start, end
860
861
class cmap_format_4(CmapSubtable):
    """Format 4 ``cmap`` subtable: 16-bit character codes stored as segments.

    After the header the binary data holds ``endCode[segCount]``, a reserved
    pad word, ``startCode[segCount]``, ``idDelta[segCount]``,
    ``idRangeOffset[segCount]`` and a trailing ``glyphIndexArray``. Once
    decompiled, the mapping is available as ``self.cmap`` (character code ->
    glyph name).
    """

    def decompile(self, data, ttFont):
        """Decode the segment arrays into ``self.cmap``.

        Either both *data* and *ttFont* are given (the header is decompiled
        first), or both are ``None`` and the bytes previously stored in
        ``self.data`` are used. Character codes that resolve to glyph ID 0 are
        left out of the mapping by ``_make_map``.
        """
        # we usually get here indirectly from the subtable __getattr__ function, in which case both args must be None.
        # If not, someone is calling the subtable decompile() directly, and must provide both args.
        if data is not None and ttFont is not None:
            self.decompileHeader(data, ttFont)
        else:
            assert (
                data is None and ttFont is None
            ), "Need both data and ttFont arguments"

        data = (
            self.data
        )  # decompileHeader assigns the data after the header to self.data
        (segCountX2, searchRange, entrySelector, rangeShift) = struct.unpack(
            ">4H", data[:8]
        )
        data = data[8:]
        segCount = segCountX2 // 2

        # Everything after the four search fields is a flat run of uint16 values.
        allCodes = array.array("H")
        allCodes.frombytes(data)
        self.data = data = None

        if sys.byteorder != "big":
            allCodes.byteswap()

        # divide the data
        endCode = allCodes[:segCount]
        allCodes = allCodes[segCount + 1 :]  # the +1 is skipping the reservedPad field
        startCode = allCodes[:segCount]
        allCodes = allCodes[segCount:]
        idDelta = allCodes[:segCount]
        allCodes = allCodes[segCount:]
        idRangeOffset = allCodes[:segCount]
        glyphIndexArray = allCodes[segCount:]
        lenGIArray = len(glyphIndexArray)

        # build 2-byte character mapping
        charCodes = []
        gids = []
        for i in range(len(startCode) - 1):  # don't do 0xffff!
            start = startCode[i]
            delta = idDelta[i]
            rangeOffset = idRangeOffset[i]
            # idRangeOffset is a byte offset counted from its own slot i.
            # Halving it gives uint16 units, "i - len(idRangeOffset)" rebases it
            # to the start of glyphIndexArray, and subtracting the segment start
            # lets us simply add the character code below.
            partial = rangeOffset // 2 - start + i - len(idRangeOffset)

            rangeCharCodes = list(range(startCode[i], endCode[i] + 1))
            charCodes.extend(rangeCharCodes)
            if rangeOffset == 0:
                # Glyph IDs follow directly from the code, modulo 65536.
                gids.extend(
                    [(charCode + delta) & 0xFFFF for charCode in rangeCharCodes]
                )
            else:
                for charCode in rangeCharCodes:
                    index = charCode + partial
                    assert index < lenGIArray, (
                        "In format 4 cmap, range (%d), the calculated index (%d) into the glyph index array is not less than the length of the array (%d) !"
                        % (i, index, lenGIArray)
                    )
                    if glyphIndexArray[index] != 0:  # if not missing glyph
                        glyphID = glyphIndexArray[index] + delta
                    else:
                        glyphID = 0  # missing glyph
                    gids.append(glyphID & 0xFFFF)

        self.cmap = _make_map(self.ttFont, charCodes, gids)

    def compile(self, ttFont):
        """Return the binary form of the subtable, header included.

        While ``self.data`` still holds raw bytes (``decompile()`` clears it),
        those bytes are re-emitted behind a freshly packed format/length/
        language header. Otherwise the segments are rebuilt from ``self.cmap``.

        Glyph names are resolved through the font's reverse glyph map. Names
        that are still unknown after a rebuild of that map are treated as
        virtual glyphs: ``gidNNN`` yields ``NNN``, anything else is passed to
        ``ttFont.getGlyphID``.

        Raises:
            KeyError: a glyph name cannot be resolved to a glyph ID.
        """
        if self.data:
            return (
                struct.pack(">HHH", self.format, self.length, self.language) + self.data
            )

        charCodes = list(self.cmap.keys())
        cmap = {}  # code:glyphID mapping; stays empty for an empty subtable
        if not charCodes:
            startCode = [0xFFFF]
            endCode = [0xFFFF]
        else:
            charCodes.sort()
            names = [self.cmap[code] for code in charCodes]
            nameMap = ttFont.getReverseGlyphMap()
            try:
                gids = [nameMap[name] for name in names]
            except KeyError:
                nameMap = ttFont.getReverseGlyphMap(rebuild=True)
                try:
                    gids = [nameMap[name] for name in names]
                except KeyError:
                    # allow virtual GIDs in format 4 tables
                    gids = []
                    for name in names:
                        try:
                            gid = nameMap[name]
                        except KeyError:
                            try:
                                if name[:3] == "gid":
                                    gid = int(name[3:])
                                else:
                                    gid = ttFont.getGlyphID(name)
                            except Exception as err:
                                # Report every lookup failure as KeyError, but
                                # let KeyboardInterrupt/SystemExit propagate.
                                raise KeyError(name) from err

                        gids.append(gid)
            cmap = dict(zip(charCodes, gids))

            # Build startCode and endCode lists.
            # Split the char codes in ranges of consecutive char codes, then split
            # each range in more ranges of consecutive/not consecutive glyph IDs.
            # See splitRange().
            lastCode = charCodes[0]
            endCode = []
            startCode = [lastCode]
            for charCode in charCodes[
                1:
            ]:  # skip the first code, it's the first start code
                if charCode == lastCode + 1:
                    lastCode = charCode
                    continue
                start, end = splitRange(startCode[-1], lastCode, cmap)
                startCode.extend(start)
                endCode.extend(end)
                startCode.append(charCode)
                lastCode = charCode
            start, end = splitRange(startCode[-1], lastCode, cmap)
            startCode.extend(start)
            endCode.extend(end)
            startCode.append(0xFFFF)
            endCode.append(0xFFFF)

        # Build idDelta, idRangeOffset and glyphIndexArray for every segment.
        idDelta = []
        idRangeOffset = []
        glyphIndexArray = []
        for i in range(len(endCode) - 1):  # skip the closing codes (0xffff)
            indices = [cmap[charCode] for charCode in range(startCode[i], endCode[i] + 1)]
            if indices == list(range(indices[0], indices[0] + len(indices))):
                # Consecutive glyph IDs: a delta alone describes the segment.
                idDelta.append((indices[0] - startCode[i]) % 0x10000)
                idRangeOffset.append(0)
            else:
                # Byte offset from this segment's idRangeOffset slot to the
                # place where its glyph IDs are about to be appended.
                idDelta.append(0)
                idRangeOffset.append(2 * (len(endCode) + len(glyphIndexArray) - i))
                glyphIndexArray.extend(indices)
        idDelta.append(1)  # 0xffff + 1 == (tadaa!) 0. So this end code maps to .notdef
        idRangeOffset.append(0)

        segCount = len(endCode)
        segCountX2 = segCount * 2
        searchRange, entrySelector, rangeShift = getSearchRange(segCount, 2)

        # The single 0 between endCode and startCode is the reservedPad word.
        charCodeArray = array.array("H", endCode + [0] + startCode)
        idDeltaArray = array.array("H", idDelta)
        restArray = array.array("H", idRangeOffset + glyphIndexArray)
        if sys.byteorder != "big":
            charCodeArray.byteswap()
            idDeltaArray.byteswap()
            restArray.byteswap()
        data = charCodeArray.tobytes() + idDeltaArray.tobytes() + restArray.tobytes()

        length = struct.calcsize(cmap_format_4_format) + len(data)
        header = struct.pack(
            cmap_format_4_format,
            self.format,
            length,
            self.language,
            segCountX2,
            searchRange,
            entrySelector,
            rangeShift,
        )
        return header + data

    def fromXML(self, name, attrs, content, ttFont):
        """Populate the subtable from a parsed XML element.

        ``self.language`` is read from the ``language`` attribute and each
        ``map`` child adds one ``code -> name`` entry to ``self.cmap``.
        Children that are not tuples are skipped.

        Raises:
            AssertionError: a child element has a tag other than ``map``.
        """
        self.language = safeEval(attrs["language"])
        if not hasattr(self, "cmap"):
            self.cmap = {}
        cmap = self.cmap

        for element in content:
            if not isinstance(element, tuple):
                continue
            nameMap, attrsMap, dummyContent = element
            if nameMap != "map":
                # Raised explicitly (same type and message as the former
                # "assert 0") so the check is not stripped under "python -O".
                raise AssertionError("Unrecognized keyword in cmap subtable")
            cmap[safeEval(attrsMap["code"])] = attrsMap["name"]
1054
1055
class cmap_format_6(CmapSubtable):
    """Format 6 ``cmap`` subtable: one dense run of 16-bit glyph IDs.

    After the header the data holds ``firstCode``, ``entryCount`` and then one
    glyph ID for each consecutive character code starting at ``firstCode``.
    """

    def decompile(self, data, ttFont):
        """Decode the glyph ID run into ``self.cmap``.

        Either both *data* and *ttFont* are given (the header is decompiled
        first), or both are ``None`` and the bytes stored in ``self.data`` are
        used.
        """
        # Normally reached through the subtable's __getattr__ with both
        # arguments None; a direct caller has to supply both.
        if data is not None and ttFont is not None:
            self.decompileHeader(data, ttFont)
        else:
            assert (
                data is None and ttFont is None
            ), "Need both data and ttFont arguments"

        # decompileHeader left everything after the header in self.data
        raw = self.data
        firstCode, entryCount = struct.unpack(">HH", raw[:4])
        # Only entryCount entries are read; the data may be longer than that
        # (seen in Apple's Helvetica), so the length is deliberately not checked.
        glyphIDs = array.array("H")
        glyphIDs.frombytes(raw[4 : 4 + 2 * entryCount])
        if sys.byteorder != "big":
            glyphIDs.byteswap()
        self.data = raw = None

        codes = list(range(firstCode, firstCode + len(glyphIDs)))
        self.cmap = _make_map(self.ttFont, codes, glyphIDs)

    def compile(self, ttFont):
        """Return the binary form of the subtable, header included.

        While ``self.data`` still holds raw bytes they are re-emitted behind a
        freshly packed header. Otherwise one glyph ID is written for every code
        between the smallest and largest key of ``self.cmap``; codes without an
        entry get glyph ID 0.
        """
        if self.data:
            prefix = struct.pack(">HHH", self.format, self.length, self.language)
            return prefix + self.data

        mapping = self.cmap
        sortedCodes = sorted(mapping)
        if not sortedCodes:
            # yes, there are empty cmap tables.
            firstCode = 0
            entryCount = 0
            payload = b""
        else:
            firstCode = sortedCodes[0]
            lastCode = sortedCodes[-1]
            glyphIDs = array.array(
                "H",
                [
                    ttFont.getGlyphID(mapping[code]) if code in mapping else 0
                    for code in range(firstCode, lastCode + 1)
                ],
            )
            entryCount = len(glyphIDs)
            if sys.byteorder != "big":
                glyphIDs.byteswap()
            payload = glyphIDs.tobytes()

        # 10 is the size of the five uint16 header fields packed here.
        header = struct.pack(
            ">HHHHH", 6, len(payload) + 10, self.language, firstCode, entryCount
        )
        return header + payload

    def fromXML(self, name, attrs, content, ttFont):
        """Populate the subtable from a parsed XML element.

        ``self.language`` is read from the ``language`` attribute. Each tuple
        child tagged ``map`` adds one ``code -> name`` entry to ``self.cmap``;
        every other child is ignored.
        """
        self.language = safeEval(attrs["language"])
        if not hasattr(self, "cmap"):
            self.cmap = {}
        mapping = self.cmap

        for child in content:
            if isinstance(child, tuple):
                tag, childAttrs, _ = child
                if tag == "map":
                    mapping[safeEval(childAttrs["code"])] = childAttrs["name"]
1121
1122
class cmap_format_12_or_13(CmapSubtable):
    """Shared implementation of the format 12 and format 13 ``cmap`` subtables.

    Both formats store groups of ``(startCharCode, endCharCode, glyphID)``
    as three uint32 values. Subclasses supply ``_format_step``,
    ``_computeGIDs`` and ``_IsInSameRun``, which define how the glyph IDs
    inside one group relate to the group's ``glyphID``.
    """

    def __init__(self, format):
        self.format = format
        self.reserved = 0
        self.data = None
        self.ttFont = None

    def decompileHeader(self, data, ttFont):
        """Parse the 16-byte header and keep the group records in ``self.data``.

        Asserts that the byte count of *data*, the length implied by
        ``nGroups`` and the ``length`` field all agree.
        """
        format, reserved, length, language, nGroups = struct.unpack(">HHLLL", data[:16])
        assert (
            len(data) == (16 + nGroups * 12) == (length)
        ), "corrupt cmap table format %d (data length: %d, header length: %d)" % (
            self.format,
            len(data),
            length,
        )
        self.format = format
        self.reserved = reserved
        self.length = length
        self.language = language
        self.nGroups = nGroups
        self.data = data[16:]
        self.ttFont = ttFont

    def decompile(self, data, ttFont):
        """Expand the group records into ``self.cmap``.

        Either both *data* and *ttFont* are given (the header is decompiled
        first), or both are ``None`` and the bytes stored in ``self.data`` are
        used.
        """
        # we usually get here indirectly from the subtable __getattr__ function, in which case both args must be None.
        # If not, someone is calling the subtable decompile() directly, and must provide both args.
        if data is not None and ttFont is not None:
            self.decompileHeader(data, ttFont)
        else:
            assert (
                data is None and ttFont is None
            ), "Need both data and ttFont arguments"

        data = (
            self.data
        )  # decompileHeader assigns the data after the header to self.data
        charCodes = []
        gids = []
        pos = 0
        for i in range(self.nGroups):
            startCharCode, endCharCode, glyphID = struct.unpack(
                ">LLL", data[pos : pos + 12]
            )
            pos += 12
            lenGroup = 1 + endCharCode - startCharCode
            charCodes.extend(range(startCharCode, endCharCode + 1))
            gids.extend(self._computeGIDs(glyphID, lenGroup))
        self.data = data = None
        self.cmap = _make_map(self.ttFont, charCodes, gids)

    def compile(self, ttFont):
        """Return the binary form of the subtable, header included.

        While ``self.data`` still holds raw bytes they are re-emitted behind a
        header packed from the stored fields. Otherwise the groups are rebuilt
        from ``self.cmap``; an empty mapping produces a subtable with zero
        groups.

        Glyph names are resolved through the font's reverse glyph map. Names
        that are still unknown after a rebuild of that map are treated as
        virtual glyphs: ``gidNNN`` yields ``NNN``, anything else is passed to
        ``ttFont.getGlyphID``.

        Raises:
            KeyError: a glyph name cannot be resolved to a glyph ID.
        """
        if self.data:
            return (
                struct.pack(
                    ">HHLLL",
                    self.format,
                    self.reserved,
                    self.length,
                    self.language,
                    self.nGroups,
                )
                + self.data
            )
        charCodes = list(self.cmap.keys())
        names = list(self.cmap.values())
        nameMap = ttFont.getReverseGlyphMap()
        try:
            gids = [nameMap[name] for name in names]
        except KeyError:
            nameMap = ttFont.getReverseGlyphMap(rebuild=True)
            try:
                gids = [nameMap[name] for name in names]
            except KeyError:
                # allow virtual GIDs in format 12 tables
                gids = []
                for name in names:
                    try:
                        gid = nameMap[name]
                    except KeyError:
                        try:
                            if name[:3] == "gid":
                                gid = int(name[3:])
                            else:
                                gid = ttFont.getGlyphID(name)
                        except Exception as err:
                            # Report every lookup failure as KeyError, but let
                            # KeyboardInterrupt/SystemExit propagate.
                            raise KeyError(name) from err

                    gids.append(gid)

        # keys() and values() of one dict are in matching order.
        cmap = dict(zip(charCodes, gids))  # code:glyphID mapping

        charCodes.sort()
        dataList = []
        if charCodes:  # an empty mapping simply yields no groups
            startCharCode = charCodes[0]
            startGlyphID = cmap[startCharCode]
            # Seed the "previous" values so the first code counts as continuing
            # a run and no group is flushed before anything has been seen.
            lastGlyphID = startGlyphID - self._format_step
            lastCharCode = startCharCode - 1
            for charCode in charCodes:
                glyphID = cmap[charCode]
                if not self._IsInSameRun(glyphID, lastGlyphID, charCode, lastCharCode):
                    dataList.append(
                        struct.pack(">LLL", startCharCode, lastCharCode, startGlyphID)
                    )
                    startCharCode = charCode
                    startGlyphID = glyphID
                lastGlyphID = glyphID
                lastCharCode = charCode
            # Flush the group that was still open when the loop ended.
            dataList.append(
                struct.pack(">LLL", startCharCode, lastCharCode, startGlyphID)
            )
        nGroups = len(dataList)
        data = bytesjoin(dataList)
        lengthSubtable = len(data) + 16
        assert len(data) == (nGroups * 12) == (lengthSubtable - 16)
        return (
            struct.pack(
                ">HHLLL",
                self.format,
                self.reserved,
                lengthSubtable,
                self.language,
                nGroups,
            )
            + data
        )

    def toXML(self, writer, ttFont):
        """Write the subtable, its header fields and its sorted mappings to *writer*."""
        writer.begintag(
            self.__class__.__name__,
            [
                ("platformID", self.platformID),
                ("platEncID", self.platEncID),
                ("format", self.format),
                ("reserved", self.reserved),
                ("length", self.length),
                ("language", self.language),
                ("nGroups", self.nGroups),
            ],
        )
        writer.newline()
        codes = sorted(self.cmap.items())
        self._writeCodes(codes, writer)
        writer.endtag(self.__class__.__name__)
        writer.newline()

    def fromXML(self, name, attrs, content, ttFont):
        """Populate the subtable from a parsed XML element.

        The header fields ``format``, ``reserved``, ``length``, ``language``
        and ``nGroups`` are read from the attributes. Each tuple child tagged
        ``map`` adds one ``code -> name`` entry to ``self.cmap``; every other
        child is ignored.
        """
        self.format = safeEval(attrs["format"])
        self.reserved = safeEval(attrs["reserved"])
        self.length = safeEval(attrs["length"])
        self.language = safeEval(attrs["language"])
        self.nGroups = safeEval(attrs["nGroups"])
        if not hasattr(self, "cmap"):
            self.cmap = {}
        cmap = self.cmap

        for element in content:
            if not isinstance(element, tuple):
                continue
            name, attrs, content = element
            if name != "map":
                continue
            cmap[safeEval(attrs["code"])] = attrs["name"]
1291
1292
class cmap_format_12(cmap_format_12_or_13):
    """Format 12 flavour: glyph IDs inside a group increase by one per code."""

    # Glyph ID increment between two consecutive codes of the same group.
    _format_step = 1

    def __init__(self, format=12):
        cmap_format_12_or_13.__init__(self, format)

    def _computeGIDs(self, startingGlyph, numberOfGlyphs):
        """Return the *numberOfGlyphs* consecutive glyph IDs from *startingGlyph*."""
        return [startingGlyph + k for k in range(numberOfGlyphs)]

    def _IsInSameRun(self, glyphID, lastGlyphID, charCode, lastCharCode):
        """Tell whether the code and the glyph ID both follow their predecessor by one."""
        if glyphID != 1 + lastGlyphID:
            return False
        return charCode == 1 + lastCharCode
1304
1305
class cmap_format_13(cmap_format_12_or_13):
    """Format 13 flavour: every code of a group maps to the same glyph ID."""

    # Glyph ID increment between two consecutive codes of the same group.
    _format_step = 0

    def __init__(self, format=13):
        cmap_format_12_or_13.__init__(self, format)

    def _computeGIDs(self, startingGlyph, numberOfGlyphs):
        """Return *startingGlyph* repeated *numberOfGlyphs* times."""
        return [startingGlyph for _ in range(numberOfGlyphs)]

    def _IsInSameRun(self, glyphID, lastGlyphID, charCode, lastCharCode):
        """Tell whether the code follows its predecessor by one with an unchanged glyph ID."""
        if glyphID != lastGlyphID:
            return False
        return charCode == 1 + lastCharCode
1317
1318
def cvtToUVS(threeByteString):
    """Decode a 3-byte big-endian value into an int.

    The bytes are zero-extended to four so they can be unpacked as an
    unsigned 32-bit integer.
    """
    padded = b"\0" + threeByteString
    return struct.unpack(">L", padded)[0]
1323
1324
def cvtFromUVS(val):
    """Encode *val* as 3 big-endian bytes.

    *val* has to fit in 24 bits; it is packed as an unsigned 32-bit integer
    and the leading (zero) byte is dropped.
    """
    assert 0 <= val < 0x1000000
    return struct.pack(">L", val)[1:]
1329
1330
class cmap_format_14(CmapSubtable):
    """Format 14 ``cmap`` subtable: Unicode Variation Sequences.

    The decoded content lives in ``self.uvsDict``, which maps a variation
    selector to a list of ``(baseCodePoint, glyphName)`` pairs. A glyph name of
    ``None`` marks a default mapping, i.e. one that records no glyph of its
    own. ``self.cmap`` is kept as an empty dict only so that clients expecting
    it on every subtable do not fail.
    """

    def decompileHeader(self, data, ttFont):
        """Parse the 10-byte header and keep the remaining bytes in ``self.data``."""
        format, length, numVarSelectorRecords = struct.unpack(">HLL", data[:10])
        self.data = data[10:]
        self.length = length
        self.numVarSelectorRecords = numVarSelectorRecords
        self.ttFont = ttFont
        self.language = 0xFF  # has no language.

    def decompile(self, data, ttFont):
        """Decode the variation selector records into ``self.uvsDict``.

        Either both *data* and *ttFont* are given (the header is decompiled
        first), or both are ``None`` and the bytes stored in ``self.data`` are
        used.
        """
        if data is not None and ttFont is not None:
            self.decompileHeader(data, ttFont)
        else:
            assert (
                data is None and ttFont is None
            ), "Need both data and ttFont arguments"
        data = self.data

        self.cmap = (
            {}
        )  # so that clients that expect this to exist in a cmap table won't fail.
        uvsDict = {}
        recOffset = 0
        for n in range(self.numVarSelectorRecords):
            uvs, defOVSOffset, nonDefUVSOffset = struct.unpack(
                ">3sLL", data[recOffset : recOffset + 11]
            )
            recOffset += 11
            varUVS = cvtToUVS(uvs)
            if defOVSOffset:
                # Stored offsets count from the start of the subtable, while
                # self.data begins after the 10-byte header.
                startOffset = defOVSOffset - 10
                (numValues,) = struct.unpack(">L", data[startOffset : startOffset + 4])
                startOffset += 4
                for r in range(numValues):
                    uv, addtlCnt = struct.unpack(
                        ">3sB", data[startOffset : startOffset + 4]
                    )
                    startOffset += 4
                    firstBaseUV = cvtToUVS(uv)
                    # The record covers firstBaseUV plus addtlCnt more code points.
                    cnt = addtlCnt + 1
                    uvsDict.setdefault(varUVS, []).extend(
                        (baseUV, None) for baseUV in range(firstBaseUV, firstBaseUV + cnt)
                    )

            if nonDefUVSOffset:
                startOffset = nonDefUVSOffset - 10
                (numRecs,) = struct.unpack(">L", data[startOffset : startOffset + 4])
                startOffset += 4
                localUVList = []
                for r in range(numRecs):
                    uv, gid = struct.unpack(">3sH", data[startOffset : startOffset + 5])
                    startOffset += 5
                    uv = cvtToUVS(uv)
                    glyphName = self.ttFont.getGlyphName(gid)
                    localUVList.append((uv, glyphName))
                uvsDict.setdefault(varUVS, []).extend(localUVList)

        self.uvsDict = uvsDict
        # Drop the raw bytes, as the other subtable formats in this file do, so
        # that compile() rebuilds from uvsDict instead of re-emitting stale data.
        self.data = data = None

    def toXML(self, writer, ttFont):
        """Write one ``map`` element per (selector, base code point) pair.

        Selectors are written in ascending order. Within a selector, default
        mappings (no ``name`` attribute) come first, then the others ordered by
        code point and glyph name. The per-selector lists in ``self.uvsDict``
        are sorted in place.
        """
        writer.begintag(
            self.__class__.__name__,
            [
                ("platformID", self.platformID),
                ("platEncID", self.platEncID),
            ],
        )
        writer.newline()
        uvsDict = self.uvsDict
        uvsList = sorted(uvsDict.keys())
        for uvs in uvsList:
            uvList = uvsDict[uvs]
            uvList.sort(key=lambda item: (item[1] is not None, item[0], item[1]))
            for uv, gname in uvList:
                attrs = [("uv", hex(uv)), ("uvs", hex(uvs))]
                if gname is not None:
                    attrs.append(("name", gname))
                writer.simpletag("map", attrs)
                writer.newline()
        writer.endtag(self.__class__.__name__)
        writer.newline()

    def fromXML(self, name, attrs, content, ttFont):
        """Populate ``self.uvsDict`` from a parsed XML element.

        Each tuple child tagged ``map`` adds ``(uv, name)`` to the list stored
        under its ``uvs`` value; a missing ``name`` attribute yields ``None``.
        An already existing ``self.uvsDict`` is extended, not replaced.
        """
        self.language = 0xFF  # provide a value so that CmapSubtable.__lt__() won't fail
        if not hasattr(self, "cmap"):
            self.cmap = (
                {}
            )  # so that clients that expect this to exist in a cmap table won't fail.
        if not hasattr(self, "uvsDict"):
            self.uvsDict = {}
        # Bound unconditionally: it used to be assigned only in the branch
        # above, which left it undefined when uvsDict already existed.
        uvsDict = self.uvsDict

        # For backwards compatibility reasons we accept "None" as an indicator
        # for "default mapping", unless the font actually has a glyph named
        # "None".
        _hasGlyphNamedNone = None

        for element in content:
            if not isinstance(element, tuple):
                continue
            name, attrs, content = element
            if name != "map":
                continue
            uvs = safeEval(attrs["uvs"])
            uv = safeEval(attrs["uv"])
            gname = attrs.get("name")
            if gname == "None":
                # Look the glyph order up once, and only if it is needed.
                if _hasGlyphNamedNone is None:
                    _hasGlyphNamedNone = "None" in ttFont.getGlyphOrder()
                if not _hasGlyphNamedNone:
                    gname = None
            uvsDict.setdefault(uvs, []).append((uv, gname))

    def compile(self, ttFont):
        """Return the binary form of the subtable, header included.

        While ``self.data`` still holds raw bytes they are re-emitted behind a
        header packed from the stored fields. Otherwise the variation selector
        records and their default / non-default tables are rebuilt from
        ``self.uvsDict``, and ``self.length`` and
        ``self.numVarSelectorRecords`` are updated to match.
        """
        if self.data:
            return (
                struct.pack(
                    ">HLL", self.format, self.length, self.numVarSelectorRecords
                )
                + self.data
            )

        uvsDict = self.uvsDict
        uvsList = sorted(uvsDict.keys())
        self.numVarSelectorRecords = len(uvsList)
        offset = (
            10 + self.numVarSelectorRecords * 11
        )  # current value is end of VarSelectorRecords block.
        data = []
        varSelectorRecords = []
        for uvs in uvsList:
            entryList = uvsDict[uvs]

            defList = [entry for entry in entryList if entry[1] is None]
            if defList:
                defList = [entry[0] for entry in defList]
                defOVSOffset = offset
                defList.sort()

                # Collapse consecutive code points into (start, additionalCount)
                # records. additionalCount is packed as one unsigned byte, so a
                # record holds at most 256 code points; longer runs are split.
                defRecs = []
                runStart = defList[0]
                runLen = 0  # code points already placed in the current run
                for defEntry in defList:
                    if defEntry != runStart + runLen or runLen == 256:
                        defRecs.append(
                            struct.pack(">3sB", cvtFromUVS(runStart), runLen - 1)
                        )
                        runStart = defEntry
                        runLen = 0
                    runLen += 1
                defRecs.append(struct.pack(">3sB", cvtFromUVS(runStart), runLen - 1))

                numDefRecs = len(defRecs)
                data.append(struct.pack(">L", numDefRecs))
                data.extend(defRecs)
                offset += 4 + numDefRecs * 4
            else:
                defOVSOffset = 0

            ndefList = [entry for entry in entryList if entry[1] is not None]
            if ndefList:
                nonDefUVSOffset = offset
                ndefList.sort()
                numNonDefRecs = len(ndefList)
                data.append(struct.pack(">L", numNonDefRecs))
                offset += 4 + numNonDefRecs * 5

                for uv, gname in ndefList:
                    gid = ttFont.getGlyphID(gname)
                    ndrec = struct.pack(">3sH", cvtFromUVS(uv), gid)
                    data.append(ndrec)
            else:
                nonDefUVSOffset = 0

            vrec = struct.pack(">3sLL", cvtFromUVS(uvs), defOVSOffset, nonDefUVSOffset)
            varSelectorRecords.append(vrec)

        data = bytesjoin(varSelectorRecords) + bytesjoin(data)
        self.length = 10 + len(data)
        headerdata = struct.pack(
            ">HLL", self.format, self.length, self.numVarSelectorRecords
        )

        return headerdata + data
1526
1527
class cmap_format_unknown(CmapSubtable):
    """Opaque cmap subtable whose contents are kept as raw bytes.

    The payload is never interpreted: it is stored in ``self.data`` when the
    header is decompiled, dumped as hex by ``toXML``, read back from hex by
    ``fromXML`` and handed back untouched by ``compile``.
    """

    def toXML(self, writer, ttFont):
        """Write the subtable as an XML element containing a hex dump."""
        # Element name is the first 12 characters of the class name
        # ("cmap_format_" for this class) followed by the format number.
        elementName = self.__class__.__name__[:12] + str(self.format)
        attributes = [
            ("platformID", self.platformID),
            ("platEncID", self.platEncID),
        ]
        writer.begintag(elementName, attributes)
        writer.newline()
        writer.dumphex(self.data)
        writer.endtag(elementName)
        writer.newline()

    def fromXML(self, name, attrs, content, ttFont):
        """Restore the raw bytes from hex content; the mapping stays empty."""
        self.data = readHex(content)
        self.cmap = {}

    def decompileHeader(self, data, ttFont):
        """Keep ``data`` as-is; there is no header to parse for this class."""
        self.language = 0  # dummy value
        self.data = data

    def decompile(self, data, ttFont):
        """Store ``data`` when called directly; otherwise do nothing.

        Usually reached indirectly from the subtable ``__getattr__`` function,
        in which case both arguments must be None. A direct caller must
        provide both arguments.
        """
        if data is None or ttFont is None:
            # Indirect call: a lone argument is a caller error.
            assert (
                data is None and ttFont is None
            ), "Need both data and ttFont arguments"
            return
        self.decompileHeader(data, ttFont)

    def compile(self, ttFont):
        """Return the stored raw bytes, or None when there are none."""
        return self.data or None
1566
1567
# Registry mapping a numeric cmap subtable format to the class that
# implements it.
# NOTE(review): presumably the lookup code falls back to cmap_format_unknown
# for any format that is not a key here -- confirm at the call site.
cmap_classes = {
    0: cmap_format_0,
    2: cmap_format_2,
    4: cmap_format_4,
    6: cmap_format_6,
    12: cmap_format_12,
    13: cmap_format_13,
    14: cmap_format_14,
}
1577