xref: /aosp_15_r20/external/jsoup/src/test/java/org/jsoup/nodes/BuildEntities.java (revision 6da8f8c4bc310ad659121b84dd089062417a2ce2)
1 package org.jsoup.nodes;
2 
3 import com.google.gson.Gson;
4 import com.google.gson.reflect.TypeToken;
5 import org.jsoup.Connection;
6 import org.jsoup.Jsoup;
7 import org.jsoup.integration.UrlConnectTest;
8 
9 import java.io.File;
10 import java.io.FileWriter;
11 import java.io.IOException;
12 import java.nio.file.Files;
13 import java.util.ArrayList;
14 import java.util.Comparator;
15 import java.util.Map;
16 
17 /**
18  * Fetches HTML entity names from w3.org json, and outputs data files for optimized used in Entities.
19  * I refuse to believe that entity names like "NotNestedLessLess" are valuable or useful for HTML authors. Implemented
20  * only to be complete.
21  */
22 class BuildEntities {
main(String[] args)23     public static void main(String[] args) throws IOException {
24         String url = "https://www.w3.org/TR/2012/WD-html5-20121025/entities.json";
25         Connection.Response res = Jsoup.connect(url)
26             .ignoreContentType(true)
27             .userAgent(UrlConnectTest.browserUa)
28             .execute();
29 
30         Gson gson = new Gson();
31         Map<String, CharacterRef> input = gson.fromJson(res.body(),
32             new TypeToken<Map<String, CharacterRef>>() {
33             }.getType());
34 
35 
36         // build name sorted base and full character lists:
37         ArrayList<CharacterRef> base = new ArrayList<>();
38         ArrayList<CharacterRef> full = new ArrayList<>();
39 
40         for (Map.Entry<String, CharacterRef> entry : input.entrySet()) {
41             String name = entry.getKey().substring(1); // name is like &acute or &acute; , trim &
42             CharacterRef ref = entry.getValue();
43             if (name.endsWith(";")) {
44                 name = name.substring(0, name.length() - 1);
45                 full.add(ref);
46             } else {
47                 base.add(ref);
48             }
49             ref.name = name;
50         }
51         base.sort(byName);
52         full.sort(byName);
53 
54         // now determine code point order
55         ArrayList<CharacterRef> baseByCode = new ArrayList<>(base);
56         ArrayList<CharacterRef> fullByCode = new ArrayList<>(full);
57         baseByCode.sort(byCode);
58         fullByCode.sort(byCode);
59 
60         // and update their codepoint index.
61         @SuppressWarnings("unchecked") ArrayList<CharacterRef>[] codelists = new ArrayList[]{baseByCode, fullByCode};
62         for (ArrayList<CharacterRef> codelist : codelists) {
63             for (int i = 0; i < codelist.size(); i++) {
64                 codelist.get(i).codeIndex = i;
65             }
66         }
67 
68         // now write them
69         persist("entities-full", full);
70         persist("entities-base", base);
71 
72         System.out.println("Full size: " + full.size() + ", base size: " + base.size());
73     }
74 
persist(String name, ArrayList<CharacterRef> refs)75     private static void persist(String name, ArrayList<CharacterRef> refs) throws IOException {
76         File file = Files.createTempFile(name, ".txt").toFile();
77         FileWriter writer = new FileWriter(file, false);
78         writer.append("static final String points = \"");
79         for (CharacterRef ref : refs) {
80             writer.append(ref.toString()).append('&');
81         }
82         writer.append("\";\n");
83         writer.close();
84 
85         System.out.println("Wrote " + name + " to " + file.getAbsolutePath());
86     }
87 
88 
89     private static class CharacterRef {
90         int[] codepoints;
91         String name;
92         int codeIndex;
93 
94         @Override
toString()95         public String toString() {
96             return name
97                 + "="
98                 + d(codepoints[0])
99                 + (codepoints.length > 1 ? "," + d(codepoints[1]) : "")
100                 + ";" + d(codeIndex);
101         }
102     }
103 
d(int d)104     private static String d(int d) {
105         return Integer.toString(d, Entities.codepointRadix);
106     }
107 
108     private static class ByName implements Comparator<CharacterRef> {
compare(CharacterRef o1, CharacterRef o2)109         public int compare(CharacterRef o1, CharacterRef o2) {
110             return o1.name.compareTo(o2.name);
111         }
112     }
113 
114     private static class ByCode implements Comparator<CharacterRef> {
compare(CharacterRef o1, CharacterRef o2)115         public int compare(CharacterRef o1, CharacterRef o2) {
116             int[] c1 = o1.codepoints;
117             int[] c2 = o2.codepoints;
118             int first = c1[0] - c2[0];
119             if (first != 0)
120                 return first;
121             if (c1.length == 1 && c2.length == 1) { // for the same code, use the shorter name
122                 int len = o2.name.length() - o1.name.length();
123                 if (len != 0)
124                     return len;
125                 return o1.name.compareTo(o2.name);
126             }
127             if (c1.length == 2 && c2.length == 2)
128                 return c1[1] - c2[1];
129             else
130                 return c2.length - c1.length; // pushes multi down the list so hits on singles first (don't support multi lookup by codepoint yet)
131         }
132     }
133 
134     private static ByName byName = new ByName();
135     private static ByCode byCode = new ByCode();
136 }
137