1 package org.jsoup.nodes; 2 3 import com.google.gson.Gson; 4 import com.google.gson.reflect.TypeToken; 5 import org.jsoup.Connection; 6 import org.jsoup.Jsoup; 7 import org.jsoup.integration.UrlConnectTest; 8 9 import java.io.File; 10 import java.io.FileWriter; 11 import java.io.IOException; 12 import java.nio.file.Files; 13 import java.util.ArrayList; 14 import java.util.Comparator; 15 import java.util.Map; 16 17 /** 18 * Fetches HTML entity names from w3.org json, and outputs data files for optimized used in Entities. 19 * I refuse to believe that entity names like "NotNestedLessLess" are valuable or useful for HTML authors. Implemented 20 * only to be complete. 21 */ 22 class BuildEntities { main(String[] args)23 public static void main(String[] args) throws IOException { 24 String url = "https://www.w3.org/TR/2012/WD-html5-20121025/entities.json"; 25 Connection.Response res = Jsoup.connect(url) 26 .ignoreContentType(true) 27 .userAgent(UrlConnectTest.browserUa) 28 .execute(); 29 30 Gson gson = new Gson(); 31 Map<String, CharacterRef> input = gson.fromJson(res.body(), 32 new TypeToken<Map<String, CharacterRef>>() { 33 }.getType()); 34 35 36 // build name sorted base and full character lists: 37 ArrayList<CharacterRef> base = new ArrayList<>(); 38 ArrayList<CharacterRef> full = new ArrayList<>(); 39 40 for (Map.Entry<String, CharacterRef> entry : input.entrySet()) { 41 String name = entry.getKey().substring(1); // name is like ´ or ´ , trim & 42 CharacterRef ref = entry.getValue(); 43 if (name.endsWith(";")) { 44 name = name.substring(0, name.length() - 1); 45 full.add(ref); 46 } else { 47 base.add(ref); 48 } 49 ref.name = name; 50 } 51 base.sort(byName); 52 full.sort(byName); 53 54 // now determine code point order 55 ArrayList<CharacterRef> baseByCode = new ArrayList<>(base); 56 ArrayList<CharacterRef> fullByCode = new ArrayList<>(full); 57 baseByCode.sort(byCode); 58 fullByCode.sort(byCode); 59 60 // and update their codepoint index. 61 @SuppressWarnings("unchecked") ArrayList<CharacterRef>[] codelists = new ArrayList[]{baseByCode, fullByCode}; 62 for (ArrayList<CharacterRef> codelist : codelists) { 63 for (int i = 0; i < codelist.size(); i++) { 64 codelist.get(i).codeIndex = i; 65 } 66 } 67 68 // now write them 69 persist("entities-full", full); 70 persist("entities-base", base); 71 72 System.out.println("Full size: " + full.size() + ", base size: " + base.size()); 73 } 74 persist(String name, ArrayList<CharacterRef> refs)75 private static void persist(String name, ArrayList<CharacterRef> refs) throws IOException { 76 File file = Files.createTempFile(name, ".txt").toFile(); 77 FileWriter writer = new FileWriter(file, false); 78 writer.append("static final String points = \""); 79 for (CharacterRef ref : refs) { 80 writer.append(ref.toString()).append('&'); 81 } 82 writer.append("\";\n"); 83 writer.close(); 84 85 System.out.println("Wrote " + name + " to " + file.getAbsolutePath()); 86 } 87 88 89 private static class CharacterRef { 90 int[] codepoints; 91 String name; 92 int codeIndex; 93 94 @Override toString()95 public String toString() { 96 return name 97 + "=" 98 + d(codepoints[0]) 99 + (codepoints.length > 1 ? "," + d(codepoints[1]) : "") 100 + ";" + d(codeIndex); 101 } 102 } 103 d(int d)104 private static String d(int d) { 105 return Integer.toString(d, Entities.codepointRadix); 106 } 107 108 private static class ByName implements Comparator<CharacterRef> { compare(CharacterRef o1, CharacterRef o2)109 public int compare(CharacterRef o1, CharacterRef o2) { 110 return o1.name.compareTo(o2.name); 111 } 112 } 113 114 private static class ByCode implements Comparator<CharacterRef> { compare(CharacterRef o1, CharacterRef o2)115 public int compare(CharacterRef o1, CharacterRef o2) { 116 int[] c1 = o1.codepoints; 117 int[] c2 = o2.codepoints; 118 int first = c1[0] - c2[0]; 119 if (first != 0) 120 return first; 121 if (c1.length == 1 && c2.length == 1) { // for the same code, use the shorter name 122 int len = o2.name.length() - o1.name.length(); 123 if (len != 0) 124 return len; 125 return o1.name.compareTo(o2.name); 126 } 127 if (c1.length == 2 && c2.length == 2) 128 return c1[1] - c2[1]; 129 else 130 return c2.length - c1.length; // pushes multi down the list so hits on singles first (don't support multi lookup by codepoint yet) 131 } 132 } 133 134 private static ByName byName = new ByName(); 135 private static ByCode byCode = new ByCode(); 136 } 137