package org.unicode.cldr.tool;

import com.ibm.icu.impl.Relation;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.TreeMap;
import org.unicode.cldr.draft.FileUtilities;

/**
 * Take mappings to IPA and interleave them.
 *
 * <p>The rules are loaded once, in the constructor, from {@code internal_matchIpaRules.txt} under
 * {@link #cldrDataDir}. Each rule pairs a run of English letters with a run of IPA, plus an
 * optional "fixed" IPA replacement. {@link #interleaveIPA} then searches, with backtracking, for a
 * sequence of rules that consumes an English word and its IPA transcription in parallel.
 *
 * <p>The search keeps its working state (current word, current IPA, output list, trace
 * information) in instance fields, so a single instance must not be used for two matches at the
 * same time.
 */
public class MatchStrings {

    /** Directory containing {@code internal_matchIpaRules.txt}. */
    static String cldrDataDir =
            "C:\\cvsdata\\unicode\\cldr\\tools\\java\\org\\unicode\\cldr\\util\\data\\transforms\\";

    /** One correspondence rule: an English letter run, its IPA, and the corrected IPA. */
    static class Info {
        String english;

        String ipa;

        String fixedIpa;

        /**
         * @param english the English letters covered by this rule (may be empty)
         * @param ipa the IPA that those letters correspond to (may be empty)
         * @param fixedIpa the IPA to emit instead of {@code ipa}; must not be null
         */
        public Info(String english, String ipa, String fixedIpa) {
            this.english = english;
            this.ipa = ipa;
            // Share the instance when the two are equal, so equal values are also identical.
            this.fixedIpa = fixedIpa.equals(ipa) ? ipa : fixedIpa;
        }

        /** Formats as <code>{english/ipa}</code>, or <code>{english/ipa/fixedIpa}</code> if they differ. */
        @Override
        public String toString() {
            // Compare by value: the fields are mutable, so identity is not a safe test.
            return "{" + english + "/" + ipa + (fixedIpa.equals(ipa) ? "" : "/" + fixedIpa) + "}";
        }
    }

    /**
     * Rules indexed by the first letter of their English part; rules whose English part is empty
     * are stored under the empty string. Rules under one key keep their insertion order.
     */
    Relation<String, Info> letter_correspondances =
            Relation.of(new TreeMap<String, Set<Info>>(), LinkedHashSet.class);

    /**
     * Loads the rules file. Each non-empty line is split on whitespace into {@code english},
     * optional {@code ipa} (default empty) and optional {@code fixedIpa} (default {@code ipa}).
     *
     * @throws IOException if the rules file cannot be opened or read
     */
    MatchStrings() throws IOException {
        // try-with-resources so the reader is closed even when readLine() throws.
        try (BufferedReader in =
                FileUtilities.openUTF8Reader(cldrDataDir, "internal_matchIpaRules.txt")) {
            String line;
            while ((line = in.readLine()) != null) {
                if (line.isEmpty()) {
                    continue;
                }
                String[] parts = line.split("\\s+");
                if (parts.length == 0) {
                    // A whitespace-only line splits into nothing; there is no rule to add.
                    continue;
                }
                String ipa = parts.length > 1 ? parts[1] : "";
                add(parts[0], ipa, parts.length > 2 ? parts[2] : ipa);
            }
        }
    }

    /**
     * Registers one rule under the first letter of {@code english} (or under the empty string when
     * {@code english} is empty).
     *
     * @param english the English letters of the rule
     * @param ipa the IPA those letters correspond to
     * @param fixedIpa the corrected IPA to emit for this rule
     */
    void add(String english, String ipa, String fixedIpa) {
        String key = english.isEmpty() ? "" : english.substring(0, 1);
        letter_correspondances.put(key, new Info(english, ipa, fixedIpa));
    }

    /**
     * Insert the IPA in after the string, such as baitt + /bet/ => b{b}ai{e}t{t}
     *
     * @param english the English word to align
     * @param ipa the IPA transcription of that word
     * @param output receives the matched rules in order; it is cleared first, and is left empty
     *     when no alignment is found
     * @return 1 if the whole of {@code english} and {@code ipa} could be aligned, otherwise 0
     */
    int interleaveIPA(String english, String ipa, List<Info> output) {
        highWater = 0;
        longestEnglish = 0;
        longestIpa = 0;
        highWaterList.clear();
        this.english = english;
        this.ipa = ipa;
        this.output = output;
        output.clear();
        return interleave2(0, 0);
    }

    // Working state of the match currently in progress; set by interleaveIPA.
    String english;
    String ipa;
    List<Info> output;

    /** Furthest IPA offset reached so far by any attempted alignment. */
    int highWater = 0;

    /** Snapshot of {@link #output} taken when {@link #highWater} was last advanced. */
    List<Info> highWaterList = new ArrayList<>();

    /** English offset at the time {@link #highWater} was last advanced. */
    private int longestEnglish;

    /** IPA offset at the time {@link #highWater} was last advanced. */
    private int longestIpa;

    /**
     * Recursively match the string. Right now, we just take the matches in order; later we could
     * try a weighted fit
     *
     * @param englishPosition offset in {@link #english} of the first unmatched letter
     * @param ipaPosition offset in {@link #ipa} of the first unmatched character
     * @return 1 if the remainder of both strings could be aligned, otherwise 0
     */
    private int interleave2(int englishPosition, int ipaPosition) {

        if (highWater < ipaPosition) {
            // New furthest point in the IPA: remember how we got here for getTrace().
            highWaterList.clear();
            highWaterList.addAll(output);
            // Store the IPA offset, the same unit the test above compares against.
            highWater = ipaPosition;
            longestEnglish = englishPosition;
            longestIpa = ipaPosition;
        }
        if (englishPosition == english.length()) {
            // Out of English: success only if the IPA is used up as well.
            return ipaPosition == ipa.length() ? 1 : 0;
        }
        String firstLetter = english.substring(englishPosition, englishPosition + 1);
        Set<Info> possibilities = letter_correspondances.getAll(firstLetter);
        if (possibilities != null) {
            int result = checkPossibilities(possibilities, englishPosition, ipaPosition);
            if (result != 0) {
                return result;
            }
        }

        // we failed, try the empty string
        possibilities = letter_correspondances.getAll("");
        if (possibilities != null) {
            int result = checkPossibilities(possibilities, englishPosition, ipaPosition);
            if (result != 0) {
                return result;
            }
        }

        // we failed to find a pair. Make last check to see if we just
        // delete one English letter.
        // This is skipped when the previous entry already has empty IPA.
        Info last = output.isEmpty() ? null : output.get(output.size() - 1);
        if (last == null || last.ipa.length() != 0) {
            output.add(new Info(firstLetter, "", ""));
            int result = interleave2(englishPosition + 1, ipaPosition);
            if (result == 1) {
                return 1;
            }
            // if we fail, then remove the pair, and continue
            output.remove(output.size() - 1);
        }

        // if we get this far, we've exhausted the possibilities, so fail
        return 0;
    }

    /**
     * Tries each candidate rule in iteration order at the given positions, recursing on the first
     * ones that fit and backtracking when the rest of the strings cannot be aligned.
     *
     * @param possibilities candidate rules to try
     * @param englishPosition offset in {@link #english} at which the rule's English must match
     * @param ipaPosition offset in {@link #ipa} at which the rule's IPA must match
     * @return 1 if some candidate leads to a complete alignment, otherwise 0
     */
    int checkPossibilities(Collection<Info> possibilities, int englishPosition, int ipaPosition) {
        for (Info englishIpa : possibilities) {
            // skip if we don't match
            String englishPart = englishIpa.english;
            String ipaPart = englishIpa.ipa;
            if (!english.regionMatches(englishPosition, englishPart, 0, englishPart.length())) {
                continue;
            }
            int matchesUpTo = matchAtIgnoring(ipaPosition, ipaPart);
            if (matchesUpTo < 0) {
                continue;
            }
            // we match, so recurse
            output.add(englishIpa);
            int result = interleave2(englishPosition + englishPart.length(), matchesUpTo);
            if (result == 1) {
                return 1;
            }
            // if we fail, then remove the pair, and continue
            output.remove(output.size() - 1);
        }
        return 0;
    }

    /**
     * Does ipaPart match ipa at the position, ignoring stress marks in ipa? Returns how far it got.
     *
     * @param ipaPosition offset in {@link #ipa} at which to start matching
     * @param ipaPart the IPA text of the rule being tested
     * @return the offset in {@link #ipa} just past the match ({@code ipaPosition} itself when
     *     {@code ipaPart} is empty), or -1 if {@code ipaPart} does not match there
     */
    private int matchAtIgnoring(int ipaPosition, String ipaPart) {
        if (ipaPart.length() == 0) {
            return ipaPosition;
        }
        int j = 0;
        for (int i = ipaPosition; i < ipa.length(); ++i) {
            char ch = ipa.charAt(i);
            if (ch == 'ˈ' || ch == 'ˌ') {
                // Stress marks in the transcription are skipped, not compared.
                continue;
            }
            char ch2 = ipaPart.charAt(j++);
            if (ch != ch2) {
                return -1;
            }
            if (j >= ipaPart.length()) {
                return i + 1;
            }
        }
        // Ran out of transcription before the whole rule was matched.
        return -1;
    }

    /** Scratch list reused by {@link #fixIPA} to receive the alignment. */
    List<Info> current = new ArrayList<>();

    /**
     * Fix the IPA in a string
     *
     * @param english the English word
     * @param ipa the IPA transcription of that word
     * @return the concatenated {@code fixedIpa} of every matched rule, or null if the word and
     *     transcription could not be aligned
     */
    String fixIPA(String english, String ipa) {
        int result = interleaveIPA(english, ipa, current);
        if (result == 0) {
            return null;
        }
        StringBuilder buffer = new StringBuilder();
        for (Info englishIpa : current) {
            buffer.append(englishIpa.fixedIpa);
        }
        return buffer.toString();
    }

    /**
     * Describes how far the most recent match got: the rules recorded at the high-water mark,
     * followed by the English and IPA remainders from that point. Only meaningful after {@link
     * #interleaveIPA} or {@link #fixIPA} has been called.
     *
     * @return the trace string
     */
    String getTrace() {
        return highWaterList.toString()
                + "\t\t"
                + english.substring(longestEnglish)
                + "\t≠\t"
                + ipa.substring(longestIpa);
    }
}