1 // © 2019 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 package org.unicode.icu.tool.cldrtoicu; 4 5 import static com.google.common.base.CharMatcher.whitespace; 6 import static com.google.common.base.Preconditions.checkArgument; 7 import static com.google.common.base.Preconditions.checkState; 8 import static com.google.common.collect.ImmutableList.toImmutableList; 9 10 import java.util.Arrays; 11 import java.util.Comparator; 12 import java.util.Objects; 13 import java.util.function.Function; 14 15 import com.google.common.base.CharMatcher; 16 import com.google.common.base.Splitter; 17 import com.google.common.collect.Comparators; 18 import com.google.common.collect.ImmutableList; 19 import com.google.common.collect.Iterables; 20 21 /** 22 * A resource bundle path, used to identify entries in ICU data. 23 * 24 * <p>Immutable and thread safe. 25 */ 26 public final class RbPath implements Comparable<RbPath> { 27 private static final Splitter PATH_SPLITTER = Splitter.on('/').trimResults(); 28 29 // This defines ordering of paths in IcuData instances and thus the order in ICU data files. 30 // If there's ever a reason to have a different "natural" order for paths, this Comparator 31 // should be moved into the ICU file writer class(es). 32 private static final Comparator<RbPath> ORDERING = 33 Comparator.comparing( 34 p -> p.segments, 35 Comparators.lexicographical(Comparator.<String>naturalOrder())); 36 37 // Matches the definition of invariant characters in "uinvchar.cpp". We can make this all much 38 // faster if needed with a custom matcher (it's just a 128 way bit lookup via 2 longs). 39 private static final CharMatcher INVARIANT_CHARS = 40 CharMatcher.ascii().and(CharMatcher.anyOf("!#$@[\\]^`{|}~").negate()); 41 42 // Note that we must also prohibit double-quote from appearing anywhere other than surrounding 43 // segment values. This is because some segment values can contain special ICU data characters 44 // (e.g. ':') but must be treated as literals. There is not proper "escaping" mechanism in ICU 45 // data for key values (since '\' is not an invariant, things like \\uxxxx are not possible). 46 // 47 // Ideally quoting would be done when the file is written, but that would require additional 48 // complexity in RbPath, since suffixes like ":intvector" must not be quoted and must somehow 49 // be distinguished from timezone "metazone" names which also contain ':'. 50 private static final CharMatcher QUOTED_SEGMENT_CHARS = 51 INVARIANT_CHARS 52 .and(CharMatcher.javaIsoControl().negate()) 53 .and(CharMatcher.isNot('"')); 54 private static final CharMatcher UNQUOTED_SEGMENT_CHARS = 55 QUOTED_SEGMENT_CHARS.and(whitespace().negate()); 56 57 /** 58 * Returns a path with the specified segments in (possibly empty). Note that unlike 59 * {@link #parse(String)}, {@code '/'} is not treated specially and can be present in a path 60 * element constructed by this method. 61 */ of(String... segments)62 public static RbPath of(String... segments) { 63 return of(Arrays.asList(segments)); 64 } 65 66 /** 67 * Returns a path with the specified segments in (possibly empty). Note that unlike 68 * {@link #parse(String)}, {@code '/'} is not treated specially and can be present in a path 69 * element constructed by this method. 70 */ of(Iterable<String> segments)71 public static RbPath of(Iterable<String> segments) { 72 return new RbPath(segments); 73 } 74 75 /** Parses the given path string, assuming {@code '/'} as a path separator. */ parse(String path)76 public static RbPath parse(String path) { 77 checkArgument(!path.isEmpty(), "cannot parse an empty path string"); 78 // Allow leading '/', but don't allow empty segments anywhere else. 79 if (path.startsWith("/")) { 80 path = path.substring(1); 81 } 82 return new RbPath(PATH_SPLITTER.split(path)); 83 } 84 85 /** Returns the common prefix length of two paths (useful when thinking of path hierarchies). */ getCommonPrefixLength(RbPath lhs, RbPath rhs)86 public static int getCommonPrefixLength(RbPath lhs, RbPath rhs) { 87 int maxLength = Math.min(lhs.length(), rhs.length()); 88 int n = 0; 89 while (n < maxLength && lhs.getSegment(n).equals(rhs.getSegment(n))) { 90 n++; 91 } 92 return n; 93 } 94 95 private final ImmutableList<String> segments; 96 private final int hashCode; 97 RbPath(Iterable<String> segments)98 private RbPath(Iterable<String> segments) { 99 this.segments = ImmutableList.copyOf(segments); 100 // Use "this.segments" since the incoming list can have a different hash! 101 this.hashCode = Objects.hash(this.segments); 102 for (String segment : this.segments) { 103 checkArgument(!segment.isEmpty(), "path segments must not be empty: %s", this.segments); 104 // Either the label is quoted (e.g. "foo") or it is bar (e.g. foo) but it can only 105 // contain double quotes at either end, or not at all. If the string is quoted, only 106 // validate the content, and not the quotes themselves. 107 switch (segment.charAt(0)) { 108 case '<': 109 // Allow anything in hidden labels, since they will be removed later and never 110 // appear in the final ICU data. 111 checkArgument(segment.endsWith(">"), 112 "mismatched quoting for hidden label: %s", segment); 113 continue; 114 115 case '"': 116 checkArgument(segment.endsWith("\""), 117 "mismatched quoting for segment: %s", segment); 118 checkArgument( 119 QUOTED_SEGMENT_CHARS.matchesAllOf(segment.substring(1, segment.length() - 1)), 120 "invalid character in unquoted resource bundle path segment: %s", segment); 121 break; 122 123 default: 124 checkArgument( 125 UNQUOTED_SEGMENT_CHARS.matchesAllOf(segment), 126 "invalid character in unquoted resource bundle path segment: %s", segment); 127 break; 128 } 129 } 130 } 131 132 /** Returns the number of segments in this path. */ length()133 public int length() { 134 return segments.size(); 135 } 136 137 /** Returns the Nth segments in this path. */ getSegment(int n)138 public String getSegment(int n) { 139 return segments.get(n); 140 } 141 142 /** Returns a new path extended at the end by the specified segments. */ extendBy(String... parts)143 public RbPath extendBy(String... parts) { 144 return new RbPath(Iterables.concat(segments, Arrays.asList(parts))); 145 } 146 147 /** Returns whether this path starts with the specified prefix. */ startsWith(RbPath prefix)148 public boolean startsWith(RbPath prefix) { 149 return prefix.length() <= length() && matchesSublist(prefix, 0); 150 } 151 152 /** Returns whether this path ends with the specified suffix. */ endsWith(RbPath suffix)153 public boolean endsWith(RbPath suffix) { 154 return suffix.length() <= length() && matchesSublist(suffix, length() - suffix.length()); 155 } 156 157 /** Returns whether this path contains the specified path. */ contains(RbPath path)158 public boolean contains(RbPath path) { 159 int maxOffset = length() - path.length(); 160 for (int i = 0; i <= maxOffset; i++) { 161 if (matchesSublist(path, i)) { 162 return true; 163 } 164 } 165 return false; 166 } 167 168 // Assume length check has been done. matchesSublist(RbPath path, int offset)169 private boolean matchesSublist(RbPath path, int offset) { 170 for (int i = 0; i < path.length(); i++) { 171 if (!path.getSegment(i).equals(getSegment(i + offset))) { 172 return false; 173 } 174 } 175 return true; 176 } 177 178 // TODO: Remove this and isAnonymous() since they are only called once each, in the same place. getParent()179 public RbPath getParent() { 180 checkState(length() > 0, "cannot get parent of the empty path"); 181 return new RbPath(segments.subList(0, length() - 1)); 182 } 183 isAnonymous()184 public boolean isAnonymous() { 185 return length() > 0 && segments.get(length() - 1).charAt(0) == '<'; 186 } 187 188 // TODO: Remove this special case code (called exactly once). mapSegments(Function<? super String, String> fn)189 public RbPath mapSegments(Function<? super String, String> fn) { 190 return new RbPath(segments.stream().map(fn).collect(toImmutableList())); 191 } 192 193 // TODO: Remove this and isAlias() in favour of having properly typed paths. isIntPath()194 public boolean isIntPath() { 195 return typeSuffixIsAnyOf(":int", ":intvector"); 196 } 197 isBinPath()198 public boolean isBinPath() { 199 return typeSuffixIsAnyOf(":bin"); 200 } 201 isAlias()202 public boolean isAlias() { 203 return typeSuffixIsAnyOf(":alias"); 204 } 205 typeSuffixIsAnyOf(String... types)206 private boolean typeSuffixIsAnyOf(String... types) { 207 String lastElement = getSegment(length() - 1); 208 for (String type : types) { 209 if (lastElement.endsWith(type)) { 210 return true; 211 } 212 } 213 return false; 214 } 215 compareTo(RbPath other)216 @Override public int compareTo(RbPath other) { 217 return ORDERING.compare(this, other); 218 } 219 equals(Object other)220 @Override public boolean equals(Object other) { 221 return (other instanceof RbPath) && segments.equals(((RbPath) other).segments); 222 } 223 hashCode()224 @Override public int hashCode() { 225 return hashCode; 226 } 227 toString()228 @Override public String toString() { 229 return String.join("/", segments); 230 } 231 } 232