#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# created on: 2013jun05
# created by: Markus W. Scherer

"""Converts CLDR collation files from XML syntax to ICU syntax.

Handles the CLDR collation data in the post-CLDR 23 trunk in 2013 June.
Preserves indentation (except where it joins lines) and text vs. NCR etc.
Does not handle arbitrary LDML XML collation syntax."""

# Invoke with two arguments:
# - the source folder path
# - the destination folder path
# For example:
#   ~/svn.cldr$ collicu/tools/scripts/coll2icu.py trunk/common/collation collicu/common/collation

import codecs
import glob
import os.path
import sys


def GetIndent(s):
    """Returns the leading run of spaces and tabs of s.

    If s consists only of spaces and tabs (or is empty), all of s is returned.
    """
    return s[:len(s) - len(s.lstrip(" \t"))]


# Substring replacements, applied to each rules line in this order.
# Each entry is (XML text to find, ICU rule text to substitute).
#
# NOTE(review): characters that XML can spell as a predefined entity
# (&quot; &amp; &apos; &lt; &gt;) are listed both literally and in entity form.
# The entity-form entries were reconstructed; confirm them against the CLDR
# collation data before relying on them.
replacements = (
    # White space and syntax characters must be quoted.
    # Using '\\u0020' rather than just ' ' for clarity.
    ("<reset> </reset>", "&'\\u0020'"),  # can't just replace all "> <"
    (">!<", ">'!'<"),
    ('>"<', ">'\\\"'<"),
    (">&quot;<", ">'\\\"'<"),
    (">#<", ">'\\u0023'<"),
    (">$<", ">'$'<"),
    (">%<", ">'%'<"),
    (">&<", ">'&'<"),
    (">&amp;<", ">'&'<"),
    (">'<", ">''<"),
    (">&apos;<", ">''<"),
    (">(<", ">'('<"),
    (">)<", ">')'<"),
    (">*<", ">'*'<"),
    (">+<", ">'+'<"),
    (">,<", ">','<"),
    (">-<", ">'-'<"),
    (">.<", ">'.'<"),
    (">/<", ">'/'<"),
    (">:<", ">':'<"),
    (">;<", ">';'<"),
    ("><<", ">'<'<"),
    (">&lt;<", ">'<'<"),
    (">=<", ">'='<"),
    (">><", ">'>'<"),
    (">&gt;<", ">'>'<"),
    (">?<", ">'?'<"),
    (">@<", ">'@'<"),
    (">[<", ">'['<"),
    (">\\<", ">'\\\\'<"),
    (">]<", ">']'<"),
    (">^<", ">'^'<"),
    (">_<", ">'_'<"),
    (">`<", ">'`'<"),
    (">{<", ">'{'<"),
    (">|<", ">'|'<"),
    (">}<", ">'}'<"),
    (">~<", ">'~'<"),
    # ha.xml has the following
    ("'y", "''y"),
    ("'Y", "''Y"),
    # kl.xml has the following
    ("K'", "K''"),
    # not Pattern_White_Space, just obscure
    (u"\u00A0", u"\\u00A0"),
    (u"\u200C", u"\\u200C"),
    (u"\u200D", u"\\u200D"),
    (u"\u3000", u"\\u3000"),
    # obscure, and some tools do not handle noncharacters well
    (u"\uFDD0", u"'\\uFDD0'"),
    # The old ICU collation rule parser seems to need more escaping than it should.
    (u"≠", u"'≠'"),
    # fi.xml resets contain a space
    (u" ̵</reset>", u"'\\u0020'̵"),
    # fa.xml <sc> with non-NFD_Inert chars
    (u"<sc>\u0650\u064f\u064b\u064d\u064c</sc>", u"<<\u0650<<\u064f<<\u064b<<\u064d<<\u064c"),
    # ml.xml strings contain spaces
    (u" </s>", u"'\\u0020'"),
    (u" </reset>", u"'\\u0020'"),
    # vi.xml <sc> with non-NFD_Inert chars
    (u"<sc>\u0309\u0303\u0301\u0323</sc>", u"<<\u0309<<\u0303<<\u0301<<\u0323"),
    # en_US_POSIX needs a lot of quoting.
    ('<pc> !"#$%&\'()*+,-./</pc>', "<*'\\u0020'-'/'"),
    ("<pc> !&quot;#$%&amp;'()*+,-./</pc>", "<*'\\u0020'-'/'"),
    ("<pc>0123456789:;<=>?@</pc>", "<*0-'@'"),
    ("<pc>0123456789:;&lt;=&gt;?@</pc>", "<*0-'@'"),
    ("<pc>[\\]^_`</pc>", "<*'['-'`'"),
    # The ICU side must be the escape text backslash-u007F, not a DEL character.
    ("<pc>{|}~</pc>", "<*'{'-'\\u007F'"),
    # NOTE(review): the replacement range ends at U+007F, so the XML side
    # presumably also contains DEL in some spelling; verify against en_US_POSIX.
    ("<pc>{|}~&#x007F;</pc>", "<*'{'-'\\u007F'"),
    ("<pc>{|}~\x7F</pc>", "<*'{'-'\\u007F'"),
    # CJK parenthesized resets
    ("<reset>(", "&'('"),
    (")</reset>", "')'"),
    # Convert XML elements into ICU syntax.
    ("><!--", "> #"),  # add a space before an inline comment
    ("<!--", "#"),
    (" -->", ""),
    ("-->", ""),
    ("<reset>", "&"),
    ('<reset before="primary">', "&[before 1]"),
    ('<reset before="secondary">', "&[before 2]"),
    ('<reset before="tertiary">', "&[before 3]"),
    ("</reset>", ""),
    ("<p>", "<"),
    ("</p>", ""),
    ("<s>", "<<"),
    ("</s>", ""),
    ("<t>", "<<<"),
    ("</t>", ""),
    ("<i>", "="),
    ("</i>", ""),
    ("<pc>", "<*"),
    ("</pc>", ""),
    ("<sc>", "<<*"),
    ("</sc>", ""),
    ("<tc>", "<<<*"),
    ("</tc>", ""),
    ("<ic>", "=*"),
    ("</ic>", ""),
    ("<x>", ""),
    ("</x>", ""),
    ("<extend>", "/"),
    ("</extend>", ""),
    ("</context>", "|"),
    ("<first_tertiary_ignorable/>", "[first tertiary ignorable]"),
    ("<last_tertiary_ignorable/>", "[last tertiary ignorable]"),
    ("<first_secondary_ignorable/>", "[first secondary ignorable]"),
    ("<last_secondary_ignorable/>", "[last secondary ignorable]"),
    ("<first_primary_ignorable/>", "[first primary ignorable]"),
    ("<last_primary_ignorable/>", "[last primary ignorable]"),
    ("<first_variable/>", "[first variable]"),
    ("<last_variable/>", "[last variable]"),
    ("<first_non_ignorable/>", "[first regular]"),
    ("<last_non_ignorable/>", "[last regular]"),
    ("<last_non_ignorable />", "[last regular]"),
    ("<first_trailing/>", "[first trailing]"),
    ("<last_trailing/>", "[last trailing]"),
)

# Import elements that may be the sole content of a one-line <rules> element.
_IMPORT_ELEMENTS = (
    '<import source="sr"/>',
    '<import source="hr" type="search"/>',
    '<import source="hr"/>',
    '<import source="ps"/>',
)


def _ConvertNumericCharRefs(line):
    """Replaces every hexadecimal NCR (&#xh...;) in line with \\uhhhh.

    Raises:
      ValueError: if an NCR is not terminated by a semicolon.
      AssertionError: if an NCR has more than four hex digits
        (supplementary code points are not handled).
    """
    while True:
        start = line.find("&#x")
        if start < 0:
            return line
        limit = line.find(";", start + 3)
        if limit < 0:
            # Without this check the slice below would use -1 as its end
            # and the loop could run forever on the unchanged "&#x".
            raise ValueError(
                "unterminated numeric character reference in %r" % line)
        cp = line[start + 3:limit].rjust(4, "0")
        assert len(cp) == 4, "not handling supplementary code points"
        line = line[:start] + "\\u" + cp + line[limit + 1:]


def _XmlLineToIcu(line):
    """Returns line with its XML collation syntax converted to ICU syntax."""
    if "<context>" in line:
        # Swap context & relation:
        #   <x><context>カ</context><i>ー</i></x>
        # turns into
        #   =カ|ー
        if "<i>" in line:
            line = line.replace("<i>", "").replace("<context>", "<i>")
        elif "<t>" in line:
            line = line.replace("<t>", "").replace("<context>", "<t>")

    for xml, icu in replacements:
        line = line.replace(xml, icu)

    return _ConvertNumericCharRefs(line)


def ConvertFile(src, dest):
    """Copies src to dest, rewriting each <rules> section into ICU syntax.

    Lines outside <rules>...</rules> are copied unchanged. A multi-line
    <rules> section becomes <cr><![CDATA[ ... ]]></cr> with its rule lines
    converted and joined; a one-line <rules> element holding only one of the
    known <import/> elements becomes just that element.

    Args:
      src: iterable of text lines (each normally ending in a newline).
      dest: object with a write(text) method.

    Raises:
      ValueError: if a numeric character reference has no closing semicolon.
      AssertionError: if the input departs from the expected layout
        (e.g. <rules> sharing a line with unknown content, or a comment end
        that is not at the end of its line).
    """
    in_rules = False
    partial = ""  # Rule text collected so far for the current output line.
    in_ml_comment = False
    comment_indent = ""
    for line in src:
        if "<rules>" in line:
            indent = GetIndent(line)
            stripped = line.strip()
            # Replace import-only rules with import elements.
            for element in _IMPORT_ELEMENTS:
                if stripped == "<rules>" + element + "</rules>":
                    dest.write(indent + element + "\n")
                    break
            else:
                # Replace the XML <rules> section with ICU syntax rules in <cr>.
                assert stripped == "<rules>", "unexpected <rules> line"
                dest.write(indent + "<cr><![CDATA[\n")
                in_rules = True
            continue

        if "</rules>" in line:
            # Flush, and go back to just copying lines until the next <rules>.
            if partial:
                dest.write(partial + "\n")
                partial = ""
            in_ml_comment = False
            dest.write(GetIndent(line) + "]]></cr>\n")
            in_rules = False
            continue

        if not in_rules:
            # Nothing is ever collected outside of a rules section,
            # so the line is copied as is.
            dest.write(line)
            continue

        # Find out whether we want to concatenate the current line
        # with the previous and/or next one.
        # These decisions look at the line before it is converted.
        has_comment_end = "-->" in line
        # Finish collected, partial input.
        finish_partial = (
            "<reset" in line
            or line.lstrip().startswith("<!--")
            or (bool(partial) and len(partial.strip()) > 80))
        # Start of a multi-line comment.
        start_ml_comment = "<!--" in line and not has_comment_end
        # End of a comment, must terminate the line.
        stop_comment = has_comment_end
        if has_comment_end:
            assert line.rstrip().endswith("-->"), "text after end of comment"

        # Convert XML syntax to ICU syntax.
        line = _XmlLineToIcu(line)

        # Start/continue/finish concatenation, and output.
        if partial and finish_partial:
            # Write collected input.
            dest.write(partial + "\n")
            partial = ""

        if start_ml_comment:
            # Start a multi-line comment.
            assert not partial
            comment_indent = GetIndent(line)  # can be the empty string
            in_ml_comment = True
        elif in_ml_comment:
            # Continue a multi-line comment.
            assert not partial
            if line.startswith(comment_indent):
                rest = line[len(comment_indent):]
                # Slicing (not indexing) so that a final line without a
                # newline that equals the indent cannot raise IndexError.
                if rest[:1] in (" ", "\t"):
                    # Preserve further indentation.
                    line = comment_indent + "#" + rest
                else:
                    # Add a space after the #.
                    line = comment_indent + "# " + rest
            else:
                # Indent at least as much as the first line.
                line = line.lstrip()
                if line:
                    line = comment_indent + "# " + line
                else:
                    line = comment_indent + "#\n"
        elif stop_comment:
            # Just output the line, do not start collecting input.
            # ICU-syntax comments end with the end of the line,
            # do not append rules to them.
            if partial:
                # NOTE(review): line normally still ends with its own newline
                # here, so this emits an extra blank line -- confirm intended.
                line = partial + line.lstrip() + "\n"
                partial = ""
        elif not partial:
            # Start collecting input.
            partial = line.rstrip()
        else:
            # Continue collecting input.
            partial += line.strip()

        if stop_comment:
            in_ml_comment = False
        if not partial:
            dest.write(line)

    if partial:
        # The input ended inside a rules section; do not drop collected text.
        dest.write(partial + "\n")


def main():
    """Converts every *.xml file in the source folder into the destination folder.

    Reads the two folder paths from sys.argv[1] and sys.argv[2]; further
    arguments are ignored. Exits with a usage message if either is missing.
    """
    if len(sys.argv) < 3:
        sys.exit("usage: %s <source folder> <destination folder>" % sys.argv[0])
    src_root, dest_root = sys.argv[1:3]
    for src_path in glob.iglob(os.path.join(src_root, "*.xml")):
        dest_path = os.path.join(dest_root, os.path.basename(src_path))
        with codecs.open(src_path, "r", "UTF-8") as src:
            with codecs.open(dest_path, "w", "UTF-8") as dest:
                ConvertFile(src, dest)


if __name__ == "__main__":
    main()