#!/bin/sh

# vim: indentexpr= nosmartindent autoindent
# vim: tabstop=2 shiftwidth=2 softtabstop=2

# See the comments in regex/sentence.sh for the general approach to how this
# regex was written.
#
# Writing the regex for this was *hard*. It took me two days of hacking to get
# this far, and that was after I had finished the sentence regex, so my brain
# was fully cached on this. Unlike the sentence regex, the rules in the regex
# below don't correspond as nicely to the rules in UAX #29. In particular, the
# UAX #29 rules have a ton of overlap with each other, which requires crazy
# stuff in the regex. I'm not even sure the regex below is 100% correct or even
# minimal. However, I did compare this with the ICU word segmenter on a few
# different corpora, and it produces identical results. (In addition to, of
# course, passing the UCD tests.)
#
# In general, I consider this approach to be a failure. Firstly, this is
# clearly a write-only regex. Secondly, building the minimized DFA for this is
# incredibly slow. Thirdly, the DFA is itself very large (~240KB). Fourthly,
# reversing this regex (for reverse word iteration) results in a >19MB DFA.
# Yes. That's MB. Wat. And it took 5 minutes to build.
#
# I think we might consider changing our approach to this problem. The normal
# path I've seen, I think, is to decode codepoints one at a time, and then
# thread them through a state machine in the code itself. We could take this
# approach, or possibly combine it with a DFA that tells us which Word_Break
# value a codepoint has. I'd prefer the latter approach, but it requires adding
# RegexSet support to regex-automata. Something that should definitely be done,
# but is a fair amount of work.
#
# Gah.
34 35CR="\p{wb=CR}" 36LF="\p{wb=LF}" 37Newline="\p{wb=Newline}" 38ZWJ="\p{wb=ZWJ}" 39RI="\p{wb=Regional_Indicator}" 40Katakana="\p{wb=Katakana}" 41HebrewLet="\p{wb=HebrewLetter}" 42ALetter="\p{wb=ALetter}" 43SingleQuote="\p{wb=SingleQuote}" 44DoubleQuote="\p{wb=DoubleQuote}" 45MidNumLet="\p{wb=MidNumLet}" 46MidLetter="\p{wb=MidLetter}" 47MidNum="\p{wb=MidNum}" 48Numeric="\p{wb=Numeric}" 49ExtendNumLet="\p{wb=ExtendNumLet}" 50WSegSpace="\p{wb=WSegSpace}" 51 52Any="\p{any}" 53Ex="[\p{wb=Extend} \p{wb=Format} $ZWJ]" 54ExtendPict="\p{Extended_Pictographic}" 55AHLetter="[$ALetter $HebrewLet]" 56MidNumLetQ="[$MidNumLet $SingleQuote]" 57 58AHLetterRepeat="$AHLetter $Ex* ([$MidLetter $MidNumLetQ] $Ex* $AHLetter $Ex*)*" 59NumericRepeat="$Numeric $Ex* ([$MidNum $MidNumLetQ] $Ex* $Numeric $Ex*)*" 60 61echo "(?x) 62$CR $LF 63| 64[$Newline $CR $LF] 65| 66$WSegSpace $WSegSpace+ 67| 68( 69 ([^$Newline $CR $LF]? $Ex* $ZWJ $ExtendPict $Ex*)+ 70 | 71 ($ExtendNumLet $Ex*)* $AHLetter $Ex* 72 ( 73 ( 74 ($NumericRepeat | $ExtendNumLet $Ex*)* 75 | 76 [$MidLetter $MidNumLetQ] $Ex* 77 ) 78 $AHLetter $Ex* 79 )+ 80 ($NumericRepeat | $ExtendNumLet $Ex*)* 81 | 82 ($ExtendNumLet $Ex*)* $AHLetter $Ex* ($NumericRepeat | $ExtendNumLet $Ex*)+ 83 | 84 ($ExtendNumLet $Ex*)* $Numeric $Ex* 85 ( 86 ( 87 ($AHLetterRepeat | $ExtendNumLet $Ex*)* 88 | 89 [$MidNum $MidNumLetQ] $Ex* 90 ) 91 $Numeric $Ex* 92 )+ 93 ($AHLetterRepeat | $ExtendNumLet $Ex*)* 94 | 95 ($ExtendNumLet $Ex*)* $Numeric $Ex* ($AHLetterRepeat | $ExtendNumLet $Ex*)+ 96 | 97 $Katakana $Ex* 98 (($Katakana | $ExtendNumLet) $Ex*)+ 99 | 100 $ExtendNumLet $Ex* 101 (($ExtendNumLet | $AHLetter | $Numeric | $Katakana) $Ex*)+ 102)+ 103| 104$HebrewLet $Ex* $SingleQuote $Ex* 105| 106($HebrewLet $Ex* $DoubleQuote $Ex*)+ $HebrewLet $Ex* 107| 108$RI $Ex* $RI $Ex* 109| 110$Any $Ex* 111" 112