1#!/bin/sh
2
3# vim: indentexpr= nosmartindent autoindent
4# vim: tabstop=2 shiftwidth=2 softtabstop=2
5
6# See the comments in regex/sentence.sh for the general approach to how this
7# regex was written.
8#
9# Writing the regex for this was *hard*. It took me two days of hacking to get
10# this far, and that was after I had finished the sentence regex, so my brain
11# was fully cached on this. Unlike the sentence regex, the rules in the regex
12# below don't correspond as nicely to the rules in UAX #29. In particular, the
13# UAX #29 rules have a ton of overlap with each other, which requires crazy
14# stuff in the regex. I'm not even sure the regex below is 100% correct or even
15# minimal, however, I did compare this with the ICU word segmenter on a few
# different corpora, and it produces identical results. (In addition to, of
# course, passing the UCD tests.)
18#
19# In general, I consider this approach to be a failure. Firstly, this is
20# clearly a write-only regex. Secondly, building the minimized DFA for this is
21# incredibly slow. Thirdly, the DFA is itself very large (~240KB). Fourthly,
22# reversing this regex (for reverse word iteration) results in a >19MB DFA.
23# Yes. That's MB. Wat. And it took 5 minutes to build.
24#
25# I think we might consider changing our approach to this problem. The normal
26# path I've seen, I think, is to decode codepoints one at a time, and then
27# thread them through a state machine in the code itself. We could take this
28# approach, or possibly combine it with a DFA that tells us which Word_Break
29# value a codepoint has. I'd prefer the latter approach, but it requires adding
30# RegexSet support to regex-automata. Something that should definitely be done,
31# but is a fair amount of work.
32#
33# Gah.
34
# Abort on any failing command and on expansion of an unset variable. Without
# -u, a misspelled class name below would silently expand to nothing and
# produce a syntactically valid but wrong regex.
set -eu

# One character class per Word_Break (wb) property value used by the pattern.
# Single quotes keep the backslashes literal; the values are plain text that is
# spliced into the regex printed at the bottom of this script.
CR='\p{wb=CR}'
LF='\p{wb=LF}'
Newline='\p{wb=Newline}'
ZWJ='\p{wb=ZWJ}'
RI='\p{wb=Regional_Indicator}'
Katakana='\p{wb=Katakana}'
HebrewLet='\p{wb=HebrewLetter}'
ALetter='\p{wb=ALetter}'
SingleQuote='\p{wb=SingleQuote}'
DoubleQuote='\p{wb=DoubleQuote}'
MidNumLet='\p{wb=MidNumLet}'
MidLetter='\p{wb=MidLetter}'
MidNum='\p{wb=MidNum}'
Numeric='\p{wb=Numeric}'
ExtendNumLet='\p{wb=ExtendNumLet}'
WSegSpace='\p{wb=WSegSpace}'

# Derived classes built from the ones above.
#   Any        - every codepoint.
#   Ex         - Extend, Format or ZWJ; appears as "$Ex*" after nearly every
#                element of the pattern.
#   ExtendPict - Extended_Pictographic codepoints.
#   AHLetter   - ALetter or HebrewLetter.
#   MidNumLetQ - MidNumLet or SingleQuote.
Any='\p{any}'
Ex="[\p{wb=Extend} \p{wb=Format} $ZWJ]"
ExtendPict='\p{Extended_Pictographic}'
AHLetter="[$ALetter $HebrewLet]"
MidNumLetQ="[$MidNumLet $SingleQuote]"

# Reusable sub-patterns: a letter (resp. digit), then zero or more repetitions
# of "one mid-word separator followed by another letter (resp. digit)".
AHLetterRepeat="$AHLetter $Ex* ([$MidLetter $MidNumLetQ] $Ex* $AHLetter $Ex*)*"
NumericRepeat="$Numeric $Ex* ([$MidNum $MidNumLetQ] $Ex* $Numeric $Ex*)*"

# The complete pattern, in verbose mode ((?x)), so the spaces and newlines
# inside it are layout only. Top-level alternatives, in order:
#   1. CR immediately followed by LF.
#   2. A single Newline, CR or LF.
#   3. A run of two or more WSegSpace.
#   4. One or more of: ZWJ + Extended_Pictographic sequences, letter-led runs,
#      digit-led runs, Katakana runs, and ExtendNumLet-led runs.
#   5. A HebrewLetter followed by a SingleQuote.
#   6. HebrewLetters joined by DoubleQuotes.
#   7. A pair of Regional_Indicators.
#   8. Any single codepoint.
# Apart from the first three, every alternative also absorbs trailing $Ex*.
# The closing quote sits on its own line on purpose: the value ends in a
# newline, so the printed output ends with one blank line.
word_regex="(?x)
$CR $LF
|
[$Newline $CR $LF]
|
$WSegSpace $WSegSpace+
|
(
  ([^$Newline $CR $LF]? $Ex* $ZWJ $ExtendPict $Ex*)+
  |
  ($ExtendNumLet $Ex*)* $AHLetter $Ex*
    (
      (
        ($NumericRepeat | $ExtendNumLet $Ex*)*
        |
        [$MidLetter $MidNumLetQ] $Ex*
      )
      $AHLetter $Ex*
    )+
    ($NumericRepeat | $ExtendNumLet $Ex*)*
  |
  ($ExtendNumLet $Ex*)* $AHLetter $Ex* ($NumericRepeat | $ExtendNumLet $Ex*)+
  |
  ($ExtendNumLet $Ex*)* $Numeric $Ex*
    (
      (
        ($AHLetterRepeat | $ExtendNumLet $Ex*)*
        |
        [$MidNum $MidNumLetQ] $Ex*
      )
      $Numeric $Ex*
    )+
    ($AHLetterRepeat | $ExtendNumLet $Ex*)*
  |
  ($ExtendNumLet $Ex*)* $Numeric $Ex* ($AHLetterRepeat | $ExtendNumLet $Ex*)+
  |
  $Katakana $Ex*
    (($Katakana | $ExtendNumLet) $Ex*)+
  |
  $ExtendNumLet $Ex*
    (($ExtendNumLet | $AHLetter | $Numeric | $Katakana) $Ex*)+
)+
|
$HebrewLet $Ex* $SingleQuote $Ex*
|
($HebrewLet $Ex* $DoubleQuote $Ex*)+ $HebrewLet $Ex*
|
$RI $Ex* $RI $Ex*
|
$Any $Ex*
"

# printf rather than echo: the pattern is full of backslashes, and how echo
# treats backslashes differs between /bin/sh implementations. '%s\n' writes the
# text untouched plus the one newline echo would have appended.
printf '%s\n' "$word_regex"
112