scripts/regex/word.sh

#!/bin/sh

# vim: indentexpr= nosmartindent autoindent
# vim: tabstop=2 shiftwidth=2 softtabstop=2

# See the comments in regex/sentence.sh for the general approach to how this
# regex was written.
#
# Writing the regex for this was *hard*. It took me two days of hacking to get
# this far, and that was after I had finished the sentence regex, so my brain
# was fully cached on this. Unlike the sentence regex, the rules in the regex
# below don't correspond as nicely to the rules in UAX #29. In particular, the
# UAX #29 rules have a ton of overlap with each other, which requires crazy
# stuff in the regex. I'm not even sure the regex below is 100% correct or even
# minimal; however, I did compare this with the ICU word segmenter on a few
# different corpora, and it produces identical results (in addition to, of
# course, passing the UCD tests).
#
# In general, I consider this approach to be a failure. Firstly, this is
# clearly a write-only regex. Secondly, building the minimized DFA for this is
# incredibly slow. Thirdly, the DFA is itself very large (~240KB). Fourthly,
# reversing this regex (for reverse word iteration) results in a >19MB DFA.
# Yes. That's MB. Wat. And it took 5 minutes to build.
#
# I think we might consider changing our approach to this problem. The normal
# path I've seen, I think, is to decode codepoints one at a time, and then
# thread them through a state machine in the code itself. We could take this
# approach, or possibly combine it with a DFA that tells us which Word_Break
# value a codepoint has. I'd prefer the latter approach, but it requires adding
# RegexSet support to regex-automata. Something that should definitely be done,
# but is a fair amount of work.
#
# Gah.

# Primitive character classes: one \p{wb=...} property test per name
# (wb is presumably the Word_Break property discussed in the header comment).
# Single quotes keep the backslashes literal without relying on how double
# quotes treat an unrecognised escape.
CR='\p{wb=CR}'
LF='\p{wb=LF}'
Newline='\p{wb=Newline}'
ZWJ='\p{wb=ZWJ}'
RI='\p{wb=Regional_Indicator}'
Katakana='\p{wb=Katakana}'
HebrewLet='\p{wb=HebrewLetter}'
ALetter='\p{wb=ALetter}'
SingleQuote='\p{wb=SingleQuote}'
DoubleQuote='\p{wb=DoubleQuote}'
MidNumLet='\p{wb=MidNumLet}'
MidLetter='\p{wb=MidLetter}'
MidNum='\p{wb=MidNum}'
Numeric='\p{wb=Numeric}'
ExtendNumLet='\p{wb=ExtendNumLet}'
WSegSpace='\p{wb=WSegSpace}'

# Composite classes built from the primitives above.
#   Any        - any codepoint
#   Ex         - Extend, Format or ZWJ
#   AHLetter   - ALetter or HebrewLetter
#   MidNumLetQ - MidNumLet or SingleQuote
Any='\p{any}'
Ex='[\p{wb=Extend} \p{wb=Format} '"${ZWJ}"']'
ExtendPict='\p{Extended_Pictographic}'
AHLetter="[${ALetter} ${HebrewLet}]"
MidNumLetQ="[${MidNumLet} ${SingleQuote}]"

# Reusable sub-patterns: a letter (resp. digit), then zero or more groups of
# one "mid" character followed by another letter (resp. digit). Every element
# may be trailed by any number of Ex characters.
AHLetterRepeat="${AHLetter} ${Ex}* ([${MidLetter} ${MidNumLetQ}] ${Ex}* ${AHLetter} ${Ex}*)*"
NumericRepeat="${Numeric} ${Ex}* ([${MidNum} ${MidNumLetQ}] ${Ex}* ${Numeric} ${Ex}*)*"

# Write the word regex to stdout, with every $Name placeholder expanded to the
# character class assigned to it earlier in this script. The pattern opens
# with (?x), so the spaces and line breaks inside it are layout only (assuming
# the consuming regex engine honours that flag).
#
# printf '%s\n' is used rather than echo: the pattern is full of backslashes
# (\p{...}), and POSIX leaves echo's behaviour implementation-defined whenever
# an operand contains a backslash. With printf the text is emitted verbatim
# under any sh. The output is the pattern followed by one extra newline.
#
# Top-level alternatives, in the order they are tried:
#   1. CR immediately followed by LF
#   2. a single Newline, CR or LF
#   3. two or more WSegSpace
#   4. one or more repetitions of a group whose branches cover ZWJ +
#      Extended_Pictographic sequences, letter runs, digit runs, mixed
#      letter/digit runs, Katakana runs and ExtendNumLet-led runs
#   5. a Hebrew letter followed by a single quote
#   6. Hebrew letters joined by double quotes
#   7. a pair of regional indicators
#   8. any single codepoint followed by Ex*
printf '%s\n' "(?x)
$CR $LF
|
[$Newline $CR $LF]
|
$WSegSpace $WSegSpace+
|
(
  ([^$Newline $CR $LF]? $Ex* $ZWJ $ExtendPict $Ex*)+
  |
  ($ExtendNumLet $Ex*)* $AHLetter $Ex*
    (
      (
        ($NumericRepeat | $ExtendNumLet $Ex*)*
        |
        [$MidLetter $MidNumLetQ] $Ex*
      )
      $AHLetter $Ex*
    )+
    ($NumericRepeat | $ExtendNumLet $Ex*)*
  |
  ($ExtendNumLet $Ex*)* $AHLetter $Ex* ($NumericRepeat | $ExtendNumLet $Ex*)+
  |
  ($ExtendNumLet $Ex*)* $Numeric $Ex*
    (
      (
        ($AHLetterRepeat | $ExtendNumLet $Ex*)*
        |
        [$MidNum $MidNumLetQ] $Ex*
      )
      $Numeric $Ex*
    )+
    ($AHLetterRepeat | $ExtendNumLet $Ex*)*
  |
  ($ExtendNumLet $Ex*)* $Numeric $Ex* ($AHLetterRepeat | $ExtendNumLet $Ex*)+
  |
  $Katakana $Ex*
    (($Katakana | $ExtendNumLet) $Ex*)+
  |
  $ExtendNumLet $Ex*
    (($ExtendNumLet | $AHLetter | $Numeric | $Katakana) $Ex*)+
)+
|
$HebrewLet $Ex* $SingleQuote $Ex*
|
($HebrewLet $Ex* $DoubleQuote $Ex*)+ $HebrewLet $Ex*
|
$RI $Ex* $RI $Ex*
|
$Any $Ex*
"