1// run 2 3// Copyright 2024 The Go Authors. All rights reserved. 4// Use of this source code is governed by a BSD-style 5// license that can be found in the LICENSE file. 6 7package main 8 9import ( 10 "bufio" 11 "fmt" 12 "io" 13 "iter" 14 "math/rand" 15 "os" 16 "strings" 17 "unicode" 18) 19 20// WordReader is the struct that implements io.Reader 21type WordReader struct { 22 scanner *bufio.Scanner 23} 24 25// NewWordReader creates a new WordReader from an io.Reader 26func NewWordReader(r io.Reader) *WordReader { 27 scanner := bufio.NewScanner(r) 28 scanner.Split(bufio.ScanWords) 29 return &WordReader{ 30 scanner: scanner, 31 } 32} 33 34// Read reads data from the input stream and returns a single lowercase word at a time 35func (wr *WordReader) Read(p []byte) (n int, err error) { 36 if !wr.scanner.Scan() { 37 if err := wr.scanner.Err(); err != nil { 38 return 0, err 39 } 40 return 0, io.EOF 41 } 42 word := wr.scanner.Text() 43 cleanedWord := removeNonAlphabetic(word) 44 if len(cleanedWord) == 0 { 45 return wr.Read(p) 46 } 47 n = copy(p, []byte(cleanedWord)) 48 return n, nil 49} 50 51// All returns an iterator allowing the caller to iterate over the WordReader using for/range. 52func (wr *WordReader) All() iter.Seq[string] { 53 word := make([]byte, 1024) 54 return func(yield func(string) bool) { 55 var err error 56 var n int 57 for n, err = wr.Read(word); err == nil; n, err = wr.Read(word) { 58 if !yield(string(word[:n])) { 59 return 60 } 61 } 62 if err != io.EOF { 63 fmt.Fprintf(os.Stderr, "error reading word: %v\n", err) 64 } 65 } 66} 67 68// removeNonAlphabetic removes non-alphabetic characters from a word using strings.Map 69func removeNonAlphabetic(word string) string { 70 return strings.Map(func(r rune) rune { 71 if unicode.IsLetter(r) { 72 return unicode.ToLower(r) 73 } 74 return -1 75 }, word) 76} 77 78// ProbabilisticSkipper determines if an item should be retained with probability 1/(1<<n) 79type ProbabilisticSkipper struct { 80 n int 81 counter uint64 82 bitmask uint64 83} 84 85// NewProbabilisticSkipper initializes the ProbabilisticSkipper 86func NewProbabilisticSkipper(n int) *ProbabilisticSkipper { 87 pr := &ProbabilisticSkipper{n: n} 88 pr.refreshCounter() 89 return pr 90} 91 92// check panics if pr.n is not the expected value 93func (pr *ProbabilisticSkipper) check(n int) { 94 if pr.n != n { 95 panic(fmt.Sprintf("check: pr.n != n %d != %d", pr.n, n)) 96 } 97} 98 99// refreshCounter refreshes the counter with a new random value 100func (pr *ProbabilisticSkipper) refreshCounter() { 101 if pr.n == 0 { 102 pr.bitmask = ^uint64(0) // All bits set to 1 103 } else { 104 pr.bitmask = rand.Uint64() 105 for i := 0; i < pr.n-1; i++ { 106 pr.bitmask &= rand.Uint64() 107 } 108 } 109 pr.counter = 64 110} 111 112// ShouldSkip returns true with probability 1/(1<<n) 113func (pr *ProbabilisticSkipper) ShouldSkip() bool { 114 remove := pr.bitmask&1 == 0 115 pr.bitmask >>= 1 116 pr.counter-- 117 if pr.counter == 0 { 118 pr.refreshCounter() 119 } 120 return remove 121} 122 123// EstimateUniqueWordsIter estimates the number of unique words using a probabilistic counting method 124func EstimateUniqueWordsIter(reader io.Reader, memorySize int) int { 125 wordReader := NewWordReader(reader) 126 words := make(map[string]struct{}, memorySize) 127 128 rounds := 0 129 roundRemover := NewProbabilisticSkipper(1) 130 wordSkipper := NewProbabilisticSkipper(rounds) 131 wordSkipper.check(rounds) 132 133 for word := range wordReader.All() { 134 wordSkipper.check(rounds) 135 if wordSkipper.ShouldSkip() { 136 delete(words, word) 137 } else { 138 words[word] = struct{}{} 139 140 if len(words) >= memorySize { 141 rounds++ 142 143 wordSkipper = NewProbabilisticSkipper(rounds) 144 for w := range words { 145 if roundRemover.ShouldSkip() { 146 delete(words, w) 147 } 148 } 149 } 150 } 151 wordSkipper.check(rounds) 152 } 153 154 if len(words) == 0 { 155 return 0 156 } 157 158 invProbability := 1 << rounds 159 estimatedUniqueWords := len(words) * invProbability 160 return estimatedUniqueWords 161} 162 163func main() { 164 input := "Hello, world! This is a test. Hello, world, hello!" 165 expectedUniqueWords := 6 // "hello", "world", "this", "is", "a", "test" (but "hello" and "world" are repeated) 166 memorySize := 6 167 168 reader := strings.NewReader(input) 169 estimatedUniqueWords := EstimateUniqueWordsIter(reader, memorySize) 170 if estimatedUniqueWords != expectedUniqueWords { 171 // ... 172 } 173} 174