1// run
2
3// Copyright 2024 The Go Authors. All rights reserved.
4// Use of this source code is governed by a BSD-style
5// license that can be found in the LICENSE file.
6
7package main
8
9import (
10	"bufio"
11	"fmt"
12	"io"
13	"iter"
14	"math/rand"
15	"os"
16	"strings"
17	"unicode"
18)
19
20// WordReader is the struct that implements io.Reader
21type WordReader struct {
22	scanner *bufio.Scanner
23}
24
25// NewWordReader creates a new WordReader from an io.Reader
26func NewWordReader(r io.Reader) *WordReader {
27	scanner := bufio.NewScanner(r)
28	scanner.Split(bufio.ScanWords)
29	return &WordReader{
30		scanner: scanner,
31	}
32}
33
34// Read reads data from the input stream and returns a single lowercase word at a time
35func (wr *WordReader) Read(p []byte) (n int, err error) {
36	if !wr.scanner.Scan() {
37		if err := wr.scanner.Err(); err != nil {
38			return 0, err
39		}
40		return 0, io.EOF
41	}
42	word := wr.scanner.Text()
43	cleanedWord := removeNonAlphabetic(word)
44	if len(cleanedWord) == 0 {
45		return wr.Read(p)
46	}
47	n = copy(p, []byte(cleanedWord))
48	return n, nil
49}
50
51// All returns an iterator allowing the caller to iterate over the WordReader using for/range.
52func (wr *WordReader) All() iter.Seq[string] {
53	word := make([]byte, 1024)
54	return func(yield func(string) bool) {
55		var err error
56		var n int
57		for n, err = wr.Read(word); err == nil; n, err = wr.Read(word) {
58			if !yield(string(word[:n])) {
59				return
60			}
61		}
62		if err != io.EOF {
63			fmt.Fprintf(os.Stderr, "error reading word: %v\n", err)
64		}
65	}
66}
67
68// removeNonAlphabetic removes non-alphabetic characters from a word using strings.Map
69func removeNonAlphabetic(word string) string {
70	return strings.Map(func(r rune) rune {
71		if unicode.IsLetter(r) {
72			return unicode.ToLower(r)
73		}
74		return -1
75	}, word)
76}
77
78// ProbabilisticSkipper determines if an item should be retained with probability 1/(1<<n)
79type ProbabilisticSkipper struct {
80	n       int
81	counter uint64
82	bitmask uint64
83}
84
85// NewProbabilisticSkipper initializes the ProbabilisticSkipper
86func NewProbabilisticSkipper(n int) *ProbabilisticSkipper {
87	pr := &ProbabilisticSkipper{n: n}
88	pr.refreshCounter()
89	return pr
90}
91
92// check panics if pr.n is not the expected value
93func (pr *ProbabilisticSkipper) check(n int) {
94	if pr.n != n {
95		panic(fmt.Sprintf("check: pr.n != n  %d != %d", pr.n, n))
96	}
97}
98
99// refreshCounter refreshes the counter with a new random value
100func (pr *ProbabilisticSkipper) refreshCounter() {
101	if pr.n == 0 {
102		pr.bitmask = ^uint64(0) // All bits set to 1
103	} else {
104		pr.bitmask = rand.Uint64()
105		for i := 0; i < pr.n-1; i++ {
106			pr.bitmask &= rand.Uint64()
107		}
108	}
109	pr.counter = 64
110}
111
112// ShouldSkip returns true with probability 1/(1<<n)
113func (pr *ProbabilisticSkipper) ShouldSkip() bool {
114	remove := pr.bitmask&1 == 0
115	pr.bitmask >>= 1
116	pr.counter--
117	if pr.counter == 0 {
118		pr.refreshCounter()
119	}
120	return remove
121}
122
123// EstimateUniqueWordsIter estimates the number of unique words using a probabilistic counting method
124func EstimateUniqueWordsIter(reader io.Reader, memorySize int) int {
125	wordReader := NewWordReader(reader)
126	words := make(map[string]struct{}, memorySize)
127
128	rounds := 0
129	roundRemover := NewProbabilisticSkipper(1)
130	wordSkipper := NewProbabilisticSkipper(rounds)
131	wordSkipper.check(rounds)
132
133	for word := range wordReader.All() {
134		wordSkipper.check(rounds)
135		if wordSkipper.ShouldSkip() {
136			delete(words, word)
137		} else {
138			words[word] = struct{}{}
139
140			if len(words) >= memorySize {
141				rounds++
142
143				wordSkipper = NewProbabilisticSkipper(rounds)
144				for w := range words {
145					if roundRemover.ShouldSkip() {
146						delete(words, w)
147					}
148				}
149			}
150		}
151		wordSkipper.check(rounds)
152	}
153
154	if len(words) == 0 {
155		return 0
156	}
157
158	invProbability := 1 << rounds
159	estimatedUniqueWords := len(words) * invProbability
160	return estimatedUniqueWords
161}
162
163func main() {
164	input := "Hello, world! This is a test. Hello, world, hello!"
165	expectedUniqueWords := 6 // "hello", "world", "this", "is", "a", "test" (but "hello" and "world" are repeated)
166	memorySize := 6
167
168	reader := strings.NewReader(input)
169	estimatedUniqueWords := EstimateUniqueWordsIter(reader, memorySize)
170	if estimatedUniqueWords != expectedUniqueWords {
171		// ...
172	}
173}
174