searchset/tokenizer/tokenizer.go

// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*46c4c49dSIbrahim Kanouche
// Package tokenizer converts a text into a stream of tokens.
*46c4c49dSIbrahim Kanouchepackage tokenizer
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanoucheimport (
*46c4c49dSIbrahim Kanouche	"bytes"
*46c4c49dSIbrahim Kanouche	"fmt"
*46c4c49dSIbrahim Kanouche	"hash/crc32"
*46c4c49dSIbrahim Kanouche	"sort"
*46c4c49dSIbrahim Kanouche	"unicode"
*46c4c49dSIbrahim Kanouche	"unicode/utf8"
*46c4c49dSIbrahim Kanouche)
*46c4c49dSIbrahim Kanouche
// token is a non-whitespace sequence (i.e., a word or a single punctuation
// rune) in the original string. This is not meant for use outside of this
// package.
type token struct {
	// Text is the content of the token. Input bytes that are not valid
	// UTF-8 show up here as U+FFFD.
	Text string
	// Offset is the byte index in the original string where the token
	// starts. A negative value marks a token that has not been started.
	Offset int
}

// Tokens is a list of token objects, in the order they occur in the text.
type Tokens []*token

// newToken creates a new token object with an invalid (negative) offset, which
// must be set before the token is used.
func newToken() *token {
	return &token{Offset: -1}
}

// Tokenize converts a string into a stream of tokens.
//
// Whitespace (unicode.IsSpace) separates tokens and is dropped. Every
// punctuation rune (unicode.IsPunct) becomes a token of its own. Any other
// run of consecutive runes forms a single token. Offsets are byte indices into
// s. Bytes that are not valid UTF-8 are decoded as U+FFFD and become part of
// the surrounding word. The result is nil when s contains no tokens.
func Tokenize(s string) (toks Tokens) {
	var (
		// word accumulates the runes of the token being built. Using a
		// buffer instead of repeated string concatenation keeps long
		// tokens linear in cost.
		word bytes.Buffer
		tok  = newToken()
	)

	// flush emits the pending word, if one has been started, and prepares
	// for the next one.
	flush := func() {
		if tok.Offset < 0 {
			return
		}
		tok.Text = word.String()
		toks = append(toks, tok)
		word.Reset()
		tok = newToken()
	}

	for i := 0; i < len(s); {
		r, size := utf8.DecodeRuneInString(s[i:])
		switch {
		case unicode.IsSpace(r):
			flush()
		case unicode.IsPunct(r):
			flush()
			toks = append(toks, &token{
				Text:   string(r),
				Offset: i,
			})
		default:
			if tok.Offset < 0 {
				tok.Offset = i
			}
			word.WriteRune(r)
		}
		i += size
	}

	// Add any remaining token that wasn't yet included in the list.
	flush()
	return toks
}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// GenerateHashes generates hashes for "size" length substrings. The
*46c4c49dSIbrahim Kanouche// "stringifyTokens" call takes a long time to run, so not all substrings have
*46c4c49dSIbrahim Kanouche// hashes, i.e. we skip some of the smaller substrings.
*46c4c49dSIbrahim Kanouchefunc (t Tokens) GenerateHashes(h Hash, size int) ([]uint32, TokenRanges) {
*46c4c49dSIbrahim Kanouche	if size == 0 {
*46c4c49dSIbrahim Kanouche		return nil, nil
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche	var css []uint32
*46c4c49dSIbrahim Kanouche	var tr TokenRanges
*46c4c49dSIbrahim Kanouche	for offset := 0; offset+size <= len(t); offset += size / 2 {
*46c4c49dSIbrahim Kanouche		var b bytes.Buffer
*46c4c49dSIbrahim Kanouche		t.stringifyTokens(&b, offset, size)
*46c4c49dSIbrahim Kanouche		cs := crc32.ChecksumIEEE(b.Bytes())
*46c4c49dSIbrahim Kanouche		css = append(css, cs)
*46c4c49dSIbrahim Kanouche		tr = append(tr, &TokenRange{offset, offset + size})
*46c4c49dSIbrahim Kanouche		h.add(cs, offset, offset+size)
*46c4c49dSIbrahim Kanouche		if size <= 1 {
*46c4c49dSIbrahim Kanouche			break
*46c4c49dSIbrahim Kanouche		}
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche	return css, tr
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// stringifyTokens serializes a sublist of tokens into a bytes buffer.
*46c4c49dSIbrahim Kanouchefunc (t Tokens) stringifyTokens(b *bytes.Buffer, offset, size int) {
*46c4c49dSIbrahim Kanouche	for j := offset; j < offset+size; j++ {
*46c4c49dSIbrahim Kanouche		if j != offset {
*46c4c49dSIbrahim Kanouche			b.WriteRune(' ')
*46c4c49dSIbrahim Kanouche		}
*46c4c49dSIbrahim Kanouche		b.WriteString(t[j].Text)
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
// TokenRange indicates the range of tokens that map to a particular checksum.
// Start is the index of the first token and End is one past the last, as
// reflected by the half-open notation used by String.
type TokenRange struct {
	Start int
	End   int
}

// String renders the range in half-open interval notation, e.g. "[3, 7)".
func (t *TokenRange) String() string {
	return fmt.Sprintf("[%v, %v)", t.Start, t.End)
}

// TokenRanges is a list of TokenRange objects. The chance that two different
// strings map to the same checksum is very small, but unfortunately isn't
// zero, so we use this instead of making the assumption that they will all be
// unique.
//
// TokenRanges implements sort.Interface, ordering by Start only.
type TokenRanges []*TokenRange

func (t TokenRanges) Len() int           { return len(t) }
func (t TokenRanges) Swap(i, j int)      { t[i], t[j] = t[j], t[i] }
func (t TokenRanges) Less(i, j int) bool { return t[i].Start < t[j].Start }

// CombineUnique returns the combination of both token ranges with no
// duplicates, ordered by Start and then by End.
//
// If either list is empty the other one is returned as is, without sorting or
// de-duplicating it. Otherwise the result is a newly allocated slice: neither
// t nor other is reordered or overwritten, although the result shares the
// *TokenRange elements with its inputs.
func (t TokenRanges) CombineUnique(other TokenRanges) TokenRanges {
	if len(other) == 0 {
		return t
	}
	if len(t) == 0 {
		return other
	}

	// Build the union in a fresh slice. Appending directly to t would reuse
	// t's backing array whenever it has spare capacity, and the sort below
	// would then shuffle the caller's data.
	cu := make(TokenRanges, 0, len(t)+len(other))
	cu = append(cu, t...)
	cu = append(cu, other...)

	// Order by (Start, End) so that identical ranges are always adjacent.
	// Ordering by Start alone can leave duplicates separated by a range with
	// the same Start but a different End.
	sort.Slice(cu, func(i, j int) bool {
		if cu[i].Start != cu[j].Start {
			return cu[i].Start < cu[j].Start
		}
		return cu[i].End < cu[j].End
	})

	// Compact in place: cu is private to this call, and the write position
	// never gets ahead of the read position.
	res := cu[:1]
	for _, r := range cu[1:] {
		if last := res[len(res)-1]; last.Start != r.Start || last.End != r.End {
			res = append(res, r)
		}
	}
	return res
}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// Hash is a map of the hashes of a section of text to the token range covering that text.
*46c4c49dSIbrahim Kanouchetype Hash map[uint32]TokenRanges
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// add associates a token range, [start, end], to a checksum.
*46c4c49dSIbrahim Kanouchefunc (h Hash) add(checksum uint32, start, end int) {
*46c4c49dSIbrahim Kanouche	ntr := &TokenRange{Start: start, End: end}
*46c4c49dSIbrahim Kanouche	if r, ok := h[checksum]; ok {
*46c4c49dSIbrahim Kanouche		for _, tr := range r {
*46c4c49dSIbrahim Kanouche			if tr.Start == ntr.Start && tr.End == ntr.End {
*46c4c49dSIbrahim Kanouche				// The token range already exists at this
*46c4c49dSIbrahim Kanouche				// checksum. No need to re-add it.
*46c4c49dSIbrahim Kanouche				return
*46c4c49dSIbrahim Kanouche			}
*46c4c49dSIbrahim Kanouche		}
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche	h[checksum] = append(h[checksum], ntr)
*46c4c49dSIbrahim Kanouche}