You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ntool/nstr/ac/ahocorasick.go

387 lines
9.9 KiB
Go

1 year ago
package ac
import (
"strings"
"sync"
"unicode"
)
type findIter struct {
fsm imp
prestate *prefilterState
haystack []byte
pos int
matchOnlyWholeWords bool
}
// Iter is an iterator over matches found on the current haystack
// it gives the user more granular control. You can choose how many and what kind of matches you need.
type Iter interface {
Next() *Match
}
// Next gives a pointer to the next match yielded by the iterator or nil, if there is none
func (f *findIter) Next() *Match {
if f.pos > len(f.haystack) {
return nil
}
result := f.fsm.FindAtNoState(f.prestate, f.haystack, f.pos)
if result == nil {
return nil
}
if result.end == f.pos {
f.pos += 1
} else {
f.pos = result.end
}
if f.matchOnlyWholeWords {
if result.Start()-1 >= 0 && (unicode.IsLetter(rune(f.haystack[result.Start()-1])) || unicode.IsDigit(rune(f.haystack[result.Start()-1]))) {
return f.Next()
}
if result.end < len(f.haystack) && (unicode.IsLetter(rune(f.haystack[result.end])) || unicode.IsDigit(rune(f.haystack[result.end]))) {
return f.Next()
}
}
return result
}
type overlappingIter struct {
fsm imp
prestate *prefilterState
haystack []byte
pos int
stateID stateID
matchIndex int
matchOnlyWholeWords bool
}
func (f *overlappingIter) Next() *Match {
if f.pos > len(f.haystack) {
return nil
}
result := f.fsm.OverlappingFindAt(f.prestate, f.haystack, f.pos, &f.stateID, &f.matchIndex)
if result == nil {
return nil
}
f.pos = result.End()
if f.matchOnlyWholeWords {
if result.Start()-1 >= 0 && (unicode.IsLetter(rune(f.haystack[result.Start()-1])) || unicode.IsDigit(rune(f.haystack[result.Start()-1]))) {
return f.Next()
}
if result.end < len(f.haystack) && (unicode.IsLetter(rune(f.haystack[result.end])) || unicode.IsDigit(rune(f.haystack[result.end]))) {
return f.Next()
}
}
return result
}
func newOverlappingIter(ac AhoCorasick, haystack []byte) overlappingIter {
prestate := prefilterState{
skips: 0,
skipped: 0,
maxMatchLen: ac.i.MaxPatternLen(),
inert: false,
lastScanAt: 0,
}
return overlappingIter{
fsm: ac.i,
prestate: &prestate,
haystack: haystack,
pos: 0,
stateID: ac.i.StartState(),
matchIndex: 0,
matchOnlyWholeWords: ac.matchOnlyWholeWords,
}
}
// make sure the AhoCorasick data structure implements the Finder interface
var _ Finder = (*AhoCorasick)(nil)
// AhoCorasick is the main data structure that does most of the work
type AhoCorasick struct {
i imp
matchKind matchKind
matchOnlyWholeWords bool
}
func (ac AhoCorasick) PatternCount() int {
return ac.i.PatternCount()
}
// Iter gives an iterator over the built patterns
func (ac AhoCorasick) Iter(haystack string) Iter {
return ac.IterByte([]byte(haystack))
}
// IterByte gives an iterator over the built patterns
func (ac AhoCorasick) IterByte(haystack []byte) Iter {
prestate := &prefilterState{
skips: 0,
skipped: 0,
maxMatchLen: ac.i.MaxPatternLen(),
inert: false,
lastScanAt: 0,
}
return &findIter{
fsm: ac.i,
prestate: prestate,
haystack: haystack,
pos: 0,
matchOnlyWholeWords: ac.matchOnlyWholeWords,
}
}
// IterOverlapping gives an iterator over the built patterns with overlapping matches
func (ac AhoCorasick) IterOverlapping(haystack string) Iter {
return ac.IterOverlappingByte([]byte(haystack))
}
// IterOverlappingByte gives an iterator over the built patterns with overlapping matches
func (ac AhoCorasick) IterOverlappingByte(haystack []byte) Iter {
if ac.matchKind != StandardMatch {
panic("only StandardMatch allowed for overlapping matches")
}
i := newOverlappingIter(ac, haystack)
return &i
}
var pool = sync.Pool{
New: func() any {
1 year ago
return strings.Builder{}
},
}
type Replacer struct {
finder Finder
}
//goland:noinspection GoUnusedExportedFunction
func NewReplacer(finder Finder) Replacer {
return Replacer{finder: finder}
}
// ReplaceAllFunc replaces the matches found in the haystack according to the user provided function
// it gives fine-grained control over what is replaced.
// A user can choose to stop the replacing process early by returning false in the lambda
// In that case, everything from that point will be kept as the original haystack
func (r Replacer) ReplaceAllFunc(haystack string, f func(match Match) (string, bool)) string {
matches := r.finder.FindAll(haystack)
if len(matches) == 0 {
return haystack
}
replaceWith := make([]string, 0)
for _, match := range matches {
rw, ok := f(match)
if !ok {
break
}
replaceWith = append(replaceWith, rw)
}
str := pool.Get().(strings.Builder)
defer func() {
str.Reset()
pool.Put(str)
}()
start := 0
for i, match := range matches {
if i >= len(replaceWith) {
str.WriteString(haystack[start:])
return str.String()
}
str.WriteString(haystack[start:match.Start()])
str.WriteString(replaceWith[i])
start = match.Start() + match.len
}
if start-1 < len(haystack) {
str.WriteString(haystack[start:])
}
return str.String()
}
// ReplaceAll replaces the matches found in the haystack according to the user provided slice `replaceWith`
// It panics, if `replaceWith` has length different from the patterns that it was built with
func (r Replacer) ReplaceAll(haystack string, replaceWith []string) string {
if len(replaceWith) != r.finder.PatternCount() {
panic("replaceWith needs to have the same length as the pattern count")
}
return r.ReplaceAllFunc(haystack, func(match Match) (string, bool) {
return replaceWith[match.pattern], true
})
}
type Finder interface {
FindAll(haystack string) []Match
PatternCount() int
}
// FindAll returns the matches found in the haystack
func (ac AhoCorasick) FindAll(haystack string) []Match {
iter := ac.Iter(haystack)
matches := make([]Match, 0)
for {
next := iter.Next()
if next == nil {
break
}
matches = append(matches, *next)
}
return matches
}
// AhoCorasickBuilder defines a set of options applied before the patterns are built
type AhoCorasickBuilder struct {
dfaBuilder *iDFABuilder
nfaBuilder *iNFABuilder
dfa bool
matchOnlyWholeWords bool
}
// Opts defines a set of options applied before the patterns are built
type Opts struct {
AsciiCaseInsensitive bool
MatchOnlyWholeWords bool
MatchKind matchKind
DFA bool
}
// NewAhoCorasickBuilder creates a new AhoCorasickBuilder based on Opts
//goland:noinspection GoUnusedExportedFunction
func NewAhoCorasickBuilder(o Opts) AhoCorasickBuilder {
return AhoCorasickBuilder{
dfaBuilder: newDFABuilder(),
nfaBuilder: newNFABuilder(o.MatchKind, o.AsciiCaseInsensitive),
dfa: o.DFA,
matchOnlyWholeWords: o.MatchOnlyWholeWords,
}
}
// Build builds a (non)deterministic finite automata from the user provided patterns
func (a *AhoCorasickBuilder) Build(patterns []string) AhoCorasick {
bytePatterns := make([][]byte, len(patterns))
for i, pat := range patterns {
bytePatterns[i] = []byte(pat)
}
return a.BuildByte(bytePatterns)
}
// BuildByte builds a (non)deterministic finite automata from the user provided patterns
func (a *AhoCorasickBuilder) BuildByte(patterns [][]byte) AhoCorasick {
nfa := a.nfaBuilder.build(patterns)
kind := nfa.matchKind
if a.dfa {
dfa := a.dfaBuilder.build(nfa)
return AhoCorasick{dfa, kind, a.matchOnlyWholeWords}
}
return AhoCorasick{nfa, kind, a.matchOnlyWholeWords}
}
type imp interface {
MatchKind() *matchKind
StartState() stateID
MaxPatternLen() int
PatternCount() int
Prefilter() prefilter
UsePrefilter() bool
OverlappingFindAt(prestate *prefilterState, haystack []byte, at int, stateId *stateID, matchIndex *int) *Match
EarliestFindAt(prestate *prefilterState, haystack []byte, at int, stateId *stateID) *Match
FindAtNoState(prestate *prefilterState, haystack []byte, at int) *Match
}
type matchKind int
const (
// StandardMatch Use standard match semantics, which support overlapping matches. When
// used with non-overlapping matches, matches are reported as they are seen.
StandardMatch matchKind = iota
// LeftMostFirstMatch Use leftmost-first match semantics, which reports leftmost matches.
// When there are multiple possible leftmost matches, the match
// corresponding to the pattern that appeared earlier when constructing
// the automaton is reported.
// This does **not** support overlapping matches or stream searching
LeftMostFirstMatch
// LeftMostLongestMatch Use leftmost-longest match semantics, which reports leftmost matches.
// When there are multiple possible leftmost matches, the longest match is chosen.
LeftMostLongestMatch
)
func (m matchKind) supportsOverlapping() bool {
return m.isStandard()
}
func (m matchKind) supportsStream() bool {
return m.isStandard()
}
func (m matchKind) isStandard() bool {
return m == StandardMatch
}
func (m matchKind) isLeftmost() bool {
return m == LeftMostFirstMatch || m == LeftMostLongestMatch
}
func (m matchKind) isLeftmostFirst() bool {
return m == LeftMostFirstMatch
}
// Match A representation of a match reported by an Aho-Corasick automaton.
//
// A match has two essential pieces of information: the identifier of the
// pattern that matched, along with the start and end offsets of the match
// in the haystack.
type Match struct {
pattern int
len int
end int
}
// Pattern returns the index of the pattern in the slice of the patterns provided by the user that
// was matched
func (m *Match) Pattern() int {
return m.pattern
}
// End gives the index of the last character of this match inside the haystack
func (m *Match) End() int {
return m.end
}
// Start gives the index of the first character of this match inside the haystack
func (m *Match) Start() int {
return m.end - m.len
}
type stateID uint
const (
failedStateID stateID = 0
deadStateID stateID = 1
)