package ac

import (
	"strings"
	"sync"
	"unicode"
)

// findIter yields non-overlapping matches of an automaton over a haystack.
type findIter struct {
	fsm                 imp
	prestate            *prefilterState
	haystack            []byte
	pos                 int // offset at which the next search starts
	matchOnlyWholeWords bool
}

// Iter is an iterator over the matches found in a haystack.
// It gives the user more granular control: the caller decides how many
// matches to consume and can stop at any time.
type Iter interface {
	Next() *Match
}

// isWordByte reports whether b, interpreted as a single code point, is a
// letter or a digit.
//
// NOTE(review): the haystack is inspected byte by byte, so the bytes of a
// multi-byte UTF-8 sequence are classified individually rather than as the
// rune they encode — confirm whether that is intended.
func isWordByte(b byte) bool {
	r := rune(b)
	return unicode.IsLetter(r) || unicode.IsDigit(r)
}

// matchIsWholeWord reports whether m is delimited on both sides by either a
// haystack boundary or a byte that is neither a letter nor a digit.
func matchIsWholeWord(haystack []byte, m *Match) bool {
	if start := m.Start(); start > 0 && isWordByte(haystack[start-1]) {
		return false
	}
	if m.end < len(haystack) && isWordByte(haystack[m.end]) {
		return false
	}
	return true
}

// Next returns a pointer to the next match yielded by the iterator, or nil
// if there is none.
func (f *findIter) Next() *Match {
	// Iterate instead of recursing so that a long run of rejected
	// (non-whole-word) matches cannot grow the call stack.
	for f.pos <= len(f.haystack) {
		m := f.fsm.FindAtNoState(f.prestate, f.haystack, f.pos)
		if m == nil {
			return nil
		}

		// A match ending exactly at the current position would leave pos
		// unchanged, so step one byte forward to guarantee progress.
		if m.end == f.pos {
			f.pos++
		} else {
			f.pos = m.end
		}

		if !f.matchOnlyWholeWords || matchIsWholeWord(f.haystack, m) {
			return m
		}
	}
	return nil
}

// overlappingIter yields matches, including overlapping ones, of an
// automaton over a haystack. stateID and matchIndex are handed to the
// automaton by pointer on every call so it can resume where it left off.
type overlappingIter struct {
	fsm                 imp
	prestate            *prefilterState
	haystack            []byte
	pos                 int // offset at which the next search starts
	stateID             stateID
	matchIndex          int
	matchOnlyWholeWords bool
}

// Next returns a pointer to the next (possibly overlapping) match, or nil if
// there is none.
func (f *overlappingIter) Next() *Match {
	// Iterate instead of recursing so that a long run of rejected
	// (non-whole-word) matches cannot grow the call stack.
	for f.pos <= len(f.haystack) {
		m := f.fsm.OverlappingFindAt(f.prestate, f.haystack, f.pos, &f.stateID, &f.matchIndex)
		if m == nil {
			return nil
		}
		f.pos = m.End()

		if !f.matchOnlyWholeWords || matchIsWholeWord(f.haystack, m) {
			return m
		}
	}
	return nil
}

// newOverlappingIter creates an overlappingIter positioned at the beginning
// of haystack and at the start state of ac's automaton.
func newOverlappingIter(ac AhoCorasick, haystack []byte) overlappingIter {
	// All other prefilterState fields start at their zero value.
	prestate := prefilterState{maxMatchLen: ac.i.MaxPatternLen()}
	return overlappingIter{
		fsm:                 ac.i,
		prestate:            &prestate,
		haystack:            haystack,
		pos:                 0,
		stateID:             ac.i.StartState(),
		matchIndex:          0,
		matchOnlyWholeWords: ac.matchOnlyWholeWords,
	}
}

// make sure the AhoCorasick data structure implements the Finder interface
var _ Finder = (*AhoCorasick)(nil)

// AhoCorasick is the main data structure that does most of the work.
type AhoCorasick struct {
	i                   imp
	matchKind           matchKind
	matchOnlyWholeWords bool
}

// PatternCount returns the pattern count reported by the underlying automaton.
func (ac AhoCorasick) PatternCount() int {
	return ac.i.PatternCount()
}

// Iter returns an iterator over the non-overlapping matches of the built
// patterns in haystack. The string is converted to a byte slice first.
func (ac AhoCorasick) Iter(haystack string) Iter {
	return ac.IterByte([]byte(haystack))
}

// IterByte returns an iterator over the non-overlapping matches of the built
// patterns in haystack.
func (ac AhoCorasick) IterByte(haystack []byte) Iter {
	return &findIter{
		fsm: ac.i,
		// All other prefilterState fields start at their zero value.
		prestate:            &prefilterState{maxMatchLen: ac.i.MaxPatternLen()},
		haystack:            haystack,
		pos:                 0,
		matchOnlyWholeWords: ac.matchOnlyWholeWords,
	}
}

// IterOverlapping returns an iterator over the matches of the built patterns
// in haystack, including overlapping matches. The string is converted to a
// byte slice first.
//
// It panics if the automaton was not built with StandardMatch.
func (ac AhoCorasick) IterOverlapping(haystack string) Iter {
	return ac.IterOverlappingByte([]byte(haystack))
}

// IterOverlappingByte returns an iterator over the matches of the built
// patterns in haystack, including overlapping matches.
//
// It panics if the automaton was not built with StandardMatch.
func (ac AhoCorasick) IterOverlappingByte(haystack []byte) Iter {
	if ac.matchKind != StandardMatch {
		panic("only StandardMatch allowed for overlapping matches")
	}
	it := newOverlappingIter(ac, haystack)
	return &it
}

// pool recycles the builders used by Replacer. It stores pointers so that
// Get and Put neither copy a strings.Builder nor box a struct value into an
// interface.
var pool = sync.Pool{
	New: func() any {
		return new(strings.Builder)
	},
}

// Replacer replaces the matches reported by a Finder.
type Replacer struct {
	finder Finder
}

// NewReplacer creates a Replacer that replaces the matches reported by finder.
//
//goland:noinspection GoUnusedExportedFunction
func NewReplacer(finder Finder) Replacer {
	return Replacer{finder: finder}
}

// ReplaceAllFunc replaces the matches found in the haystack according to the
// user provided function, which gives fine-grained control over what is
// replaced.
//
// f is called once per match, in the order returned by the Finder, and
// returns the replacement text. A user can stop the replacing process early
// by returning false; in that case everything from that match onwards is
// kept as in the original haystack. If there are no matches, haystack is
// returned unchanged.
func (r Replacer) ReplaceAllFunc(haystack string, f func(match Match) (string, bool)) string {
	matches := r.finder.FindAll(haystack)
	if len(matches) == 0 {
		return haystack
	}

	sb := pool.Get().(*strings.Builder)
	defer func() {
		sb.Reset()
		pool.Put(sb)
	}()
	sb.Grow(len(haystack))

	// start is the offset of the first haystack byte not yet copied to sb.
	start := 0
	for _, m := range matches {
		replacement, ok := f(m)
		if !ok {
			break
		}
		sb.WriteString(haystack[start:m.Start()])
		sb.WriteString(replacement)
		start = m.End()
	}

	// Copy whatever follows the last replaced match (possibly nothing).
	sb.WriteString(haystack[start:])
	return sb.String()
}

// ReplaceAll replaces the matches found in the haystack according to the
// user provided slice `replaceWith`, which is indexed by the pattern index of
// each match.
//
// It panics if `replaceWith` has a length different from the number of
// patterns the Finder was built with.
func (r Replacer) ReplaceAll(haystack string, replaceWith []string) string {
	if len(replaceWith) != r.finder.PatternCount() {
		panic("replaceWith needs to have the same length as the pattern count")
	}
	return r.ReplaceAllFunc(haystack, func(match Match) (string, bool) {
		return replaceWith[match.pattern], true
	})
}

// Finder is what a Replacer needs from a matcher: all matches in a haystack
// and the number of patterns.
type Finder interface {
	FindAll(haystack string) []Match
	PatternCount() int
}

// FindAll returns the matches found in the haystack, in the order the
// iterator returned by Iter yields them. The result is empty, but not nil,
// when nothing matches.
func (ac AhoCorasick) FindAll(haystack string) []Match {
	it := ac.Iter(haystack)
	matches := make([]Match, 0)
	for m := it.Next(); m != nil; m = it.Next() {
		matches = append(matches, *m)
	}
	return matches
}

// AhoCorasickBuilder defines a set of options applied before the patterns
// are built.
type AhoCorasickBuilder struct {
	dfaBuilder          *iDFABuilder
	nfaBuilder          *iNFABuilder
	dfa                 bool
	matchOnlyWholeWords bool
}

// Opts defines a set of options applied before the patterns are built.
type Opts struct {
	AsciiCaseInsensitive bool
	MatchOnlyWholeWords  bool
	MatchKind            matchKind
	DFA                  bool
}

// NewAhoCorasickBuilder creates a new AhoCorasickBuilder based on Opts.
//
//goland:noinspection GoUnusedExportedFunction
func NewAhoCorasickBuilder(o Opts) AhoCorasickBuilder {
	return AhoCorasickBuilder{
		dfaBuilder:          newDFABuilder(),
		nfaBuilder:          newNFABuilder(o.MatchKind, o.AsciiCaseInsensitive),
		dfa:                 o.DFA,
		matchOnlyWholeWords: o.MatchOnlyWholeWords,
	}
}

// Build builds a (non)deterministic finite automaton from the user provided
// patterns. Each pattern is converted to a byte slice and passed to BuildByte.
func (a *AhoCorasickBuilder) Build(patterns []string) AhoCorasick {
	bytePatterns := make([][]byte, len(patterns))
	for i, pat := range patterns {
		bytePatterns[i] = []byte(pat)
	}
	return a.BuildByte(bytePatterns)
}

// BuildByte builds a (non)deterministic finite automaton from the user
// provided patterns. An NFA is always built first; when the DFA option is
// set, a DFA is built from that NFA and used instead.
func (a *AhoCorasickBuilder) BuildByte(patterns [][]byte) AhoCorasick {
	nfa := a.nfaBuilder.build(patterns)
	kind := nfa.matchKind

	if a.dfa {
		dfa := a.dfaBuilder.build(nfa)
		return AhoCorasick{i: dfa, matchKind: kind, matchOnlyWholeWords: a.matchOnlyWholeWords}
	}
	return AhoCorasick{i: nfa, matchKind: kind, matchOnlyWholeWords: a.matchOnlyWholeWords}
}

// imp is the set of operations AhoCorasick and its iterators require from an
// automaton. The automata produced by both the NFA and the DFA builder are
// stored in this interface.
type imp interface {
	MatchKind() *matchKind
	StartState() stateID
	MaxPatternLen() int
	PatternCount() int
	Prefilter() prefilter
	UsePrefilter() bool
	OverlappingFindAt(prestate *prefilterState, haystack []byte, at int, stateId *stateID, matchIndex *int) *Match
	EarliestFindAt(prestate *prefilterState, haystack []byte, at int, stateId *stateID) *Match
	FindAtNoState(prestate *prefilterState, haystack []byte, at int) *Match
}

// matchKind selects the match semantics of an automaton.
type matchKind int

const (
	// StandardMatch Use standard match semantics, which support overlapping matches. When
	// used with non-overlapping matches, matches are reported as they are seen.
	StandardMatch matchKind = iota
	// LeftMostFirstMatch Use leftmost-first match semantics, which reports leftmost matches.
	// When there are multiple possible leftmost matches, the match
	// corresponding to the pattern that appeared earlier when constructing
	// the automaton is reported.
	// This does **not** support overlapping matches or stream searching
	LeftMostFirstMatch
	// LeftMostLongestMatch Use leftmost-longest match semantics, which reports leftmost matches.
	// When there are multiple possible leftmost matches, the longest match is chosen.
	LeftMostLongestMatch
)

// supportsOverlapping reports whether m allows overlapping matches; only
// StandardMatch does.
func (m matchKind) supportsOverlapping() bool {
	return m.isStandard()
}

// supportsStream reports whether m allows stream searching; only
// StandardMatch does.
func (m matchKind) supportsStream() bool {
	return m.isStandard()
}

// isStandard reports whether m is StandardMatch.
func (m matchKind) isStandard() bool {
	return m == StandardMatch
}

// isLeftmost reports whether m is one of the leftmost match kinds.
func (m matchKind) isLeftmost() bool {
	return m == LeftMostFirstMatch || m == LeftMostLongestMatch
}

// isLeftmostFirst reports whether m is LeftMostFirstMatch.
func (m matchKind) isLeftmostFirst() bool {
	return m == LeftMostFirstMatch
}

// Match A representation of a match reported by an Aho-Corasick automaton.
//
// A match has two essential pieces of information: the identifier of the
// pattern that matched, along with the start and end offsets of the match
// in the haystack.
type Match struct {
	pattern int
	len     int
	end     int
}

// Pattern returns the index of the pattern in the slice of the patterns
// provided by the user that was matched.
func (m *Match) Pattern() int {
	return m.pattern
}

// End returns the offset just past the last byte of this match inside the
// haystack, so that haystack[m.Start():m.End()] is the matched text.
func (m *Match) End() int {
	return m.end
}

// Start returns the offset of the first byte of this match inside the
// haystack.
func (m *Match) Start() int {
	return m.end - m.len
}

// stateID identifies a state of an automaton.
type stateID uint

const (
	failedStateID stateID = 0
	deadStateID   stateID = 1
)