ntool/nstr/ac/nfa.go

package ac

import (
	"sort"
	"unsafe"
)

type iNFA struct {
	matchKind     matchKind
	startID       stateID
	maxPatternLen int
	patternCount  int
	heapBytes     int
	prefilter     prefilter
	anchored      bool
	byteClasses   byteClasses
	states        []state
}

func (n *iNFA) FindAtNoState(prefilterState *prefilterState, bytes []byte, i int) *Match {
	return findAtNoState(n, prefilterState, bytes, i)
}

func (n *iNFA) Repr() *iRepr {
	return nil
}

func (n *iNFA) MatchKind() *matchKind {
	return &n.matchKind
}

func (n *iNFA) Anchored() bool {
	return n.anchored
}

func (n *iNFA) Prefilter() prefilter {
	return n.prefilter
}

func (n *iNFA) StartState() stateID {
	return n.startID
}

func (n *iNFA) IsValid(id stateID) bool {
	return int(id) < len(n.states)
}

func (n *iNFA) IsMatchState(id stateID) bool {
	return n.state(id).isMatch()
}

func (n *iNFA) IsMatchOrDeadState(id stateID) bool {
	return isMatchOrDeadState(n, id)
}

func (n *iNFA) MatchCount(id stateID) int {
	return len(n.states[id].matches)
}

func (n *iNFA) NextState(id stateID, b byte) stateID {
	for {
		state := n.states[id]
		next := state.nextState(b)
		if next != failedStateID {
			return next
		}
		id = state.fail
	}
}

func (n *iNFA) NextStateNoFail(id stateID, b byte) stateID {
	next := n.NextState(id, b)
	if next == failedStateID {
		panic("automaton should never return fail_id for next state")
	}
	return next
}

func (n *iNFA) StandardFindAt(prefilterState *prefilterState, bytes []byte, i int, id *stateID) *Match {
	return standardFindAt(n, prefilterState, bytes, i, id)
}

func (n *iNFA) StandardFindAtImp(prefilterState *prefilterState, prefilter prefilter, bytes []byte, i int, id *stateID) *Match {
	return standardFindAtImp(n, prefilterState, prefilter, bytes, i, id)
}

func (n *iNFA) LeftmostFindAt(prefilterState *prefilterState, bytes []byte, i int, id *stateID) *Match {
	return leftmostFindAt(n, prefilterState, bytes, i, id)
}

func (n *iNFA) LeftmostFindAtImp(prefilterState *prefilterState, prefilter prefilter, bytes []byte, i int, id *stateID) *Match {
	return leftmostFindAtImp(n, prefilterState, prefilter, bytes, i, id)
}

func (n *iNFA) LeftmostFindAtNoState(prefilterState *prefilterState, bytes []byte, i int) *Match {
	return leftmostFindAtNoState(n, prefilterState, bytes, i)
}

func (n *iNFA) LeftmostFindAtNoStateImp(prefilterState *prefilterState, prefilter prefilter, bytes []byte, i int) *Match {
	return leftmostFindAtNoStateImp(n, prefilterState, prefilter, bytes, i)
}

func (n *iNFA) OverlappingFindAt(prefilterState *prefilterState, bytes []byte, i int, id *stateID, i2 *int) *Match {
	return overlappingFindAt(n, prefilterState, bytes, i, id, i2)
}

func (n *iNFA) EarliestFindAt(prefilterState *prefilterState, bytes []byte, i int, id *stateID) *Match {
	return earliestFindAt(n, prefilterState, bytes, i, id)
}

func (n *iNFA) FindAt(prefilterState *prefilterState, bytes []byte, i int, id *stateID) *Match {
	return findAt(n, prefilterState, bytes, i, id)
}

func (n *iNFA) MaxPatternLen() int {
	return n.maxPatternLen
}

func (n *iNFA) PatternCount() int {
	return n.patternCount
}

func (n *iNFA) UsePrefilter() bool {
	p := n.Prefilter()
	if p == nil {
		return false
	}
	return !p.LooksForNonStartOfMatch()
}

func (n *iNFA) GetMatch(id stateID, matchIndex int, end int) *Match {
	if int(id) >= len(n.states) {
		return nil
	}
	state := n.states[id]
	if matchIndex >= len(state.matches) {
		return nil
	}
	pat := state.matches[matchIndex]
	return &Match{
		pattern: pat.PatternID,
		len:     pat.PatternLength,
		end:     end,
	}
}

func (n *iNFA) addDenseState(depth int) stateID {
	d := newDense()
	trans := transitions{dense: &d}
	id := stateID(len(n.states))

	fail := n.startID

	if n.anchored {
		fail = deadStateID
	}

	n.states = append(n.states, state{
		trans:   trans,
		fail:    fail,
		matches: nil,
		depth:   depth,
	})
	return id
}

func (n *iNFA) addSparseState(depth int) stateID {
	trans := transitions{sparse: &sparse{inner: nil}}
	id := stateID(len(n.states))

	fail := n.startID

	if n.anchored {
		fail = deadStateID
	}

	n.states = append(n.states, state{
		trans:   trans,
		fail:    fail,
		matches: nil,
		depth:   depth,
	})
	return id
}

func (n *iNFA) state(id stateID) *state {
	return &n.states[int(id)]
}

type compiler struct {
	builder          iNFABuilder
	prefilter        prefilterBuilder
	nfa              iNFA
	byteClassBuilder byteClassBuilder
}

func (c *compiler) compile(patterns [][]byte) *iNFA {
	c.addState(0)
	c.addState(0)
	c.addState(0)

	c.buildTrie(patterns)

	c.addStartStateLoop()
	c.addDeadStateLoop()

	if !c.builder.anchored {
		if c.builder.matchKind.isLeftmost() {
			c.fillFailureTransitionsLeftmost()
		} else {
			c.fillFailureTransitionsStandard()
		}
	}
	c.closeStartStateLoop()

	c.nfa.byteClasses = c.byteClassBuilder.build()
	if !c.builder.anchored {
		c.nfa.prefilter = c.prefilter.build()
	}
	c.calculateSize()

	return &c.nfa
}

func (c *compiler) calculateSize() {
	var size int
	for _, state := range c.nfa.states {
		size += state.heapBytes()
	}

	c.nfa.heapBytes = size
}

func (c *compiler) closeStartStateLoop() {
	if c.builder.anchored || (c.builder.matchKind.isLeftmost() && c.nfa.state(c.nfa.startID).isMatch()) {
		startId := c.nfa.startID
		start := c.nfa.state(startId)

		for b := 0; b < 256; b++ {
			if start.nextState(byte(b)) == startId {
				start.setNextState(byte(b), deadStateID)
			}
		}
	}
}

type queuedState struct {
	id           stateID
	matchAtDepth *int
}

func startQueuedState(nfa *iNFA) queuedState {
	var matchAtDepth *int
	if nfa.states[nfa.startID].isMatch() {
		r := 0
		matchAtDepth = &r
	}
	return queuedState{id: nfa.startID, matchAtDepth: matchAtDepth}
}

func (q *queuedState) nextQueuedState(nfa *iNFA, id stateID) queuedState {
	nextMatchAtDepth := q.nextMatchAtDepth(nfa, id)
	return queuedState{id, nextMatchAtDepth}
}

func (q *queuedState) nextMatchAtDepth(
	nfa *iNFA,
	next stateID,
) *int {
	switch q.matchAtDepth {
	case nil:
		if !nfa.state(next).isMatch() {
			return nil
		}
	default:
		return q.matchAtDepth
	}

	depth := nfa.state(next).depth - *nfa.state(next).getLongestMatch() + 1
	return &depth
}

func (c *compiler) fillFailureTransitionsStandard() {
	queue := make([]stateID, 0)
	seen := c.queuedSet()

	for b := 0; b < 256; b++ {
		next := c.nfa.state(c.nfa.startID).nextState(byte(b))
		if next != c.nfa.startID {
			if !seen.contains(next) {
				queue = append(queue, next)
				seen.insert(next)
			}
		}
	}

	for len(queue) > 0 {
		id := queue[0]
		queue = queue[1:]
		it := newIterTransitions(&c.nfa, id)

		for next := it.next(); next != nil; next = it.next() {
			if seen.contains(next.id) {
				continue
			}
			queue = append(queue, next.id)
			seen.insert(next.id)

			fail := it.nfa.state(id).fail
			for it.nfa.state(fail).nextState(next.key) == failedStateID {
				fail = it.nfa.state(fail).fail
			}
			fail = it.nfa.state(fail).nextState(next.key)
			it.nfa.state(next.id).fail = fail
			it.nfa.copyMatches(fail, next.id)
		}
		it.nfa.copyEmptyMatches(id)
	}
}

func (c *compiler) fillFailureTransitionsLeftmost() {
	queue := make([]queuedState, 0)
	seen := c.queuedSet()
	start := startQueuedState(&c.nfa)

	for b := 0; b < 256; b++ {
		nextId := c.nfa.state(c.nfa.startID).nextState(byte(b))
		if nextId != start.id {
			next := start.nextQueuedState(&c.nfa, nextId)
			if !seen.contains(next.id) {
				queue = append(queue, next)
				seen.insert(next.id)
			}
			if c.nfa.state(nextId).isMatch() {
				c.nfa.state(nextId).fail = deadStateID
			}
		}
	}

	for len(queue) > 0 {
		item := queue[0]
		queue = queue[1:]
		anyTrans := false
		it := newIterTransitions(&c.nfa, item.id)
		tr := it.next()
		for tr != nil {
			anyTrans = true
			next := item.nextQueuedState(it.nfa, tr.id)
			if seen.contains(next.id) {
				tr = it.next()
				continue
			}
			queue = append(queue, next)
			seen.insert(next.id)

			fail := it.nfa.state(item.id).fail
			for it.nfa.state(fail).nextState(tr.key) == failedStateID {
				fail = it.nfa.state(fail).fail
			}
			fail = it.nfa.state(fail).nextState(tr.key)

			if next.matchAtDepth != nil {
				failDepth := it.nfa.state(fail).depth
				nextDepth := it.nfa.state(next.id).depth
				if nextDepth-*next.matchAtDepth+1 > failDepth {
					it.nfa.state(next.id).fail = deadStateID
					tr = it.next()
					continue
				}

				if start.id == it.nfa.state(next.id).fail {
					panic("states that are match states or follow match states should never have a failure transition back to the start state in leftmost searching")
				}
			}
			it.nfa.state(next.id).fail = fail
			it.nfa.copyMatches(fail, next.id)
			tr = it.next()
		}
		if !anyTrans && it.nfa.state(item.id).isMatch() {
			it.nfa.state(item.id).fail = deadStateID
		}
	}
}

func (n *iNFA) copyEmptyMatches(dst stateID) {
	n.copyMatches(n.startID, dst)
}

func (n *iNFA) copyMatches(src stateID, dst stateID) {
	srcState, dstState := n.getTwo(src, dst)
	dstState.matches = append(dstState.matches, srcState.matches...)
}

func (n *iNFA) getTwo(i stateID, j stateID) (*state, *state) {
	if i == j {
		panic("src and dst should not be equal")
	}

	if i < j {
		before, after := n.states[0:j], n.states[j:]
		return &before[i], &after[0]
	}

	before, after := n.states[0:i], n.states[i:]
	return &after[0], &before[j]
}

func (n *iNFA) iterAllTransitions(byteClasses *byteClasses, id stateID, f func(tr *next)) {
	n.states[id].trans.iterAll(byteClasses, f)
}

func newIterTransitions(nfa *iNFA, stateId stateID) iterTransitions {
	return iterTransitions{
		nfa:     nfa,
		stateId: stateId,
		cur:     0,
	}
}

type iterTransitions struct {
	nfa     *iNFA
	stateId stateID
	cur     int
}

type next struct {
	key byte
	id  stateID
}

func (i *iterTransitions) next() *next {
	sparse := i.nfa.states[int(i.stateId)].trans.sparse
	if sparse != nil {
		if i.cur >= len(sparse.inner) {
			return nil
		}
		ii := i.cur
		i.cur += 1
		return &next{
			key: sparse.inner[ii].b,
			id:  sparse.inner[ii].s,
		}
	}

	dense := i.nfa.states[int(i.stateId)].trans.dense
	for i.cur < len(dense.inner) {
		if i.cur >= 256 {
			panic("There are always exactly 255 transitions in dense repr")
		}

		b := byte(i.cur)
		id := dense.inner[b]
		i.cur += 1
		if id != failedStateID {
			return &next{
				key: b,
				id:  id,
			}
		}
	}
	return nil
}

type queuedSet struct {
	set map[stateID]struct{}
	ind int
}

func newInertQueuedSet() queuedSet {
	return queuedSet{
		set: make(map[stateID]struct{}),
		ind: 0,
	}
}

func (q *queuedSet) contains(s stateID) bool {
	_, ok := q.set[s]
	return ok
}

func (q *queuedSet) insert(s stateID) {
	q.set[s] = struct{}{}
}

func newActiveQueuedSet() queuedSet {
	return queuedSet{
		set: make(map[stateID]struct{}, 0),
		ind: 0,
	}
}

func (c *compiler) queuedSet() queuedSet {
	if c.builder.asciiCaseInsensitive {
		return newActiveQueuedSet()
	}
	return newInertQueuedSet()
}

func (c *compiler) addStartStateLoop() {
	startId := c.nfa.startID
	start := c.nfa.state(startId)
	for b := 0; b < 256; b++ {
		if start.nextState(byte(b)) == failedStateID {
			start.setNextState(byte(b), startId)
		}
	}
}

func (c *compiler) addDeadStateLoop() {
	dead := c.nfa.state(deadStateID)
	for b := 0; b < 256; b++ {
		dead.setNextState(byte(b), deadStateID)
	}
}

func (c *compiler) buildTrie(patterns [][]byte) {
Patterns:
	for i, pat := range patterns {
		c.nfa.maxPatternLen = max(c.nfa.maxPatternLen, len(pat))
		c.nfa.patternCount += 1

		prev := c.nfa.startID
		sawMatch := false

		for depth, b := range pat {
			sawMatch = sawMatch || c.nfa.state(prev).isMatch()
			if c.builder.matchKind.isLeftmostFirst() && sawMatch {
				continue Patterns
			}

			c.byteClassBuilder.setRange(b, b)

			if c.builder.asciiCaseInsensitive {
				b := oppositeAsciiCase(b)
				c.byteClassBuilder.setRange(b, b)
			}

			next := c.nfa.state(prev).nextState(b)

			if next != failedStateID {
				prev = next
			} else {
				next := c.addState(depth + 1)
				c.nfa.state(prev).setNextState(b, next)
				if c.builder.asciiCaseInsensitive {
					b := oppositeAsciiCase(b)
					c.nfa.state(prev).setNextState(b, next)
				}
				prev = next
			}
		}
		c.nfa.state(prev).addMatch(i, len(pat))

		if c.builder.prefilter {
			c.prefilter.add(pat)
		}
	}
}

const asciiCaseMask byte = 0b0010_0000

func toAsciiLowercase(b byte) byte {
	return b | (1 * asciiCaseMask)
}

func toAsciiUpper(b byte) byte {
	b &= ^(1 * asciiCaseMask)
	return b
}

func oppositeAsciiCase(b byte) byte {
	if 'A' <= b && b <= 'Z' {
		return toAsciiLowercase(b)
	} else if 'a' <= b && b <= 'z' {
		return toAsciiUpper(b)
	}
	return b
}

func (c *compiler) addState(depth int) stateID {
	if depth < c.builder.denseDepth {
		return c.nfa.addDenseState(depth)

	}
	return c.nfa.addSparseState(depth)
}

func newCompiler(builder iNFABuilder) compiler {
	p := newPrefilterBuilder(builder.asciiCaseInsensitive)

	return compiler{
		builder:   builder,
		prefilter: p,
		nfa: iNFA{
			matchKind:     builder.matchKind,
			startID:       2,
			maxPatternLen: 0,
			patternCount:  0,
			heapBytes:     0,
			prefilter:     nil,
			anchored:      builder.anchored,
			byteClasses:   singletons(),
			states:        nil,
		},
		byteClassBuilder: newByteClassBuilder(),
	}
}

type iNFABuilder struct {
	denseDepth           int
	matchKind            matchKind
	prefilter            bool
	anchored             bool
	asciiCaseInsensitive bool
}

func newNFABuilder(kind matchKind, asciiCaseInsensitive bool) *iNFABuilder {
	return &iNFABuilder{
		denseDepth:           2,
		matchKind:            kind,
		prefilter:            true,
		anchored:             false,
		asciiCaseInsensitive: asciiCaseInsensitive,
	}
}

func (b *iNFABuilder) build(patterns [][]byte) *iNFA {
	c := newCompiler(*b)
	return c.compile(patterns)
}

type state struct {
	trans   transitions
	fail    stateID
	matches []pattern
	depth   int
}

func (s *state) heapBytes() int {
	var i int
	intSize := int(unsafe.Sizeof(i))
	return s.trans.heapBytes() + (len(s.matches) * (intSize * 2))
}

func (s *state) addMatch(patternID, patternLength int) {
	s.matches = append(s.matches, pattern{
		PatternID:     patternID,
		PatternLength: patternLength,
	})
}

func (s *state) isMatch() bool {
	return len(s.matches) > 0
}

func (s *state) getLongestMatch() *int {
	if len(s.matches) == 0 {
		return nil
	}
	longest := s.matches[0].PatternLength
	return &longest
}

func (s *state) nextState(input byte) stateID {
	return s.trans.nextState(input)
}

func (s *state) setNextState(input byte, next stateID) {
	s.trans.setNextState(input, next)
}

type transitions struct {
	sparse *sparse
	dense  *dense
}

func sparseIter(trans []innerSparse, f func(*next)) {
	var byte16 uint16

	for _, tr := range trans {
		for byte16 < uint16(tr.b) {
			f(&next{
				key: byte(byte16),
				id:  failedStateID,
			})
			byte16 += 1
		}
		f(&next{
			key: tr.b,
			id:  tr.s,
		})
		byte16 += 1
	}

	for b := byte16; b < 256; b++ {
		f(&next{
			key: byte(b),
			id:  failedStateID,
		})
	}
}

func (t *transitions) iterAll(byteClasses *byteClasses, f func(tr *next)) {
	if byteClasses.isSingleton() {
		if t.sparse != nil {
			sparseIter(t.sparse.inner, f)
		}

		if t.dense != nil {
			for b := 0; b < 256; b++ {
				f(&next{
					key: byte(b),
					id:  t.dense.inner[b],
				})
			}
		}
	} else {
		if t.sparse != nil {
			var lastClass *byte

			sparseIter(t.sparse.inner, func(n *next) {
				class := byteClasses.bytes[n.key]

				if lastClass == nil || *lastClass != class {
					cc := class
					lastClass = &cc
					f(n)
				}
			})
		}

		if t.dense != nil {
			bcr := byteClassRepresentatives{
				classes:   byteClasses,
				bbyte:     0,
				lastClass: nil,
			}

			for n := bcr.next(); n != nil; n = bcr.next() {
				f(&next{
					key: *n,
					id:  t.dense.inner[*n],
				})
			}
		}
	}

}

func (t *transitions) heapBytes() int {
	var i int
	intSize := int(unsafe.Sizeof(i))
	if t.sparse != nil {
		return len(t.sparse.inner) * (2 * intSize)
	}
	return len(t.dense.inner) * intSize
}

func (t *transitions) nextState(input byte) stateID {
	if t.sparse != nil {
		for _, sp := range t.sparse.inner {
			if sp.b == input {
				return sp.s
			}
		}
		return failedStateID
	}
	return t.dense.inner[input]
}

func (t *transitions) setNextState(input byte, next stateID) {
	if t.sparse != nil {
		idx := sort.Search(len(t.sparse.inner), func(i int) bool {
			return t.sparse.inner[i].b >= input
		})

		if idx < len(t.sparse.inner) && t.sparse.inner[idx].b == input {
			t.sparse.inner[idx].s = next
		} else {
			if len(t.sparse.inner) > 0 {
				is := innerSparse{
					b: input,
					s: next,
				}
				if idx == len(t.sparse.inner) {
					t.sparse.inner = append(t.sparse.inner, is)
				} else {
					t.sparse.inner = append(
						t.sparse.inner[:idx+1],
						t.sparse.inner[idx:]...)
					t.sparse.inner[idx] = is
				}
			} else {
				t.sparse.inner = []innerSparse{
					{
						b: input,
						s: next,
					},
				}
			}
		}
		return
	}
	t.dense.inner[int(input)] = next
}

func newDense() dense {
	return dense{inner: make([]stateID, 256)}
}

type dense struct {
	inner []stateID
}

type innerSparse struct {
	b byte
	s stateID
}

type sparse struct {
	inner []innerSparse
}