You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ntool/nstr/ac/prefilter.go

602 lines
11 KiB
Go

1 year ago
package ac
import (
"math"
)
type startBytesThree struct {
byte1 byte
byte2 byte
byte3 byte
}
func (s *startBytesThree) NextCandidate(_ *prefilterState, haystack []byte, at int) (any, candidateType) {
1 year ago
for i, b := range haystack[at:] {
if s.byte1 == b || s.byte2 == b || s.byte3 == b {
return at + i, possibleStartOfMatchCandidate
}
}
return nil, noneCandidate
}
func (s *startBytesThree) HeapBytes() int {
return 0
}
func (s *startBytesThree) ReportsFalsePositives() bool {
return true
}
func (s *startBytesThree) LooksForNonStartOfMatch() bool {
return false
}
func (s *startBytesThree) clone() prefilter {
if s == nil {
return nil
}
u := *s
return &u
}
type startBytesTwo struct {
byte1 byte
byte2 byte
}
func (s *startBytesTwo) NextCandidate(_ *prefilterState, haystack []byte, at int) (any, candidateType) {
1 year ago
for i, b := range haystack[at:] {
if s.byte1 == b || s.byte2 == b {
return at + i, possibleStartOfMatchCandidate
}
}
return nil, noneCandidate
}
func (s *startBytesTwo) HeapBytes() int {
return 0
}
func (s *startBytesTwo) ReportsFalsePositives() bool {
return true
}
func (s *startBytesTwo) LooksForNonStartOfMatch() bool {
return false
}
func (s *startBytesTwo) clone() prefilter {
if s == nil {
return nil
}
u := *s
return &u
}
type startBytesOne struct {
byte1 byte
}
func (s *startBytesOne) NextCandidate(_ *prefilterState, haystack []byte, at int) (any, candidateType) {
1 year ago
for i, b := range haystack[at:] {
if s.byte1 == b {
return at + i, possibleStartOfMatchCandidate
}
}
return nil, noneCandidate
}
func (s *startBytesOne) HeapBytes() int {
return 0
}
func (s *startBytesOne) ReportsFalsePositives() bool {
return true
}
func (s *startBytesOne) LooksForNonStartOfMatch() bool {
return false
}
func (s *startBytesOne) clone() prefilter {
if s == nil {
return nil
}
u := *s
return &u
}
type byteSet [256]bool
func (b *byteSet) contains(bb byte) bool {
return b[int(bb)]
}
func (b *byteSet) insert(bb byte) bool {
n := !b.contains(bb)
b[int(bb)] = true
return n
}
type rareByteOffset struct {
max byte
}
type rareByteOffsets struct {
rbo [256]rareByteOffset
}
func (r *rareByteOffsets) set(b byte, off rareByteOffset) {
m := byte(max(int(r.rbo[int(b)].max), int(off.max)))
r.rbo[int(b)].max = m
}
type prefilterBuilder struct {
count int
asciiCaseInsensitive bool
startBytes startBytesBuilder
rareBytes rareBytesBuilder
}
func (p *prefilterBuilder) build() prefilter {
startBytes := p.startBytes.build()
rareBytes := p.rareBytes.build()
switch true {
case startBytes != nil && rareBytes != nil:
hasFewerBytes := p.startBytes.count < p.rareBytes.count
hasRarerBytes := p.startBytes.rankSum <= p.rareBytes.rankSum+50
if hasFewerBytes || hasRarerBytes {
return startBytes
} else {
return rareBytes
}
case startBytes != nil:
return startBytes
case rareBytes != nil:
return rareBytes
case p.asciiCaseInsensitive:
return nil
default:
return nil
}
}
func (p *prefilterBuilder) add(bytes []byte) {
p.count += 1
p.startBytes.add(bytes)
p.rareBytes.add(bytes)
}
func newPrefilterBuilder(asciiCaseInsensitive bool) prefilterBuilder {
return prefilterBuilder{
count: 0,
asciiCaseInsensitive: asciiCaseInsensitive,
startBytes: newStartBytesBuilder(asciiCaseInsensitive),
rareBytes: newRareBytesBuilder(asciiCaseInsensitive),
}
}
type rareBytesBuilder struct {
asciiCaseInsensitive bool
rareSet byteSet
byteOffsets rareByteOffsets
available bool
count int
rankSum uint16
}
type rareBytesOne struct {
byte1 byte
offset rareByteOffset
}
func (r *rareBytesOne) NextCandidate(state *prefilterState, haystack []byte, at int) (any, candidateType) {
1 year ago
for i, b := range haystack[at:] {
if r.byte1 == b {
pos := at + i
state.lastScanAt = pos
r := pos - int(r.offset.max)
if r < 0 {
r = 0
}
if at > r {
r = at
}
return r, possibleStartOfMatchCandidate
}
}
return nil, noneCandidate
}
func (r *rareBytesOne) HeapBytes() int {
return 0
}
func (r *rareBytesOne) ReportsFalsePositives() bool {
return true
}
func (r *rareBytesOne) LooksForNonStartOfMatch() bool {
return true
}
func (r *rareBytesOne) clone() prefilter {
if r == nil {
return nil
}
u := *r
return &u
}
type rareBytesTwo struct {
offsets rareByteOffsets
byte1 byte
byte2 byte
}
func (r *rareBytesTwo) NextCandidate(state *prefilterState, haystack []byte, at int) (any, candidateType) {
1 year ago
for i, b := range haystack[at:] {
if r.byte1 == b || r.byte2 == b {
pos := at + i
state.updateAt(pos)
r := pos - int(r.offsets.rbo[haystack[pos]].max)
if r < 0 {
r = 0
}
if at > r {
r = at
}
return r, possibleStartOfMatchCandidate
}
}
return nil, noneCandidate
}
func (r *rareBytesTwo) HeapBytes() int {
return 0
}
func (r *rareBytesTwo) ReportsFalsePositives() bool {
return true
}
func (r *rareBytesTwo) LooksForNonStartOfMatch() bool {
return true
}
func (r *rareBytesTwo) clone() prefilter {
if r == nil {
return nil
}
u := *r
return &u
}
type rareBytesThree struct {
offsets rareByteOffsets
byte1 byte
byte2 byte
byte3 byte
}
func (r *rareBytesThree) NextCandidate(state *prefilterState, haystack []byte, at int) (any, candidateType) {
1 year ago
for i, b := range haystack[at:] {
if r.byte1 == b || r.byte2 == b || r.byte3 == b {
pos := at + i
state.updateAt(pos)
r := pos - int(r.offsets.rbo[haystack[pos]].max)
if r < 0 {
r = 0
}
if at > r {
r = at
}
return r, possibleStartOfMatchCandidate
}
}
return nil, noneCandidate
}
func (r *rareBytesThree) HeapBytes() int {
return 0
}
func (r *rareBytesThree) ReportsFalsePositives() bool {
return true
}
func (r *rareBytesThree) LooksForNonStartOfMatch() bool {
return true
}
func (r *rareBytesThree) clone() prefilter {
if r == nil {
return nil
}
u := *r
return &u
}
func (r *rareBytesBuilder) build() prefilter {
if !r.available || r.count > 3 {
return nil
}
var length int
bytes := [3]byte{}
for b := 0; b <= 255; b++ {
if r.rareSet.contains(byte(b)) {
bytes[length] = byte(b)
length += 1
}
}
switch length {
case 0:
return nil
case 1:
return &rareBytesOne{
byte1: bytes[0],
offset: r.byteOffsets.rbo[bytes[0]],
}
case 2:
return &rareBytesTwo{
offsets: r.byteOffsets,
byte1: bytes[0],
byte2: bytes[1],
}
case 3:
return &rareBytesThree{
offsets: r.byteOffsets,
byte1: bytes[0],
byte2: bytes[1],
byte3: bytes[2],
}
default:
return nil
}
}
func (r *rareBytesBuilder) add(bytes []byte) {
if !r.available {
return
}
if r.count > 3 {
r.available = false
return
}
if len(bytes) >= 256 {
r.available = false
return
}
if len(bytes) == 0 {
return
}
rarest1, rarest2 := bytes[0], freqRank(bytes[0])
found := false
for pos, b := range bytes {
r.setOffset(pos, b)
if found {
continue
}
if r.rareSet.contains(b) {
found = true
}
rank := freqRank(b)
if rank < rarest2 {
rarest1 = b
rarest2 = rank
}
if !found {
r.addRareByte(rarest1)
}
}
}
func (r *rareBytesBuilder) addRareByte(b byte) {
r.addOneRareByte(b)
if r.asciiCaseInsensitive {
r.addOneRareByte(oppositeAsciiCase(b))
}
}
func (r *rareBytesBuilder) addOneRareByte(b byte) {
if r.rareSet.insert(b) {
r.count += 1
r.rankSum += uint16(freqRank(b))
}
}
func newRareByteOffset(i int) rareByteOffset {
if i > math.MaxUint8 {
return rareByteOffset{max: 0}
}
b := byte(i)
return rareByteOffset{max: b}
}
func (r *rareBytesBuilder) setOffset(pos int, b byte) {
offset := newRareByteOffset(pos)
r.byteOffsets.set(b, offset)
if r.asciiCaseInsensitive {
r.byteOffsets.set(oppositeAsciiCase(b), offset)
}
}
func newRareBytesBuilder(asciiCaseInsensitive bool) rareBytesBuilder {
return rareBytesBuilder{
asciiCaseInsensitive: asciiCaseInsensitive,
rareSet: byteSet{},
byteOffsets: rareByteOffsets{},
available: true,
count: 0,
rankSum: 0,
}
}
type startBytesBuilder struct {
asciiCaseInsensitive bool
byteSet []bool
count int
rankSum uint16
}
func (s *startBytesBuilder) build() prefilter {
if s.count > 3 {
return nil
}
var length int
bytes := [3]byte{}
for b := 0; b < 256; b++ {
//todo case insensitive is not set in byteSet
if !s.byteSet[b] {
continue
}
if b > 0x7F {
return nil
}
bytes[length] = byte(b)
length += 1
}
switch length {
case 0:
return nil
case 1:
return &startBytesOne{byte1: bytes[0]}
case 2:
return &startBytesTwo{
byte1: bytes[0],
byte2: bytes[1],
}
case 3:
return &startBytesThree{
byte1: bytes[0],
byte2: bytes[1],
byte3: bytes[2],
}
default:
return nil
}
}
func (s *startBytesBuilder) add(bytes []byte) {
if s.count > 3 || len(bytes) == 0 {
return
}
b := bytes[0]
s.addOneByte(b)
if s.asciiCaseInsensitive {
s.addOneByte(oppositeAsciiCase(b))
}
}
func (s *startBytesBuilder) addOneByte(b byte) {
if !s.byteSet[int(b)] {
s.byteSet[int(b)] = true
s.count += 1
s.rankSum += uint16(freqRank(b))
}
}
func freqRank(b byte) byte {
return byteFrequencies[int(b)]
}
func newStartBytesBuilder(asciiCaseInsensitive bool) startBytesBuilder {
return startBytesBuilder{
asciiCaseInsensitive: asciiCaseInsensitive,
byteSet: make([]bool, 256),
count: 0,
rankSum: 0,
}
}
const minSkips int = 40
const minAvgFactor int = 2
type prefilterState struct {
skips int
skipped int
maxMatchLen int
inert bool
lastScanAt int
}
func (p *prefilterState) updateAt(at int) {
if at > p.lastScanAt {
p.lastScanAt = at
}
}
func (p *prefilterState) IsEffective(at int) bool {
if p.inert || at < p.lastScanAt {
return false
}
if p.skips < minSkips {
return true
}
minAvg := minAvgFactor * p.maxMatchLen
if p.skipped >= minAvg*p.skips {
return true
}
p.inert = true
return false
}
func (p *prefilterState) updateSkippedBytes(skipped int) {
p.skips += 1
p.skipped += skipped
}
type candidateType uint
const (
noneCandidate candidateType = iota
matchCandidate
possibleStartOfMatchCandidate
)
type prefilter interface {
NextCandidate(state *prefilterState, haystack []byte, at int) (any, candidateType)
1 year ago
HeapBytes() int
ReportsFalsePositives() bool
LooksForNonStartOfMatch() bool
clone() prefilter
}
func nextPrefilter(state *prefilterState, prefilter prefilter, haystack []byte, at int) (any, candidateType) {
1 year ago
candidate, typ := prefilter.NextCandidate(state, haystack, at)
switch typ {
case noneCandidate:
state.updateSkippedBytes(len(haystack) - at)
case matchCandidate:
m := candidate.(*Match)
state.updateSkippedBytes(m.Start() - at)
case possibleStartOfMatchCandidate:
i := candidate.(int)
state.updateSkippedBytes(i - at)
}
return candidate, typ
}