Merge pull request #15666 from spray/w/15665-fix-BoyerMoore
=hco #15665 fix BoyerMoore for byte values > 0x7f
This commit is contained in:
commit
2c8b8af8e0
2 changed files with 34 additions and 20 deletions
|
|
@ -19,7 +19,7 @@ private class BoyerMoore(needle: Array[Byte]) {
|
|||
val table = Array.fill(256)(needle.length)
|
||||
@tailrec def rec(i: Int): Unit =
|
||||
if (i < nl1) {
|
||||
table(needle(i)) = nl1 - i
|
||||
table(needle(i) & 0xff) = nl1 - i
|
||||
rec(i + 1)
|
||||
}
|
||||
rec(0)
|
||||
|
|
@ -61,7 +61,7 @@ private class BoyerMoore(needle: Array[Byte]) {
|
|||
if (needle(j) == byte) {
|
||||
if (j == 0) i // found
|
||||
else rec(i - 1, j - 1)
|
||||
} else rec(i + math.max(offsetTable(nl1 - j), charTable(byte)), nl1)
|
||||
} else rec(i + math.max(offsetTable(nl1 - j), charTable(byte & 0xff)), nl1)
|
||||
}
|
||||
rec(offset + nl1, nl1)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,6 +4,8 @@
|
|||
|
||||
package akka.http.parsing
|
||||
|
||||
import java.util.regex.Pattern
|
||||
|
||||
import akka.parboiled2.CharPredicate
|
||||
|
||||
import scala.annotation.tailrec
|
||||
|
|
@ -18,33 +20,42 @@ class BoyerMooreSpec extends WordSpec with Matchers {
|
|||
"The Boyer Moore implementation" should {
|
||||
|
||||
"correctly find all matches of a few handwritten string-search examples" in {
|
||||
find("foo", "the foe in moofoo is foobar") shouldEqual Seq(14, 21)
|
||||
find("ana", "bananas") shouldEqual Seq(1, 3)
|
||||
find("anna", "bananas") shouldEqual Seq()
|
||||
findString("foo", "the foe in moofoo is foobar") shouldEqual Seq(14, 21)
|
||||
findString("ana", "bananas") shouldEqual Seq(1, 3)
|
||||
findString("anna", "bananas") shouldEqual Seq()
|
||||
}
|
||||
|
||||
"perform identically to a regex search" in {
|
||||
val random = new Random()
|
||||
// the alphabet base is a random shuffle of 8 distinct byte values (0x00 to 0xff)
|
||||
val alphabetBase = random.shuffle('0' to 'z' filter CharPredicate.AlphaNum) take 8
|
||||
val alphabetBase: IndexedSeq[Byte] = random.shuffle(0 to 255).take(8).map(_.toByte)
|
||||
|
||||
val haystackLen = 1000
|
||||
(0 to 9) foreach { run ⇒
|
||||
val alphabet = alphabetBase.take(4 + random.nextInt(5)).mkString // 4 to 8 distinct alphanumeric chars
|
||||
val randomAlphabetChars = Stream.continually(alphabet.charAt(random.nextInt(alphabet.length)))
|
||||
val haystack = randomAlphabetChars.take(haystackLen).mkString
|
||||
val needle = randomAlphabetChars.take(run / 3 + 3).mkString // 3 to 6 random alphabet chars
|
||||
val alphabet = alphabetBase.take(4 + random.nextInt(5)) // 4 to 8 distinct byte values
|
||||
val randomAlphabetChars = Stream.continually(alphabet(random.nextInt(alphabet.length)))
|
||||
def randomBytes(num: Int): ByteString = ByteString(randomAlphabetChars.take(num): _*)
|
||||
val haystack = randomBytes(haystackLen)
|
||||
val needle = randomBytes(run / 3 + 3) // 3 to 6 random bytes drawn from the alphabet
|
||||
|
||||
val bmFinds = find(needle, haystack, skipFindsThatStartInFinds = true)
|
||||
val reFinds = findWithRegex(needle, haystack)
|
||||
if (bmFinds != reFinds) {
|
||||
def showFind(ix: Int): String =
|
||||
s"""...${haystack.substring(math.max(ix - 8, 0), math.min(ix + needle.length + 8, haystack.length))}...
|
||||
|${" " * (3 + math.min(ix - 8, 8))}${"^" * needle.length}
|
||||
def showBytes(bs: Seq[Byte]): String = bs.map(b ⇒ (b & 0xff).formatted("%02x")).mkString(" ")
|
||||
def len(num: Int) = num * 2 + math.max(0, num - 1)
|
||||
|
||||
def showFind(ix: Int): String = {
|
||||
val startIdx = math.max(ix - 8, 0)
|
||||
val endIdx = math.min(ix + needle.length + 8, haystack.length)
|
||||
|
||||
s"""...${showBytes(haystack.drop(startIdx).take(endIdx - startIdx))}...
|
||||
|${" " * (3 + math.min(8, ix) * 3)}${"^" * len(needle.length)}
|
||||
|""".stripMargin
|
||||
}
|
||||
val foundOnlyByBM = bmFinds.filterNot(reFinds.contains).map(showFind).mkString
|
||||
val foundOnlyByRE = reFinds.filterNot(bmFinds.contains).map(showFind).mkString
|
||||
fail(s"""alphabet: $alphabet
|
||||
|needle: $needle
|
||||
fail(s"""alphabet: ${showBytes(alphabet)}
|
||||
|needle: ${showBytes(needle)}
|
||||
|found only by boyer moore:
|
||||
|$foundOnlyByBM
|
||||
|found only by regex:
|
||||
|
|
@ -55,17 +66,20 @@ class BoyerMooreSpec extends WordSpec with Matchers {
|
|||
}
|
||||
}
|
||||
|
||||
def find(needle: String, haystack: String, skipFindsThatStartInFinds: Boolean = false): Seq[Int] = {
|
||||
val boyerMoore = new BoyerMoore(needle.asciiBytes)
|
||||
def findString(needle: String, haystack: String, skipFindsThatStartInFinds: Boolean = false): Seq[Int] =
|
||||
find(ByteString(needle), ByteString(haystack), skipFindsThatStartInFinds)
|
||||
|
||||
def find(needle: ByteString, haystack: ByteString, skipFindsThatStartInFinds: Boolean = false): Seq[Int] = {
|
||||
val boyerMoore = new BoyerMoore(needle.toArray[Byte])
|
||||
@tailrec def rec(offset: Int, result: Seq[Int]): Seq[Int] = {
|
||||
val ix =
|
||||
try boyerMoore.nextIndex(ByteString(haystack), offset)
|
||||
try boyerMoore.nextIndex(haystack, offset)
|
||||
catch { case NotEnoughDataException ⇒ -1 }
|
||||
if (ix >= 0) rec(if (skipFindsThatStartInFinds) ix + needle.length else ix + 1, result :+ ix) else result
|
||||
}
|
||||
rec(0, Seq.empty)
|
||||
}
|
||||
|
||||
def findWithRegex(needle: String, haystack: String): Seq[Int] =
|
||||
needle.r.findAllMatchIn(haystack).map(_.start).toSeq
|
||||
def findWithRegex(needle: ByteString, haystack: ByteString): Seq[Int] =
|
||||
Pattern.quote(needle.map(_.toChar).mkString).r.findAllMatchIn(haystack.map(_.toChar).mkString).map(_.start).toSeq
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue