Merge pull request #15666 from spray/w/15665-fix-BoyerMoore
=hco #15665 fix BoyerMoore for byte values > 0x7f
This commit is contained in:
commit
2c8b8af8e0
2 changed files with 34 additions and 20 deletions
|
|
@ -19,7 +19,7 @@ private class BoyerMoore(needle: Array[Byte]) {
|
||||||
val table = Array.fill(256)(needle.length)
|
val table = Array.fill(256)(needle.length)
|
||||||
@tailrec def rec(i: Int): Unit =
|
@tailrec def rec(i: Int): Unit =
|
||||||
if (i < nl1) {
|
if (i < nl1) {
|
||||||
table(needle(i)) = nl1 - i
|
table(needle(i) & 0xff) = nl1 - i
|
||||||
rec(i + 1)
|
rec(i + 1)
|
||||||
}
|
}
|
||||||
rec(0)
|
rec(0)
|
||||||
|
|
@ -61,7 +61,7 @@ private class BoyerMoore(needle: Array[Byte]) {
|
||||||
if (needle(j) == byte) {
|
if (needle(j) == byte) {
|
||||||
if (j == 0) i // found
|
if (j == 0) i // found
|
||||||
else rec(i - 1, j - 1)
|
else rec(i - 1, j - 1)
|
||||||
} else rec(i + math.max(offsetTable(nl1 - j), charTable(byte)), nl1)
|
} else rec(i + math.max(offsetTable(nl1 - j), charTable(byte & 0xff)), nl1)
|
||||||
}
|
}
|
||||||
rec(offset + nl1, nl1)
|
rec(offset + nl1, nl1)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,8 @@
|
||||||
|
|
||||||
package akka.http.parsing
|
package akka.http.parsing
|
||||||
|
|
||||||
|
import java.util.regex.Pattern
|
||||||
|
|
||||||
import akka.parboiled2.CharPredicate
|
import akka.parboiled2.CharPredicate
|
||||||
|
|
||||||
import scala.annotation.tailrec
|
import scala.annotation.tailrec
|
||||||
|
|
@ -18,33 +20,42 @@ class BoyerMooreSpec extends WordSpec with Matchers {
|
||||||
"The Boyer Moore implementation" should {
|
"The Boyer Moore implementation" should {
|
||||||
|
|
||||||
"correctly find all matches of a few handwritten string-search examples" in {
|
"correctly find all matches of a few handwritten string-search examples" in {
|
||||||
find("foo", "the foe in moofoo is foobar") shouldEqual Seq(14, 21)
|
findString("foo", "the foe in moofoo is foobar") shouldEqual Seq(14, 21)
|
||||||
find("ana", "bananas") shouldEqual Seq(1, 3)
|
findString("ana", "bananas") shouldEqual Seq(1, 3)
|
||||||
find("anna", "bananas") shouldEqual Seq()
|
findString("anna", "bananas") shouldEqual Seq()
|
||||||
}
|
}
|
||||||
|
|
||||||
"perform identically to a regex search" in {
|
"perform identically to a regex search" in {
|
||||||
val random = new Random()
|
val random = new Random()
|
||||||
// the alphabet base is a random shuffle of 8 distinct alphanumeric chars
|
// the alphabet base is a random shuffle of 8 distinct byte values (0x00 to 0xff)
|
||||||
val alphabetBase = random.shuffle('0' to 'z' filter CharPredicate.AlphaNum) take 8
|
val alphabetBase: IndexedSeq[Byte] = random.shuffle(0 to 255).take(8).map(_.toByte)
|
||||||
|
|
||||||
val haystackLen = 1000
|
val haystackLen = 1000
|
||||||
(0 to 9) foreach { run ⇒
|
(0 to 9) foreach { run ⇒
|
||||||
val alphabet = alphabetBase.take(4 + random.nextInt(5)).mkString // 4 to 8 distinct alphanumeric chars
|
val alphabet = alphabetBase.take(4 + random.nextInt(5)) // 4 to 8 distinct byte values
|
||||||
val randomAlphabetChars = Stream.continually(alphabet.charAt(random.nextInt(alphabet.length)))
|
val randomAlphabetChars = Stream.continually(alphabet(random.nextInt(alphabet.length)))
|
||||||
val haystack = randomAlphabetChars.take(haystackLen).mkString
|
def randomBytes(num: Int): ByteString = ByteString(randomAlphabetChars.take(num): _*)
|
||||||
val needle = randomAlphabetChars.take(run / 3 + 3).mkString // 3 to 6 random alphabet chars
|
val haystack = randomBytes(haystackLen)
|
||||||
|
val needle = randomBytes(run / 3 + 3) // 3 to 6 random alphabet chars
|
||||||
|
|
||||||
val bmFinds = find(needle, haystack, skipFindsThatStartInFinds = true)
|
val bmFinds = find(needle, haystack, skipFindsThatStartInFinds = true)
|
||||||
val reFinds = findWithRegex(needle, haystack)
|
val reFinds = findWithRegex(needle, haystack)
|
||||||
if (bmFinds != reFinds) {
|
if (bmFinds != reFinds) {
|
||||||
def showFind(ix: Int): String =
|
def showBytes(bs: Seq[Byte]): String = bs.map(b ⇒ (b & 0xff).formatted("%02x")).mkString(" ")
|
||||||
s"""...${haystack.substring(math.max(ix - 8, 0), math.min(ix + needle.length + 8, haystack.length))}...
|
def len(num: Int) = num * 2 + math.max(0, num - 1)
|
||||||
|${" " * (3 + math.min(ix - 8, 8))}${"^" * needle.length}
|
|
||||||
|
def showFind(ix: Int): String = {
|
||||||
|
val startIdx = math.max(ix - 8, 0)
|
||||||
|
val endIdx = math.min(ix + needle.length + 8, haystack.length)
|
||||||
|
|
||||||
|
s"""...${showBytes(haystack.drop(startIdx).take(endIdx - startIdx))}...
|
||||||
|
|${" " * (3 + math.min(8, ix) * 3)}${"^" * len(needle.length)}
|
||||||
|""".stripMargin
|
|""".stripMargin
|
||||||
|
}
|
||||||
val foundOnlyByBM = bmFinds.filterNot(reFinds.contains).map(showFind).mkString
|
val foundOnlyByBM = bmFinds.filterNot(reFinds.contains).map(showFind).mkString
|
||||||
val foundOnlyByRE = reFinds.filterNot(bmFinds.contains).map(showFind).mkString
|
val foundOnlyByRE = reFinds.filterNot(bmFinds.contains).map(showFind).mkString
|
||||||
fail(s"""alphabet: $alphabet
|
fail(s"""alphabet: ${showBytes(alphabet)}
|
||||||
|needle: $needle
|
|needle: ${showBytes(needle)}
|
||||||
|found only by boyer moore:
|
|found only by boyer moore:
|
||||||
|$foundOnlyByBM
|
|$foundOnlyByBM
|
||||||
|found only by regex:
|
|found only by regex:
|
||||||
|
|
@ -55,17 +66,20 @@ class BoyerMooreSpec extends WordSpec with Matchers {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
def find(needle: String, haystack: String, skipFindsThatStartInFinds: Boolean = false): Seq[Int] = {
|
def findString(needle: String, haystack: String, skipFindsThatStartInFinds: Boolean = false): Seq[Int] =
|
||||||
val boyerMoore = new BoyerMoore(needle.asciiBytes)
|
find(ByteString(needle), ByteString(haystack), skipFindsThatStartInFinds)
|
||||||
|
|
||||||
|
def find(needle: ByteString, haystack: ByteString, skipFindsThatStartInFinds: Boolean = false): Seq[Int] = {
|
||||||
|
val boyerMoore = new BoyerMoore(needle.toArray[Byte])
|
||||||
@tailrec def rec(offset: Int, result: Seq[Int]): Seq[Int] = {
|
@tailrec def rec(offset: Int, result: Seq[Int]): Seq[Int] = {
|
||||||
val ix =
|
val ix =
|
||||||
try boyerMoore.nextIndex(ByteString(haystack), offset)
|
try boyerMoore.nextIndex(haystack, offset)
|
||||||
catch { case NotEnoughDataException ⇒ -1 }
|
catch { case NotEnoughDataException ⇒ -1 }
|
||||||
if (ix >= 0) rec(if (skipFindsThatStartInFinds) ix + needle.length else ix + 1, result :+ ix) else result
|
if (ix >= 0) rec(if (skipFindsThatStartInFinds) ix + needle.length else ix + 1, result :+ ix) else result
|
||||||
}
|
}
|
||||||
rec(0, Seq.empty)
|
rec(0, Seq.empty)
|
||||||
}
|
}
|
||||||
|
|
||||||
def findWithRegex(needle: String, haystack: String): Seq[Int] =
|
def findWithRegex(needle: ByteString, haystack: ByteString): Seq[Int] =
|
||||||
needle.r.findAllMatchIn(haystack).map(_.start).toSeq
|
Pattern.quote(needle.map(_.toChar).mkString).r.findAllMatchIn(haystack.map(_.toChar).mkString).map(_.start).toSeq
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue