Merge pull request #15666 from spray/w/15665-fix-BoyerMoore

=hco #15665 fix BoyerMoore for byte values > 0x7f
This commit is contained in:
Roland Kuhn 2014-08-19 12:39:46 +02:00
commit 2c8b8af8e0
2 changed files with 34 additions and 20 deletions

View file

@ -19,7 +19,7 @@ private class BoyerMoore(needle: Array[Byte]) {
val table = Array.fill(256)(needle.length)
@tailrec def rec(i: Int): Unit =
if (i < nl1) {
table(needle(i)) = nl1 - i
table(needle(i) & 0xff) = nl1 - i
rec(i + 1)
}
rec(0)
@ -61,7 +61,7 @@ private class BoyerMoore(needle: Array[Byte]) {
if (needle(j) == byte) {
if (j == 0) i // found
else rec(i - 1, j - 1)
} else rec(i + math.max(offsetTable(nl1 - j), charTable(byte)), nl1)
} else rec(i + math.max(offsetTable(nl1 - j), charTable(byte & 0xff)), nl1)
}
rec(offset + nl1, nl1)
}

View file

@ -4,6 +4,8 @@
package akka.http.parsing
import java.util.regex.Pattern
import akka.parboiled2.CharPredicate
import scala.annotation.tailrec
@ -18,33 +20,42 @@ class BoyerMooreSpec extends WordSpec with Matchers {
"The Boyer Moore implementation" should {
"correctly find all matches of a few handwritten string-search examples" in {
find("foo", "the foe in moofoo is foobar") shouldEqual Seq(14, 21)
find("ana", "bananas") shouldEqual Seq(1, 3)
find("anna", "bananas") shouldEqual Seq()
findString("foo", "the foe in moofoo is foobar") shouldEqual Seq(14, 21)
findString("ana", "bananas") shouldEqual Seq(1, 3)
findString("anna", "bananas") shouldEqual Seq()
}
"perform identically to a regex search" in {
val random = new Random()
// the alphabet base is a random shuffle of 8 distinct alphanumeric chars
val alphabetBase = random.shuffle('0' to 'z' filter CharPredicate.AlphaNum) take 8
val alphabetBase: IndexedSeq[Byte] = random.shuffle(0 to 255).take(8).map(_.toByte)
val haystackLen = 1000
(0 to 9) foreach { run =>
val alphabet = alphabetBase.take(4 + random.nextInt(5)).mkString // 4 to 8 distinct alphanumeric chars
val randomAlphabetChars = Stream.continually(alphabet.charAt(random.nextInt(alphabet.length)))
val haystack = randomAlphabetChars.take(haystackLen).mkString
val needle = randomAlphabetChars.take(run / 3 + 3).mkString // 3 to 6 random alphabet chars
val alphabet = alphabetBase.take(4 + random.nextInt(5)) // 4 to 8 distinct alphanumeric chars
val randomAlphabetChars = Stream.continually(alphabet(random.nextInt(alphabet.length)))
def randomBytes(num: Int): ByteString = ByteString(randomAlphabetChars.take(num): _*)
val haystack = randomBytes(haystackLen)
val needle = randomBytes(run / 3 + 3) // 3 to 6 random alphabet chars
val bmFinds = find(needle, haystack, skipFindsThatStartInFinds = true)
val reFinds = findWithRegex(needle, haystack)
if (bmFinds != reFinds) {
def showFind(ix: Int): String =
s"""...${haystack.substring(math.max(ix - 8, 0), math.min(ix + needle.length + 8, haystack.length))}...
|${" " * (3 + math.min(ix - 8, 8))}${"^" * needle.length}
def showBytes(bs: Seq[Byte]): String = bs.map(b => (b & 0xff).formatted("%02x")).mkString(" ")
def len(num: Int) = num * 2 + math.max(0, num - 1)
def showFind(ix: Int): String = {
val startIdx = math.max(ix - 8, 0)
val endIdx = math.min(ix + needle.length + 8, haystack.length)
s"""...${showBytes(haystack.drop(startIdx).take(endIdx - startIdx))}...
|${" " * (3 + math.min(8, ix) * 3)}${"^" * len(needle.length)}
|""".stripMargin
}
val foundOnlyByBM = bmFinds.filterNot(reFinds.contains).map(showFind).mkString
val foundOnlyByRE = reFinds.filterNot(bmFinds.contains).map(showFind).mkString
fail(s"""alphabet: $alphabet
|needle: $needle
fail(s"""alphabet: ${showBytes(alphabet)}
|needle: ${showBytes(needle)}
|found only by boyer moore:
|$foundOnlyByBM
|found only by regex:
@ -55,17 +66,20 @@ class BoyerMooreSpec extends WordSpec with Matchers {
}
}
def find(needle: String, haystack: String, skipFindsThatStartInFinds: Boolean = false): Seq[Int] = {
val boyerMoore = new BoyerMoore(needle.asciiBytes)
def findString(needle: String, haystack: String, skipFindsThatStartInFinds: Boolean = false): Seq[Int] =
find(ByteString(needle), ByteString(haystack), skipFindsThatStartInFinds)
def find(needle: ByteString, haystack: ByteString, skipFindsThatStartInFinds: Boolean = false): Seq[Int] = {
val boyerMoore = new BoyerMoore(needle.toArray[Byte])
@tailrec def rec(offset: Int, result: Seq[Int]): Seq[Int] = {
val ix =
try boyerMoore.nextIndex(ByteString(haystack), offset)
try boyerMoore.nextIndex(haystack, offset)
catch { case NotEnoughDataException => -1 }
if (ix >= 0) rec(if (skipFindsThatStartInFinds) ix + needle.length else ix + 1, result :+ ix) else result
}
rec(0, Seq.empty)
}
def findWithRegex(needle: String, haystack: String): Seq[Int] =
needle.r.findAllMatchIn(haystack).map(_.start).toSeq
def findWithRegex(needle: ByteString, haystack: ByteString): Seq[Int] =
Pattern.quote(needle.map(_.toChar).mkString).r.findAllMatchIn(haystack.map(_.toChar).mkString).map(_.start).toSeq
}