Merge pull request #15666 from spray/w/15665-fix-BoyerMoore

=hco #15665 fix BoyerMoore for byte values > 0x7f
This commit is contained in:
Roland Kuhn 2014-08-19 12:39:46 +02:00
commit 2c8b8af8e0
2 changed files with 34 additions and 20 deletions

View file

@@ -19,7 +19,7 @@ private class BoyerMoore(needle: Array[Byte]) {
val table = Array.fill(256)(needle.length) val table = Array.fill(256)(needle.length)
@tailrec def rec(i: Int): Unit = @tailrec def rec(i: Int): Unit =
if (i < nl1) { if (i < nl1) {
table(needle(i)) = nl1 - i table(needle(i) & 0xff) = nl1 - i
rec(i + 1) rec(i + 1)
} }
rec(0) rec(0)
@@ -61,7 +61,7 @@ private class BoyerMoore(needle: Array[Byte]) {
if (needle(j) == byte) { if (needle(j) == byte) {
if (j == 0) i // found if (j == 0) i // found
else rec(i - 1, j - 1) else rec(i - 1, j - 1)
} else rec(i + math.max(offsetTable(nl1 - j), charTable(byte)), nl1) } else rec(i + math.max(offsetTable(nl1 - j), charTable(byte & 0xff)), nl1)
} }
rec(offset + nl1, nl1) rec(offset + nl1, nl1)
} }

View file

@@ -4,6 +4,8 @@
package akka.http.parsing package akka.http.parsing
import java.util.regex.Pattern
import akka.parboiled2.CharPredicate import akka.parboiled2.CharPredicate
import scala.annotation.tailrec import scala.annotation.tailrec
@@ -18,33 +20,42 @@ class BoyerMooreSpec extends WordSpec with Matchers {
"The Boyer Moore implementation" should { "The Boyer Moore implementation" should {
"correctly find all matches of a few handwritten string-search examples" in { "correctly find all matches of a few handwritten string-search examples" in {
find("foo", "the foe in moofoo is foobar") shouldEqual Seq(14, 21) findString("foo", "the foe in moofoo is foobar") shouldEqual Seq(14, 21)
find("ana", "bananas") shouldEqual Seq(1, 3) findString("ana", "bananas") shouldEqual Seq(1, 3)
find("anna", "bananas") shouldEqual Seq() findString("anna", "bananas") shouldEqual Seq()
} }
"perform identically to a regex search" in { "perform identically to a regex search" in {
val random = new Random() val random = new Random()
// the alphabet base is a random shuffle of 8 distinct alphanumeric chars // the alphabet base is a random shuffle of 8 distinct alphanumeric chars
val alphabetBase = random.shuffle('0' to 'z' filter CharPredicate.AlphaNum) take 8 val alphabetBase: IndexedSeq[Byte] = random.shuffle(0 to 255).take(8).map(_.toByte)
val haystackLen = 1000 val haystackLen = 1000
(0 to 9) foreach { run => (0 to 9) foreach { run =>
val alphabet = alphabetBase.take(4 + random.nextInt(5)).mkString // 4 to 8 distinct alphanumeric chars val alphabet = alphabetBase.take(4 + random.nextInt(5)) // 4 to 8 distinct alphanumeric chars
val randomAlphabetChars = Stream.continually(alphabet.charAt(random.nextInt(alphabet.length))) val randomAlphabetChars = Stream.continually(alphabet(random.nextInt(alphabet.length)))
val haystack = randomAlphabetChars.take(haystackLen).mkString def randomBytes(num: Int): ByteString = ByteString(randomAlphabetChars.take(num): _*)
val needle = randomAlphabetChars.take(run / 3 + 3).mkString // 3 to 6 random alphabet chars val haystack = randomBytes(haystackLen)
val needle = randomBytes(run / 3 + 3) // 3 to 6 random alphabet chars
val bmFinds = find(needle, haystack, skipFindsThatStartInFinds = true) val bmFinds = find(needle, haystack, skipFindsThatStartInFinds = true)
val reFinds = findWithRegex(needle, haystack) val reFinds = findWithRegex(needle, haystack)
if (bmFinds != reFinds) { if (bmFinds != reFinds) {
def showFind(ix: Int): String = def showBytes(bs: Seq[Byte]): String = bs.map(b => (b & 0xff).formatted("%02x")).mkString(" ")
s"""...${haystack.substring(math.max(ix - 8, 0), math.min(ix + needle.length + 8, haystack.length))}... def len(num: Int) = num * 2 + math.max(0, num - 1)
|${" " * (3 + math.min(ix - 8, 8))}${"^" * needle.length}
def showFind(ix: Int): String = {
val startIdx = math.max(ix - 8, 0)
val endIdx = math.min(ix + needle.length + 8, haystack.length)
s"""...${showBytes(haystack.drop(startIdx).take(endIdx - startIdx))}...
|${" " * (3 + math.min(8, ix) * 3)}${"^" * len(needle.length)}
|""".stripMargin |""".stripMargin
}
val foundOnlyByBM = bmFinds.filterNot(reFinds.contains).map(showFind).mkString val foundOnlyByBM = bmFinds.filterNot(reFinds.contains).map(showFind).mkString
val foundOnlyByRE = reFinds.filterNot(bmFinds.contains).map(showFind).mkString val foundOnlyByRE = reFinds.filterNot(bmFinds.contains).map(showFind).mkString
fail(s"""alphabet: $alphabet fail(s"""alphabet: ${showBytes(alphabet)}
|needle: $needle |needle: ${showBytes(needle)}
|found only by boyer moore: |found only by boyer moore:
|$foundOnlyByBM |$foundOnlyByBM
|found only by regex: |found only by regex:
@@ -55,17 +66,20 @@ class BoyerMooreSpec extends WordSpec with Matchers {
} }
} }
def find(needle: String, haystack: String, skipFindsThatStartInFinds: Boolean = false): Seq[Int] = { def findString(needle: String, haystack: String, skipFindsThatStartInFinds: Boolean = false): Seq[Int] =
val boyerMoore = new BoyerMoore(needle.asciiBytes) find(ByteString(needle), ByteString(haystack), skipFindsThatStartInFinds)
def find(needle: ByteString, haystack: ByteString, skipFindsThatStartInFinds: Boolean = false): Seq[Int] = {
val boyerMoore = new BoyerMoore(needle.toArray[Byte])
@tailrec def rec(offset: Int, result: Seq[Int]): Seq[Int] = { @tailrec def rec(offset: Int, result: Seq[Int]): Seq[Int] = {
val ix = val ix =
try boyerMoore.nextIndex(ByteString(haystack), offset) try boyerMoore.nextIndex(haystack, offset)
catch { case NotEnoughDataException => -1 } catch { case NotEnoughDataException => -1 }
if (ix >= 0) rec(if (skipFindsThatStartInFinds) ix + needle.length else ix + 1, result :+ ix) else result if (ix >= 0) rec(if (skipFindsThatStartInFinds) ix + needle.length else ix + 1, result :+ ix) else result
} }
rec(0, Seq.empty) rec(0, Seq.empty)
} }
def findWithRegex(needle: String, haystack: String): Seq[Int] = def findWithRegex(needle: ByteString, haystack: ByteString): Seq[Int] =
needle.r.findAllMatchIn(haystack).map(_.start).toSeq Pattern.quote(needle.map(_.toChar).mkString).r.findAllMatchIn(haystack.map(_.toChar).mkString).map(_.start).toSeq
} }