diff --git a/akka-http-core/src/main/scala/akka/http/parsing/BoyerMoore.scala b/akka-http-core/src/main/scala/akka/http/parsing/BoyerMoore.scala index caca7c7163..ced45eac29 100644 --- a/akka-http-core/src/main/scala/akka/http/parsing/BoyerMoore.scala +++ b/akka-http-core/src/main/scala/akka/http/parsing/BoyerMoore.scala @@ -19,7 +19,7 @@ private class BoyerMoore(needle: Array[Byte]) { val table = Array.fill(256)(needle.length) @tailrec def rec(i: Int): Unit = if (i < nl1) { - table(needle(i)) = nl1 - i + table(needle(i) & 0xff) = nl1 - i rec(i + 1) } rec(0) @@ -61,7 +61,7 @@ private class BoyerMoore(needle: Array[Byte]) { if (needle(j) == byte) { if (j == 0) i // found else rec(i - 1, j - 1) - } else rec(i + math.max(offsetTable(nl1 - j), charTable(byte)), nl1) + } else rec(i + math.max(offsetTable(nl1 - j), charTable(byte & 0xff)), nl1) } rec(offset + nl1, nl1) } diff --git a/akka-http-core/src/test/scala/akka/http/parsing/BoyerMooreSpec.scala b/akka-http-core/src/test/scala/akka/http/parsing/BoyerMooreSpec.scala index c40f72adac..26af7378c4 100644 --- a/akka-http-core/src/test/scala/akka/http/parsing/BoyerMooreSpec.scala +++ b/akka-http-core/src/test/scala/akka/http/parsing/BoyerMooreSpec.scala @@ -4,6 +4,8 @@ package akka.http.parsing +import java.util.regex.Pattern + import akka.parboiled2.CharPredicate import scala.annotation.tailrec @@ -18,33 +20,42 @@ class BoyerMooreSpec extends WordSpec with Matchers { "The Boyer Moore implementation" should { "correctly find all matches of a few handwritten string-search examples" in { - find("foo", "the foe in moofoo is foobar") shouldEqual Seq(14, 21) - find("ana", "bananas") shouldEqual Seq(1, 3) - find("anna", "bananas") shouldEqual Seq() + findString("foo", "the foe in moofoo is foobar") shouldEqual Seq(14, 21) + findString("ana", "bananas") shouldEqual Seq(1, 3) + findString("anna", "bananas") shouldEqual Seq() } "perform identically to a regex search" in { val random = new Random() // the alphabet base is a random 
shuffle of 8 distinct alphanumeric chars - val alphabetBase = random.shuffle('0' to 'z' filter CharPredicate.AlphaNum) take 8 + val alphabetBase: IndexedSeq[Byte] = random.shuffle(0 to 255).take(8).map(_.toByte) + val haystackLen = 1000 (0 to 9) foreach { run ⇒ - val alphabet = alphabetBase.take(4 + random.nextInt(5)).mkString // 4 to 8 distinct alphanumeric chars - val randomAlphabetChars = Stream.continually(alphabet.charAt(random.nextInt(alphabet.length))) - val haystack = randomAlphabetChars.take(haystackLen).mkString - val needle = randomAlphabetChars.take(run / 3 + 3).mkString // 3 to 6 random alphabet chars + val alphabet = alphabetBase.take(4 + random.nextInt(5)) // 4 to 8 distinct byte values + val randomAlphabetChars = Stream.continually(alphabet(random.nextInt(alphabet.length))) + def randomBytes(num: Int): ByteString = ByteString(randomAlphabetChars.take(num): _*) + val haystack = randomBytes(haystackLen) + val needle = randomBytes(run / 3 + 3) // 3 to 6 random alphabet bytes val bmFinds = find(needle, haystack, skipFindsThatStartInFinds = true) val reFinds = findWithRegex(needle, haystack) if (bmFinds != reFinds) { - def showFind(ix: Int): String = - s"""...${haystack.substring(math.max(ix - 8, 0), math.min(ix + needle.length + 8, haystack.length))}... - |${" " * (3 + math.min(ix - 8, 8))}${"^" * needle.length} + def showBytes(bs: Seq[Byte]): String = bs.map(b ⇒ (b & 0xff).formatted("%02x")).mkString(" ") + def len(num: Int) = num * 2 + math.max(0, num - 1) + + def showFind(ix: Int): String = { + val startIdx = math.max(ix - 8, 0) + val endIdx = math.min(ix + needle.length + 8, haystack.length) + + s"""...${showBytes(haystack.drop(startIdx).take(endIdx - startIdx))}... 
+ |${" " * (3 + math.min(8, ix) * 3)}${"^" * len(needle.length)} |""".stripMargin + } val foundOnlyByBM = bmFinds.filterNot(reFinds.contains).map(showFind).mkString val foundOnlyByRE = reFinds.filterNot(bmFinds.contains).map(showFind).mkString - fail(s"""alphabet: $alphabet - |needle: $needle + fail(s"""alphabet: ${showBytes(alphabet)} + |needle: ${showBytes(needle)} |found only by boyer moore: |$foundOnlyByBM |found only by regex: @@ -55,17 +66,20 @@ class BoyerMooreSpec extends WordSpec with Matchers { } } - def find(needle: String, haystack: String, skipFindsThatStartInFinds: Boolean = false): Seq[Int] = { - val boyerMoore = new BoyerMoore(needle.asciiBytes) + def findString(needle: String, haystack: String, skipFindsThatStartInFinds: Boolean = false): Seq[Int] = + find(ByteString(needle), ByteString(haystack), skipFindsThatStartInFinds) + + def find(needle: ByteString, haystack: ByteString, skipFindsThatStartInFinds: Boolean = false): Seq[Int] = { + val boyerMoore = new BoyerMoore(needle.toArray[Byte]) @tailrec def rec(offset: Int, result: Seq[Int]): Seq[Int] = { val ix = - try boyerMoore.nextIndex(ByteString(haystack), offset) + try boyerMoore.nextIndex(haystack, offset) catch { case NotEnoughDataException ⇒ -1 } if (ix >= 0) rec(if (skipFindsThatStartInFinds) ix + needle.length else ix + 1, result :+ ix) else result } rec(0, Seq.empty) } - def findWithRegex(needle: String, haystack: String): Seq[Int] = - needle.r.findAllMatchIn(haystack).map(_.start).toSeq + def findWithRegex(needle: ByteString, haystack: ByteString): Seq[Int] = + Pattern.quote(needle.map(_.toChar).mkString).r.findAllMatchIn(haystack.map(_.toChar).mkString).map(_.start).toSeq }