rem #19780: Skip acks during connection handoff

* The problem: An ACK that was targeted at an old incarnation of a system was sent to the new, restarted system with the same host:port. This resulted in errors such as "Error encountered while processing system message acknowledgement buffer: [-1 {}] ack: ACK[0, {}]" when restarting an actor system.
* The reason:
  1. The endpoint reader was about to send an OutgoingAck, targeted at the old system, to its parent (the endpoint writer).
  2. At the same time, an incoming connection from the new system triggered a TakeOver in the endpoint writer, i.e. the writer's handle was replaced with the handle to the connection of the new system.
  3. The OutgoingAck was then received by the writer, which happily sent it via the new handle, i.e. to the new system.
* The solution: Ignore OutgoingAck during the handoff (TakeOver) process.
2016-03-21 08:41:11 +01:00 · 2016-03-21 08:41:11 +01:00 · 96b68f6437
commit 96b68f6437
parent ed6acd63ec
2 changed files with 47 additions and 14 deletions
--- a/akka-cluster/src/multi-jvm/scala/akka/cluster/RestartNodeSpec.scala
+++ b/akka-cluster/src/multi-jvm/scala/akka/cluster/RestartNodeSpec.scala
@ -3,21 +3,25 @@
 */
 package akka.cluster

-import language.postfixOps
 import scala.collection.immutable
-import com.typesafe.config.ConfigFactory
-import org.scalatest.BeforeAndAfter
+import scala.concurrent.duration._
+
+import akka.Done
+import akka.actor.Actor
+import akka.actor.ActorIdentity
+import akka.actor.ActorRef
+import akka.actor.ActorSystem
+import akka.actor.Address
+import akka.actor.Deploy
+import akka.actor.Identify
+import akka.actor.Props
+import akka.actor.RootActorPath
+import akka.actor.Terminated
+import akka.cluster.MemberStatus._
 import akka.remote.testkit.MultiNodeConfig
 import akka.remote.testkit.MultiNodeSpec
 import akka.testkit._
-import scala.concurrent.duration._
-import akka.actor.Address
-import akka.actor.ActorSystem
-import akka.actor.Props
-import akka.actor.Actor
-import akka.actor.RootActorPath
-import akka.cluster.MemberStatus._
-import akka.actor.Deploy
+import com.typesafe.config.ConfigFactory

 object RestartNodeMultiJvmSpec extends MultiNodeConfig {
  val first = role("first")
@ -25,8 +29,27 @@ object RestartNodeMultiJvmSpec extends MultiNodeConfig {
  val third = role("third")

  commonConfig(debugConfig(on = false).
-    withFallback(ConfigFactory.parseString("akka.cluster.auto-down-unreachable-after = 5s")).
+    withFallback(ConfigFactory.parseString("""
+      akka.cluster.auto-down-unreachable-after = 5s
+      #akka.remote.use-passive-connections = off
+      """)).
    withFallback(MultiNodeClusterSpec.clusterConfig))
+
+  /**
+   * This was used together with sleep in EndpointReader before deliverAndAck
+   * to reproduce issue with misaligned ACKs when restarting system,
+   * issue #19780
+   */
+  class Watcher(a: Address, replyTo: ActorRef) extends Actor {
+    context.actorSelection(RootActorPath(a) / "user" / "address-receiver") ! Identify(None)
+
+    def receive = {
+      case ActorIdentity(None, Some(ref)) ⇒
+        context.watch(ref)
+        replyTo ! Done
+      case t: Terminated ⇒
+    }
+  }
 }

 class RestartNodeMultiJvmNode1 extends RestartNodeSpec
@ -61,7 +84,7 @@ abstract class RestartNodeSpec
  }

  "Cluster nodes" must {
-    "be able to restart and join again" taggedAs LongRunningTest in within(60 seconds) {
+    "be able to restart and join again" taggedAs LongRunningTest in within(60.seconds) {
      // secondSystem is a separate ActorSystem, to be able to simulate restart
      // we must transfer its address to first
      runOn(first, third) {
@ -80,7 +103,7 @@ abstract class RestartNodeSpec
        secondUniqueAddress = Cluster(secondSystem).selfUniqueAddress
        List(first, third) foreach { r ⇒
          system.actorSelection(RootActorPath(r) / "user" / "address-receiver") ! secondUniqueAddress
-          expectMsg(5 seconds, "ok")
+          expectMsg(5.seconds, "ok")
        }
      }
      enterBarrier("second-address-transfered")
@ -99,6 +122,10 @@ abstract class RestartNodeSpec

      // shutdown secondSystem
      runOn(second) {
+        // send system message just before shutdown, reproducer for issue #19780
+        secondSystem.actorOf(Props(classOf[Watcher], address(first), testActor), "testwatcher")
+        expectMsg(Done)
+
        shutdown(secondSystem, remaining)
      }
      enterBarrier("second-shutdown")