/**
 * Copyright (C) 2009-2018 Lightbend Inc. <https://www.lightbend.com>
 */
|
2018-04-24 16:03:55 +01:00
|
|
|
|
2014-11-10 15:12:14 +01:00
|
|
|
package akka.cluster
|
|
|
|
|
|
|
|
|
|
import scala.collection.immutable
|
|
|
|
|
import scala.concurrent.duration._
|
2016-03-21 08:41:11 +01:00
|
|
|
|
|
|
|
|
import akka.Done
|
|
|
|
|
import akka.actor.Actor
|
|
|
|
|
import akka.actor.ActorIdentity
|
|
|
|
|
import akka.actor.ActorRef
|
2014-11-10 15:12:14 +01:00
|
|
|
import akka.actor.ActorSystem
|
2016-03-21 08:41:11 +01:00
|
|
|
import akka.actor.Address
|
|
|
|
|
import akka.actor.Deploy
|
|
|
|
|
import akka.actor.Identify
|
2014-11-10 15:12:14 +01:00
|
|
|
import akka.actor.Props
|
|
|
|
|
import akka.actor.RootActorPath
|
2016-03-21 08:41:11 +01:00
|
|
|
import akka.actor.Terminated
|
2014-11-10 15:12:14 +01:00
|
|
|
import akka.cluster.MemberStatus._
|
2016-03-21 08:41:11 +01:00
|
|
|
import akka.remote.testkit.MultiNodeConfig
|
|
|
|
|
import akka.remote.testkit.MultiNodeSpec
|
|
|
|
|
import akka.testkit._
|
|
|
|
|
import com.typesafe.config.ConfigFactory
|
2014-11-10 15:12:14 +01:00
|
|
|
|
|
|
|
|
/**
 * Multi-node test configuration for the restart-and-rejoin scenario.
 * NOTE: role declaration order matters — it maps roles to MultiJvm node numbers.
 */
object RestartNodeMultiJvmSpec extends MultiNodeConfig {
  val first = role("first")
  val second = role("second")
  val third = role("third")

  // Auto-downing is enabled so the restarted node's old incarnation is removed
  // automatically; weakly-up is disabled so membership assertions only see Up members.
  commonConfig(debugConfig(on = false).
    withFallback(ConfigFactory.parseString("""
      akka.cluster.auto-down-unreachable-after = 5s
      akka.cluster.allow-weakly-up-members = off
      #akka.remote.use-passive-connections = off
      """)).
    withFallback(MultiNodeClusterSpec.clusterConfig))

  /**
   * This was used together with sleep in EndpointReader before deliverAndAck
   * to reproduce issue with misaligned ACKs when restarting system,
   * issue #19780
   */
  class Watcher(a: Address, replyTo: ActorRef) extends Actor {
    // Identify the remote "address-receiver" actor so it can be death-watched;
    // the watch itself generates the system-message traffic the reproducer needs.
    context.actorSelection(RootActorPath(a) / "user" / "address-receiver") ! Identify(None)

    def receive = {
      case ActorIdentity(None, Some(ref)) ⇒
        context.watch(ref)
        replyTo ! Done // signal the test that the watch is established
      case t: Terminated ⇒ // deliberately ignored; only the watch registration matters
    }
  }
}
|
|
|
|
|
|
|
|
|
|
// One concrete class per cluster node; the sbt-multi-jvm plugin launches a
// separate JVM for each *MultiJvmNodeN class, all running the same spec.
class RestartNodeMultiJvmNode1 extends RestartNodeSpec
class RestartNodeMultiJvmNode2 extends RestartNodeSpec
class RestartNodeMultiJvmNode3 extends RestartNodeSpec
|
|
|
|
|
|
|
|
|
|
/**
 * Verifies that a cluster node can be shut down and immediately restarted on the
 * same host/port (a new incarnation with a different UID) and successfully join
 * the cluster again, with the other members replacing the old incarnation.
 */
abstract class RestartNodeSpec
  extends MultiNodeSpec(RestartNodeMultiJvmSpec)
  with MultiNodeClusterSpec with ImplicitSender {

  import RestartNodeMultiJvmSpec._

  // Written by the address-receiver actor (first/third) or directly (second),
  // read later from the test thread — hence @volatile.
  @volatile var secondUniqueAddress: UniqueAddress = _

  // use a separate ActorSystem, to be able to simulate restart
  lazy val secondSystem = ActorSystem(system.name, system.settings.config)

  // Seed list deliberately contains second's address so the restarted incarnation
  // can be contacted/joined at the same address.
  def seedNodes: immutable.IndexedSeq[Address] = Vector(first, secondUniqueAddress.address, third)

  // The "restarted" incarnation: a fresh ActorSystem pinned to the exact port the
  // old secondSystem used (both classic netty and artery settings are set so the
  // test works with either transport). Lazy so the port is only read after
  // secondUniqueAddress has been assigned.
  lazy val restartedSecondSystem = ActorSystem(
    system.name,
    ConfigFactory.parseString(s"""
      akka.remote.netty.tcp.port = ${secondUniqueAddress.address.port.get}
      akka.remote.artery.canonical.port = ${secondUniqueAddress.address.port.get}
      """).withFallback(system.settings.config))

  override def afterAll(): Unit = {
    runOn(second) {
      // Shut down whichever incarnation is still alive on the second node.
      if (secondSystem.whenTerminated.isCompleted)
        shutdown(restartedSecondSystem)
      else
        shutdown(secondSystem)
    }
    super.afterAll()
  }

  "Cluster nodes" must {
    "be able to restart and join again" taggedAs LongRunningTest in within(60.seconds) {
      // secondSystem is a separate ActorSystem, to be able to simulate restart
      // we must transfer its address to first
      runOn(first, third) {
        system.actorOf(Props(new Actor {
          def receive = {
            case a: UniqueAddress ⇒
              secondUniqueAddress = a
              sender() ! "ok"
          }
        }).withDeploy(Deploy.local), name = "address-receiver")
        enterBarrier("second-address-receiver-ready")
      }

      runOn(second) {
        enterBarrier("second-address-receiver-ready")
        secondUniqueAddress = Cluster(secondSystem).selfUniqueAddress
        // Push second's unique address to the receivers on first and third,
        // waiting for each ack so the barrier below is only reached when done.
        List(first, third) foreach { r ⇒
          system.actorSelection(RootActorPath(r) / "user" / "address-receiver") ! secondUniqueAddress
          expectMsg(5.seconds, "ok")
        }
      }
      enterBarrier("second-address-transferred")

      // now we can join first, secondSystem, third together
      runOn(first, third) {
        cluster.joinSeedNodes(seedNodes)
        awaitMembersUp(3)
      }
      runOn(second) {
        // Note: the cluster node on this JVM is secondSystem, not the test's own system.
        Cluster(secondSystem).joinSeedNodes(seedNodes)
        awaitAssert(Cluster(secondSystem).readView.members.size should ===(3))
        awaitAssert(Cluster(secondSystem).readView.members.map(_.status) should ===(Set(Up)))
      }
      enterBarrier("started")

      // shutdown secondSystem
      runOn(second) {
        // send system message just before shutdown, reproducer for issue #19780
        secondSystem.actorOf(Props(classOf[Watcher], address(first), testActor), "testwatcher")
        expectMsg(Done)

        shutdown(secondSystem, remaining)
      }
      enterBarrier("second-shutdown")

      // then immediately start restartedSecondSystem, which has the same address as secondSystem
      runOn(second) {
        Cluster(restartedSecondSystem).joinSeedNodes(seedNodes)
        awaitAssert(Cluster(restartedSecondSystem).readView.members.size should ===(3))
        awaitAssert(Cluster(restartedSecondSystem).readView.members.map(_.status) should ===(Set(Up)))
      }
      runOn(first, third) {
        awaitAssert {
          Cluster(system).readView.members.size should ===(3)
          // Same address as before, but a different UID: the new incarnation
          // replaced the old one rather than being treated as the same member.
          Cluster(system).readView.members.exists { m ⇒
            m.address == secondUniqueAddress.address && m.uniqueAddress.longUid != secondUniqueAddress.longUid
          }
        }
      }
      enterBarrier("second-restarted")
    }
  }
}
|