From a6737b5e4202e9739234afe31e20d67d2661a300 Mon Sep 17 00:00:00 2001 From: Patrik Nordwall Date: Mon, 24 Sep 2018 14:05:13 +0200 Subject: [PATCH] Don't automatically down quarantined node, #25632 --- .../scala/akka/cluster/ClusterDaemon.scala | 4 +-- .../SurviveNetworkInstabilitySpec.scala | 27 ++++++++++++++++--- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/akka-cluster/src/main/scala/akka/cluster/ClusterDaemon.scala b/akka-cluster/src/main/scala/akka/cluster/ClusterDaemon.scala index f5dcc20620..62b41d5056 100644 --- a/akka-cluster/src/main/scala/akka/cluster/ClusterDaemon.scala +++ b/akka-cluster/src/main/scala/akka/cluster/ClusterDaemon.scala @@ -855,10 +855,10 @@ private[cluster] class ClusterCoreDaemon(publisher: ActorRef, joinConfigCompatCh val newGossip = localGossip copy (overview = newOverview) updateLatestGossip(newGossip) log.warning( - "Cluster Node [{}] - Marking node as TERMINATED [{}], due to quarantine. Node roles [{}]", + "Cluster Node [{}] - Marking node as TERMINATED [{}], due to quarantine. Node roles [{}]. 
" + + "It must still be marked as down before it's removed.", selfAddress, node.address, selfRoles.mkString(",")) publishMembershipState() - downing(node.address) } } diff --git a/akka-cluster/src/multi-jvm/scala/akka/cluster/SurviveNetworkInstabilitySpec.scala b/akka-cluster/src/multi-jvm/scala/akka/cluster/SurviveNetworkInstabilitySpec.scala index 68b5e28c48..0908f2f808 100644 --- a/akka-cluster/src/multi-jvm/scala/akka/cluster/SurviveNetworkInstabilitySpec.scala +++ b/akka-cluster/src/multi-jvm/scala/akka/cluster/SurviveNetworkInstabilitySpec.scala @@ -256,7 +256,7 @@ abstract class SurviveNetworkInstabilitySpec assertCanTalk((joining ++ others :+ first): _*) } - "down and remove quarantined node" taggedAs LongRunningTest in within(60.seconds) { + "mark quarantined node with reachability status Terminated" taggedAs LongRunningTest in within(60.seconds) { val others = Vector(first, third, fourth, fifth, sixth, seventh) runOn(third) { @@ -295,9 +295,30 @@ abstract class SurviveNetworkInstabilitySpec enterBarrier("quarantined") runOn(others: _*) { - // second should be removed because of quarantine - awaitAssert(clusterView.members.map(_.address) should not contain (address(second))) + // second should not be downed automatically, see issue #25632 + Thread.sleep(2000) + val secondUniqueAddress = cluster.state.members.find(_.address == address(second)) match { + case None ⇒ fail("Unexpected removal of quarantined node") + case Some(m) ⇒ + m.status should ===(MemberStatus.Up) // not Down + m.uniqueAddress + } + + // second should be marked with reachability status Terminated because of quarantine + awaitAssert(clusterView.reachability.status(secondUniqueAddress) should ===(Reachability.Terminated)) } + enterBarrier("reachability-terminated") + + runOn(fourth) { + cluster.down(address(second)) + } + runOn(others: _*) { + // second should be removed after being downed explicitly + awaitAssert(clusterView.members.map(_.address) should not contain address(second)) + // and also removed from
reachability table + awaitAssert(clusterView.reachability.allUnreachableOrTerminated should ===(Set.empty)) + } + enterBarrier("removed-after-down") enterBarrier("after-6") assertCanTalk(others: _*)