Harden ShardCoordinator state replication, #28856 (#28895) (#29094)

* Possibility to prefer the oldest nodes in ddata writes and reads
  * enabled for Cluster Sharding
* New ReadMajorityPlus and WriteMajorityPlus consistency levels
  * used by Cluster Sharding, with configuration (see the sketch below)
  * also possible to define ReadAll in config
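
Below is a minimal sketch of how the new consistency levels can be used
directly with the classic Replicator API (illustrative only, not the
ShardCoordinator's actual code; the system name, the counter key and the
additional = 2 values are made up):

    import scala.concurrent.duration._
    import akka.actor.ActorSystem
    import akka.cluster.ddata.{ DistributedData, GCounter, GCounterKey, SelfUniqueAddress }
    import akka.cluster.ddata.Replicator.{ Get, ReadMajorityPlus, Update, WriteMajorityPlus }

    object MajorityPlusSketch extends App {
      // Hypothetical system; a real run needs the cluster actor provider configured.
      val system = ActorSystem("sketch")
      implicit val node: SelfUniqueAddress = DistributedData(system).selfUniqueAddress
      val replicator = DistributedData(system).replicator
      val key = GCounterKey("hits") // made-up key for illustration

      // WriteMajorityPlus(timeout, additional): write to a majority of nodes
      // plus `additional` more (capped at all nodes), reducing the risk that
      // a later majority read misses the latest value.
      replicator ! Update(key, GCounter.empty, WriteMajorityPlus(5.seconds, additional = 2))(_ :+ 1)

      // ReadMajorityPlus is the corresponding read-side level.
      replicator ! Get(key, ReadMajorityPlus(5.seconds, additional = 2))
    }

In Cluster Sharding the additional counts come from the
akka.cluster.sharding.coordinator-state.write-majority-plus and
read-majority-plus settings exercised by the test below; per the note
above, read-majority-plus can also be configured to yield ReadAll.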

(cherry picked from commit 4ba835d328)
Patrik Nordwall 2020-05-20 08:34:22 +02:00 committed by GitHub
parent 9299f387dd
commit 3deffc8de2
16 changed files with 758 additions and 190 deletions

akka-cluster-sharding/src/multi-jvm/scala/akka/cluster/sharding/ClusterShardingLeavingSpec.scala

@@ -7,9 +7,12 @@ package akka.cluster.sharding
 import scala.concurrent.duration._
 
 import akka.actor.{ Actor, ActorRef, Props }
+import akka.cluster.MemberStatus
 import akka.serialization.jackson.CborSerializable
 import akka.testkit._
 import akka.util.ccompat._
 
 @ccompatUsedUntil213
 object ClusterShardingLeavingSpec {
   case class Ping(id: String) extends CborSerializable
@@ -39,11 +42,21 @@ object ClusterShardingLeavingSpec {
   }
 }
 
-abstract class ClusterShardingLeavingSpecConfig(mode: String) extends MultiNodeClusterShardingConfig(mode) {
+abstract class ClusterShardingLeavingSpecConfig(mode: String)
+    extends MultiNodeClusterShardingConfig(
+      mode,
+      loglevel = "INFO",
+      additionalConfig = """
+      akka.cluster.sharding.rebalance-interval = 120 s
+      akka.cluster.sharding.distributed-data.majority-min-cap = 1
+      akka.cluster.sharding.coordinator-state.write-majority-plus = 1
+      akka.cluster.sharding.coordinator-state.read-majority-plus = 1
+      """) {
   val first = role("first")
   val second = role("second")
   val third = role("third")
   val fourth = role("fourth")
+  val fifth = role("fifth")
 }
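
The values above are aggressive on purpose. A back-of-the-envelope sketch
of the assumed quorum arithmetic (majorityPlus here is a hypothetical
helper mirroring the documented majority-plus semantics, not an Akka API):

    // Majority is n/2 + 1, raised by `additional`, floored at `min-cap`,
    // capped at the total number of replicas n (assumed semantics).
    def majorityPlus(n: Int, additional: Int, minCap: Int): Int =
      math.min(n, math.max(n / 2 + 1 + additional, minCap))

    // With this spec's 5 nodes, majority-min-cap = 1 and
    // write/read-majority-plus = 1: writes and reads each involve
    // min(5, 5/2 + 1 + 1) = 4 replicas, so write and read quorums
    // overlap in at least 3 nodes even while 2 nodes are leaving.
    assert(majorityPlus(5, additional = 1, minCap = 1) == 4)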
@@ -60,11 +73,13 @@ class PersistentClusterShardingLeavingMultiJvmNode1 extends PersistentClusterSha
 class PersistentClusterShardingLeavingMultiJvmNode2 extends PersistentClusterShardingLeavingSpec
 class PersistentClusterShardingLeavingMultiJvmNode3 extends PersistentClusterShardingLeavingSpec
 class PersistentClusterShardingLeavingMultiJvmNode4 extends PersistentClusterShardingLeavingSpec
+class PersistentClusterShardingLeavingMultiJvmNode5 extends PersistentClusterShardingLeavingSpec
 
 class DDataClusterShardingLeavingMultiJvmNode1 extends DDataClusterShardingLeavingSpec
 class DDataClusterShardingLeavingMultiJvmNode2 extends DDataClusterShardingLeavingSpec
 class DDataClusterShardingLeavingMultiJvmNode3 extends DDataClusterShardingLeavingSpec
 class DDataClusterShardingLeavingMultiJvmNode4 extends DDataClusterShardingLeavingSpec
+class DDataClusterShardingLeavingMultiJvmNode5 extends DDataClusterShardingLeavingSpec
 
 abstract class ClusterShardingLeavingSpec(multiNodeConfig: ClusterShardingLeavingSpecConfig)
     extends MultiNodeClusterShardingSpec(multiNodeConfig)
@@ -89,9 +104,16 @@ abstract class ClusterShardingLeavingSpec(multiNodeConfig: ClusterShardingLeavin
       startPersistenceIfNeeded(startOn = first, setStoreOn = roles)
 
       join(first, first, onJoinedRunOnFrom = startSharding())
-      join(second, first, onJoinedRunOnFrom = startSharding())
-      join(third, first, onJoinedRunOnFrom = startSharding())
-      join(fourth, first, onJoinedRunOnFrom = startSharding())
+      join(second, first, onJoinedRunOnFrom = startSharding(), assertNodeUp = false)
+      join(third, first, onJoinedRunOnFrom = startSharding(), assertNodeUp = false)
+      join(fourth, first, onJoinedRunOnFrom = startSharding(), assertNodeUp = false)
+      join(fifth, first, onJoinedRunOnFrom = startSharding(), assertNodeUp = false)
+
+      // all Up, everywhere before continuing
+      awaitAssert {
+        cluster.state.members.size should ===(roles.size)
+        cluster.state.members.unsorted.map(_.status) should ===(Set(MemberStatus.Up))
+      }
 
       enterBarrier("after-2")
     }
@@ -105,6 +127,7 @@ abstract class ClusterShardingLeavingSpec(multiNodeConfig: ClusterShardingLeavin
           id -> expectMsgType[ActorRef]
         }).toMap
         shardLocations ! Locations(locations)
+        system.log.debug("Original locations: {}", locations)
       }
       enterBarrier("after-3")
     }
@@ -112,28 +135,36 @@ abstract class ClusterShardingLeavingSpec(multiNodeConfig: ClusterShardingLeavin
     "recover after leaving coordinator node" in {
       system.actorSelection(node(first) / "user" / "shardLocations") ! GetLocations
       val Locations(originalLocations) = expectMsgType[Locations]
-      val firstAddress = node(first).address
-      runOn(third) {
-        cluster.leave(node(first).address)
+      val numberOfNodesLeaving = 2
+      val leavingRoles = roles.take(numberOfNodesLeaving)
+      val leavingNodes = leavingRoles.map(address)
+      val remainingRoles = roles.drop(numberOfNodesLeaving)
+
+      runOn(roles.last) {
+        leavingNodes.foreach { a =>
+          cluster.leave(a)
+        }
       }
 
-      runOn(first) {
+      runOn(leavingRoles: _*) {
         watch(region)
         expectTerminated(region, 15.seconds)
       }
-      enterBarrier("stopped")
+      // more stress by not having the barrier here
 
-      runOn(second, third, fourth) {
+      runOn(remainingRoles: _*) {
         within(15.seconds) {
           awaitAssert {
             val probe = TestProbe()
             originalLocations.foreach {
               case (id, ref) =>
                 region.tell(Ping(id), probe.ref)
-                if (ref.path.address == firstAddress)
-                  probe.expectMsgType[ActorRef](1.second) should not be (ref)
-                else
+                if (leavingNodes.contains(ref.path.address)) {
+                  val newRef = probe.expectMsgType[ActorRef](1.second)
+                  newRef should not be (ref)
+                  system.log.debug("Moved [{}] from [{}] to [{}]", id, ref, newRef)
+                } else
                   probe.expectMsg(1.second, ref) // should not move
             }
           }