RebalanceWorker should watch shard regions (#27261)

* RebalanceWorker should watch shard regions

Fixes #27259.

The RebalanceWorker actor needs to watch the shard regions that it's
expecting a BeginHandOffAck message from, in case a ShardRegion shuts
down before it can receive the BeginHandOff message and ack it, which
would prevent the hand-off from ever completing. This can happen when
two nodes are shut down at about the same time.
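
The underlying pattern is standard Akka deathwatch: watch every peer you
expect an ack from, and treat Terminated as an implicit ack, since a dead
peer can never reply. A minimal, self-contained sketch of the pattern
(AckOrTerminateWorker and Ack are illustrative names for this example, not
the Akka source; the real change to RebalanceWorker is in the diff below):

    import akka.actor.{ Actor, ActorLogging, ActorRef, ReceiveTimeout, Terminated }
    import scala.concurrent.duration.FiniteDuration

    case object Ack // stand-in for BeginHandOffAck in this illustration

    class AckOrTerminateWorker(peers: Set[ActorRef], timeout: FiniteDuration)
        extends Actor
        with ActorLogging {

      peers.foreach(context.watch)       // deathwatch every peer we expect an Ack from
      context.setReceiveTimeout(timeout) // bound how long we wait overall
      private var remaining: Set[ActorRef] = peers

      def receive: Receive = {
        case Ack              => acked(sender())
        case Terminated(peer) => acked(peer) // a dead peer can never reply; count it as done
        case ReceiveTimeout   => context.stop(self) // gave up: hand-off failed
      }

      private def acked(peer: ActorRef): Unit = {
        context.unwatch(peer)
        remaining -= peer
        if (remaining.isEmpty) context.stop(self) // every peer acked or died; proceed
      }
    }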
James Roper authored on 2019-08-16 00:36:02 +10:00, committed by Patrik Nordwall
parent c1eb0719da
commit bbff92ade6
2 changed files with 34 additions and 11 deletions


@@ -4,6 +4,11 @@ ProblemFilters.exclude[Problem]("akka.cluster.sharding.Shard.*")
 # #25191
 ProblemFilters.exclude[DirectMissingMethodProblem]("akka.cluster.sharding.ShardRegion.retryTask")
 
+# Internal API change https://github.com/akka/akka/pull/27261
+ProblemFilters.exclude[DirectMissingMethodProblem]("akka.cluster.sharding.ShardCoordinator#RebalanceWorker.this")
+ProblemFilters.exclude[DirectMissingMethodProblem]("akka.cluster.sharding.ShardCoordinator.rebalanceWorkerProps")
+
 # #27100 Productionize: GetShardRegionStats returns empty shard set on ask timeout
 # askAllShards, an internal function, was renamed and changed to query all or a subset of shards to try failures only
 ProblemFilters.exclude[DirectMissingMethodProblem]("akka.cluster.sharding.ShardRegion.askAllShards")
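
The file above is a MiMa (Migration Manager) exclusion filter file: the two
new entries suppress binary-compatibility findings for the internal
RebalanceWorker constructor and rebalanceWorkerProps, both of which gained a
shuttingDownRegions parameter in this commit. The same filters could
equivalently be declared directly in an sbt build; a sketch assuming
sbt-mima-plugin, for illustration only:

    // build.sbt sketch (assumes sbt-mima-plugin is enabled)
    import com.typesafe.tools.mima.core._

    mimaBinaryIssueFilters ++= Seq(
      // Internal API change https://github.com/akka/akka/pull/27261
      ProblemFilters.exclude[DirectMissingMethodProblem](
        "akka.cluster.sharding.ShardCoordinator#RebalanceWorker.this"),
      ProblemFilters.exclude[DirectMissingMethodProblem](
        "akka.cluster.sharding.ShardCoordinator.rebalanceWorkerProps")
    )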


@@ -424,9 +424,13 @@ object ShardCoordinator {
       shard: String,
       from: ActorRef,
       handOffTimeout: FiniteDuration,
-      regions: Set[ActorRef])
-      extends Actor {
+      regions: Set[ActorRef],
+      shuttingDownRegions: Set[ActorRef])
+      extends Actor
+      with ActorLogging {
     import Internal._
 
+    shuttingDownRegions.foreach(context.watch)
     regions.foreach(_ ! BeginHandOff(shard))
     var remaining = regions
@@ -435,12 +439,22 @@ object ShardCoordinator {
     def receive = {
       case BeginHandOffAck(`shard`) =>
-        remaining -= sender()
-        if (remaining.isEmpty) {
-          from ! HandOff(shard)
-          context.become(stoppingShard, discardOld = true)
-        }
+        log.debug("BeginHandOffAck for shard [{}] received from {}.", shard, sender())
+        acked(sender())
+      case Terminated(shardRegion) =>
+        log.debug("ShardRegion {} terminated while waiting for BeginHandOffAck for shard [{}].", shardRegion, shard)
+        acked(shardRegion)
       case ReceiveTimeout => done(ok = false)
     }
 
+    private def acked(shardRegion: ActorRef) = {
+      context.unwatch(shardRegion)
+      remaining -= shardRegion
+      if (remaining.isEmpty) {
+        log.debug("All shard regions acked, handing off shard [{}].", shard)
+        from ! HandOff(shard)
+        context.become(stoppingShard, discardOld = true)
+      }
+    }
+
     def stoppingShard: Receive = {
@@ -458,9 +472,12 @@ object ShardCoordinator {
       shard: String,
       from: ActorRef,
       handOffTimeout: FiniteDuration,
-      regions: Set[ActorRef]): Props =
-    Props(new RebalanceWorker(shard, from, handOffTimeout, regions))
+      regions: Set[ActorRef],
+      // Note: must be a subset of regions
+      shuttingDownRegions: Set[ActorRef]): Props = {
+    require(shuttingDownRegions.size <= regions.size, "'shuttingDownRegions' must be a subset of 'regions'.")
+    Props(new RebalanceWorker(shard, from, handOffTimeout, regions, shuttingDownRegions))
+  }
 }
 
 /**
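
Note that the require above only compares set sizes, a cheap sanity check:
it would accept a shuttingDownRegions set that is smaller than regions but
not actually contained in it. A strict variant (hypothetical, not what this
commit does) would use Set#subsetOf:

    // Strict subset check, illustrative alternative only:
    require(shuttingDownRegions.subsetOf(regions),
      "'shuttingDownRegions' must be a subset of 'regions'.")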
@@ -886,7 +903,8 @@ abstract class ShardCoordinator(
               shard,
               rebalanceFromRegion,
               handOffTimeout,
-              state.regions.keySet.union(state.regionProxies)).withDispatcher(context.props.dispatcher))
+              state.regions.keySet.union(state.regionProxies),
+              gracefulShutdownInProgress).withDispatcher(context.props.dispatcher))
         case None =>
           log.debug("Rebalance of non-existing shard [{}] is ignored", shard)
       }
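
To see the Terminated-as-ack behavior end to end, here is a small driver for
the illustrative worker sketched under the commit message above, using
akka-testkit probes as stand-in regions (hypothetical, for demonstration
only; it does not exercise the real RebalanceWorker):

    import akka.actor.{ ActorSystem, PoisonPill, Props }
    import akka.testkit.TestProbe
    import scala.concurrent.duration._

    object AckOrTerminateDemo extends App {
      implicit val system: ActorSystem = ActorSystem("demo")
      val healthy = TestProbe()
      val dying   = TestProbe()

      val worker = system.actorOf(
        Props(new AckOrTerminateWorker(Set(healthy.ref, dying.ref), 10.seconds)))

      dying.ref ! PoisonPill        // this "region" dies before it can ack...
      worker.tell(Ack, healthy.ref) // ...while this one acks normally

      // The worker sees one Ack and one Terminated, so `remaining` empties
      // and it stops instead of waiting out the full hand-off timeout.
    }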