RebalanceWorker should watch shard regions (#27261)

* RebalanceWorker should watch shard regions

Fixes #27259.

The RebalanceWorker actor needs to watch the shard regions that it's
expecting a BeginHandOffAck message from, in case a ShardRegion shuts
down before it can receive the BeginHandOff message and ack it, which
would prevent the hand-off from ever completing. This can happen when
two nodes are shut down at about the same time.
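
The underlying pattern is standard Akka deathwatch: watch every peer you
expect an ack from, and treat Terminated as an implicit ack, since a dead
peer can never reply. A minimal, self-contained sketch of the pattern
(AckOrTerminateWorker and Ack are illustrative names for this example, not
the Akka source; the real change to RebalanceWorker is in the diff below):

    import akka.actor.{ Actor, ActorLogging, ActorRef, ReceiveTimeout, Terminated }
    import scala.concurrent.duration.FiniteDuration

    case object Ack // stand-in for BeginHandOffAck in this illustration

    class AckOrTerminateWorker(peers: Set[ActorRef], timeout: FiniteDuration)
        extends Actor
        with ActorLogging {

      peers.foreach(context.watch)       // deathwatch every peer we expect an Ack from
      context.setReceiveTimeout(timeout) // bound how long we wait overall
      private var remaining: Set[ActorRef] = peers

      def receive: Receive = {
        case Ack              => acked(sender())
        case Terminated(peer) => acked(peer) // a dead peer can never reply; count it as done
        case ReceiveTimeout   => context.stop(self) // gave up: hand-off failed
      }

      private def acked(peer: ActorRef): Unit = {
        context.unwatch(peer)
        remaining -= peer
        if (remaining.isEmpty) context.stop(self) // every peer acked or died; proceed
      }
    }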
James Roper authored on 2019-08-16 00:36:02 +10:00, committed by Patrik Nordwall
parent c1eb0719da
commit bbff92ade6
2 changed files with 34 additions and 11 deletions


@@ -4,6 +4,11 @@ ProblemFilters.exclude[Problem]("akka.cluster.sharding.Shard.*")
 # #25191
 ProblemFilters.exclude[DirectMissingMethodProblem]("akka.cluster.sharding.ShardRegion.retryTask")
 
+# Internal API change https://github.com/akka/akka/pull/27261
+ProblemFilters.exclude[DirectMissingMethodProblem]("akka.cluster.sharding.ShardCoordinator#RebalanceWorker.this")
+ProblemFilters.exclude[DirectMissingMethodProblem]("akka.cluster.sharding.ShardCoordinator.rebalanceWorkerProps")
+
 # #27100 Productionize: GetShardRegionStats returns empty shard set on ask timeout
 # askAllShards, an internal function, was renamed and changed to query all or a subset of shards to try failures only
 ProblemFilters.exclude[DirectMissingMethodProblem]("akka.cluster.sharding.ShardRegion.askAllShards")
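
The file above is a MiMa (Migration Manager) exclusion filter file: the two
new entries suppress binary-compatibility findings for the internal
RebalanceWorker constructor and rebalanceWorkerProps, both of which gained a
shuttingDownRegions parameter in this commit. The same filters could
equivalently be declared directly in an sbt build; a sketch assuming
sbt-mima-plugin, for illustration only:

    // build.sbt sketch (assumes sbt-mima-plugin is enabled)
    import com.typesafe.tools.mima.core._

    mimaBinaryIssueFilters ++= Seq(
      // Internal API change https://github.com/akka/akka/pull/27261
      ProblemFilters.exclude[DirectMissingMethodProblem](
        "akka.cluster.sharding.ShardCoordinator#RebalanceWorker.this"),
      ProblemFilters.exclude[DirectMissingMethodProblem](
        "akka.cluster.sharding.ShardCoordinator.rebalanceWorkerProps")
    )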


@@ -424,9 +424,13 @@ object ShardCoordinator {
       shard: String,
       from: ActorRef,
       handOffTimeout: FiniteDuration,
-      regions: Set[ActorRef])
-      extends Actor {
+      regions: Set[ActorRef],
+      shuttingDownRegions: Set[ActorRef])
+      extends Actor
+      with ActorLogging {
     import Internal._
 
+    shuttingDownRegions.foreach(context.watch)
     regions.foreach(_ ! BeginHandOff(shard))
     var remaining = regions
@@ -435,12 +439,22 @@ object ShardCoordinator {
     def receive = {
       case BeginHandOffAck(`shard`) =>
-        remaining -= sender()
-        if (remaining.isEmpty) {
-          from ! HandOff(shard)
-          context.become(stoppingShard, discardOld = true)
-        }
+        log.debug("BeginHandOffAck for shard [{}] received from {}.", shard, sender())
+        acked(sender())
+      case Terminated(shardRegion) =>
+        log.debug("ShardRegion {} terminated while waiting for BeginHandOffAck for shard [{}].", shardRegion, shard)
+        acked(shardRegion)
       case ReceiveTimeout => done(ok = false)
     }
 
+    private def acked(shardRegion: ActorRef) = {
+      context.unwatch(shardRegion)
+      remaining -= shardRegion
+      if (remaining.isEmpty) {
+        log.debug("All shard regions acked, handing off shard [{}].", shard)
+        from ! HandOff(shard)
+        context.become(stoppingShard, discardOld = true)
+      }
+    }
+
     def stoppingShard: Receive = {
@@ -458,9 +472,12 @@ object ShardCoordinator {
       shard: String,
       from: ActorRef,
       handOffTimeout: FiniteDuration,
-      regions: Set[ActorRef]): Props =
-    Props(new RebalanceWorker(shard, from, handOffTimeout, regions))
+      regions: Set[ActorRef],
+      // Note: must be a subset of regions
+      shuttingDownRegions: Set[ActorRef]): Props = {
+    require(shuttingDownRegions.size <= regions.size, "'shuttingDownRegions' must be a subset of 'regions'.")
+    Props(new RebalanceWorker(shard, from, handOffTimeout, regions, shuttingDownRegions))
+  }
 }
 
 /**
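
Note that the require above only compares set sizes, a cheap sanity check:
it would accept a shuttingDownRegions set that is smaller than regions but
not actually contained in it. A strict variant (hypothetical, not what this
commit does) would use Set#subsetOf:

    // Strict subset check, illustrative alternative only:
    require(shuttingDownRegions.subsetOf(regions),
      "'shuttingDownRegions' must be a subset of 'regions'.")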
@@ -886,7 +903,8 @@ abstract class ShardCoordinator(
               shard,
               rebalanceFromRegion,
               handOffTimeout,
-              state.regions.keySet.union(state.regionProxies)).withDispatcher(context.props.dispatcher))
+              state.regions.keySet.union(state.regionProxies),
+              gracefulShutdownInProgress).withDispatcher(context.props.dispatcher))
         case None =>
           log.debug("Rebalance of non-existing shard [{}] is ignored", shard)
       }
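
To see the Terminated-as-ack behavior end to end, here is a small driver for
the illustrative worker sketched under the commit message above, using
akka-testkit probes as stand-in regions (hypothetical, for demonstration
only; it does not exercise the real RebalanceWorker):

    import akka.actor.{ ActorSystem, PoisonPill, Props }
    import akka.testkit.TestProbe
    import scala.concurrent.duration._

    object AckOrTerminateDemo extends App {
      implicit val system: ActorSystem = ActorSystem("demo")
      val healthy = TestProbe()
      val dying   = TestProbe()

      val worker = system.actorOf(
        Props(new AckOrTerminateWorker(Set(healthy.ref, dying.ref), 10.seconds)))

      dying.ref ! PoisonPill        // this "region" dies before it can ack...
      worker.tell(Ack, healthy.ref) // ...while this one acks normally

      // The worker sees one Ack and one Terminated, so `remaining` empties
      // and it stops instead of waiting out the full hand-off timeout.
    }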