RebalanceWorker should watch shard regions (#27261)
* RebalanceWorker should watch shard regions. Fixes #27259. The RebalanceWorker actor needs to watch the shard regions from which it expects a BeginHandOffAck message, in case a ShardRegion shuts down before it can receive the BeginHandOff message, which would otherwise prevent the hand-off from completing. This can happen when two nodes are shut down at about the same time.
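The fix is an application of Akka's deathwatch: the worker watches the regions that are already shutting down, so a Terminated notification can stand in for a BeginHandOffAck that will never arrive. Below is a minimal, self-contained sketch of that pattern using plain classic actors; AckProtocol, AckCollector, Ping, and the 10-second timeout are hypothetical stand-ins for illustration, not the real sharding internals (those are in the diff below).

import scala.concurrent.duration._
import akka.actor.{ Actor, ActorLogging, ActorRef, ReceiveTimeout, Terminated }

object AckProtocol {
  case object Ping // stands in for BeginHandOff
  case object Ack  // stands in for BeginHandOffAck
}

// Collects one Ack per peer, but also watches the peers known to be
// shutting down so their Terminated counts as the Ack they may never send.
class AckCollector(peers: Set[ActorRef], shuttingDown: Set[ActorRef]) extends Actor with ActorLogging {
  import AckProtocol._

  require(shuttingDown.subsetOf(peers), "shuttingDown must be a subset of peers")

  // Watch before pinging, so a peer that dies immediately is still noticed.
  shuttingDown.foreach(context.watch)
  peers.foreach(_ ! Ping)
  context.setReceiveTimeout(10.seconds)

  private var remaining = peers

  def receive: Receive = {
    case Ack              => acked(sender())
    case Terminated(peer) => acked(peer)
    case ReceiveTimeout =>
      log.warning("Gave up waiting for {} peer(s)", remaining.size)
      context.stop(self)
  }

  private def acked(peer: ActorRef): Unit = {
    context.unwatch(peer) // no late Terminated after a real Ack
    remaining -= peer
    if (remaining.isEmpty) {
      log.info("All peers accounted for, hand-off can proceed")
      context.stop(self)
    }
  }
}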
parent c1eb0719da
commit bbff92ade6
2 changed files with 34 additions and 11 deletions
@@ -4,6 +4,11 @@ ProblemFilters.exclude[Problem]("akka.cluster.sharding.Shard.*")
 # #25191
 ProblemFilters.exclude[DirectMissingMethodProblem]("akka.cluster.sharding.ShardRegion.retryTask")
 
+# Internal API change https://github.com/akka/akka/pull/27261
+ProblemFilters.exclude[DirectMissingMethodProblem]("akka.cluster.sharding.ShardCoordinator#RebalanceWorker.this")
+ProblemFilters.exclude[DirectMissingMethodProblem]("akka.cluster.sharding.ShardCoordinator.rebalanceWorkerProps")
+
 # #27100 Productionize: GetShardRegionStats returns empty shard set on ask timeout
 # askAllShards, an internal function, was renamed and changed to query all or a subset of shards to try failures only
 ProblemFilters.exclude[DirectMissingMethodProblem]("akka.cluster.sharding.ShardRegion.askAllShards")
@@ -424,9 +424,13 @@ object ShardCoordinator {
       shard: String,
       from: ActorRef,
       handOffTimeout: FiniteDuration,
-      regions: Set[ActorRef])
-      extends Actor {
+      regions: Set[ActorRef],
+      shuttingDownRegions: Set[ActorRef])
+      extends Actor
+      with ActorLogging {
     import Internal._
 
+    shuttingDownRegions.foreach(context.watch)
+
     regions.foreach(_ ! BeginHandOff(shard))
     var remaining = regions
@@ -435,12 +439,22 @@
 
     def receive = {
       case BeginHandOffAck(`shard`) =>
-        remaining -= sender()
+        log.debug("BeginHandOffAck for shard [{}] received from {}.", shard, sender())
+        acked(sender())
+      case Terminated(shardRegion) =>
+        log.debug("ShardRegion {} terminated while waiting for BeginHandOffAck for shard [{}].", shardRegion, shard)
+        acked(shardRegion)
+      case ReceiveTimeout => done(ok = false)
+    }
+
+    private def acked(shardRegion: ActorRef) = {
+      context.unwatch(shardRegion)
+      remaining -= shardRegion
       if (remaining.isEmpty) {
+        log.debug("All shard regions acked, handing off shard [{}].", shard)
         from ! HandOff(shard)
         context.become(stoppingShard, discardOld = true)
       }
-      case ReceiveTimeout => done(ok = false)
     }
 
     def stoppingShard: Receive = {
@@ -458,9 +472,12 @@
       shard: String,
       from: ActorRef,
       handOffTimeout: FiniteDuration,
-      regions: Set[ActorRef]): Props =
-    Props(new RebalanceWorker(shard, from, handOffTimeout, regions))
+      regions: Set[ActorRef],
+      // Note: must be a subset of regions
+      shuttingDownRegions: Set[ActorRef]): Props = {
+    require(shuttingDownRegions.size <= regions.size, "'shuttingDownRegions' must be a subset of 'regions'.")
+    Props(new RebalanceWorker(shard, from, handOffTimeout, regions, shuttingDownRegions))
+  }
   }
 
   /**
@@ -886,7 +903,8 @@ abstract class ShardCoordinator(
               shard,
               rebalanceFromRegion,
               handOffTimeout,
-              state.regions.keySet.union(state.regionProxies)).withDispatcher(context.props.dispatcher))
+              state.regions.keySet.union(state.regionProxies),
+              gracefulShutdownInProgress).withDispatcher(context.props.dispatcher))
           case None =>
             log.debug("Rebalance of non-existing shard [{}] is ignored", shard)
         }
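Driving the earlier sketch with one well-behaved peer and one that stops before acknowledging shows why the watch matters: without it, the collector would sit until its ReceiveTimeout fires; with it, the Terminated message completes the round immediately. PromptPeer, DyingPeer, and HandOffDemo are again hypothetical names, assuming the AckCollector sketch from above is on the classpath.

import akka.actor.{ Actor, ActorSystem, Props }

class PromptPeer extends Actor {
  import AckProtocol._
  def receive: Receive = { case Ping => sender() ! Ack }
}

class DyingPeer extends Actor {
  import AckProtocol._
  def receive: Receive = { case Ping => context.stop(self) } // never acks
}

object HandOffDemo extends App {
  val system = ActorSystem("demo")
  val prompt = system.actorOf(Props(new PromptPeer), "prompt")
  val dying  = system.actorOf(Props(new DyingPeer), "dying")
  // Logs "All peers accounted for" even though `dying` never acked.
  system.actorOf(Props(new AckCollector(Set(prompt, dying), shuttingDown = Set(dying))), "collector")
}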