Change ShardCoordinator update failure logging #30608

This commit is contained in:
Muskan Gupta 2021-10-13 13:36:09 +05:30 committed by GitHub
parent 16ed5b4f64
commit 70120060b8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@@ -1433,6 +1433,7 @@ private[akka] class DDataShardCoordinator(
private var terminating = false private var terminating = false
private var getShardHomeRequests: Set[(ActorRef, GetShardHome)] = Set.empty private var getShardHomeRequests: Set[(ActorRef, GetShardHome)] = Set.empty
private var initialStateRetries = 0 private var initialStateRetries = 0
private var updateStateRetries = 0
private val rememberEntitiesStore = private val rememberEntitiesStore =
rememberEntitiesStoreProvider.map { provider => rememberEntitiesStoreProvider.map { provider =>
@@ -1572,6 +1573,7 @@ private[akka] class DDataShardCoordinator(
afterUpdateCallback: E => Unit): Receive = { afterUpdateCallback: E => Unit): Receive = {
case UpdateSuccess(CoordinatorStateKey, Some(`evt`)) => case UpdateSuccess(CoordinatorStateKey, Some(`evt`)) =>
updateStateRetries = 0
if (!waitingForRememberShard) { if (!waitingForRememberShard) {
log.debug("{}: The coordinator state was successfully updated with {}", typeName, evt) log.debug("{}: The coordinator state was successfully updated with {}", typeName, evt)
if (shardId.isDefined) timers.cancel(RememberEntitiesTimeoutKey) if (shardId.isDefined) timers.cancel(RememberEntitiesTimeoutKey)
@@ -1591,19 +1593,29 @@ private[akka] class DDataShardCoordinator(
} }
case UpdateTimeout(CoordinatorStateKey, Some(`evt`)) => case UpdateTimeout(CoordinatorStateKey, Some(`evt`)) =>
log.error( updateStateRetries += 1
"{}: The ShardCoordinator was unable to update a distributed state within 'updating-state-timeout': {} millis ({}). " +
"Perhaps the ShardRegion has not started on all active nodes yet? event={}", val template = s"$typeName: The ShardCoordinator was unable to update a distributed state within 'updating-state-timeout': ${stateWriteConsistency.timeout.toMillis} millis (${if (terminating) "terminating"
typeName, else "retrying"}). Attempt $updateStateRetries. " +
stateWriteConsistency.timeout.toMillis, s"Perhaps the ShardRegion has not started on all active nodes yet? event=$evt"
if (terminating) "terminating" else "retrying",
evt) if (updateStateRetries < 5) {
log.warning(template)
if (terminating) { if (terminating) {
context.stop(self) context.stop(self)
} else { } else {
// repeat until UpdateSuccess // repeat until UpdateSuccess
sendCoordinatorStateUpdate(evt) sendCoordinatorStateUpdate(evt)
} }
} else {
log.error(template)
if (terminating) {
context.stop(self)
} else {
// repeat until UpdateSuccess
sendCoordinatorStateUpdate(evt)
}
}
case ModifyFailure(key, error, cause, _) => case ModifyFailure(key, error, cause, _) =>
log.error( log.error(