simplify the SBR instability check (#29625)

This commit is contained in:
Patrik Nordwall 2020-09-21 16:34:21 +02:00 committed by GitHub
parent aa4a0dbcbb
commit b28d77b316
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 66 additions and 71 deletions

View file

@ -0,0 +1,2 @@
# internal API change: return type of private[sbr] setReachability changed from Boolean to Unit
ProblemFilters.exclude[IncompatibleResultTypeProblem]("akka.cluster.sbr.DowningStrategy.setReachability")

View file

@ -205,26 +205,12 @@ import akka.coordination.lease.scaladsl.Lease
_allMembers.exists(m => m.uniqueAddress == node && m.dataCenter == selfDc) _allMembers.exists(m => m.uniqueAddress == node && m.dataCenter == selfDc)
} }
/** private[sbr] def setReachability(r: Reachability): Unit = {
* @return true if it was changed
*/
private[sbr] def setReachability(r: Reachability): Boolean = {
// skip records with Reachability.Reachable, and skip records related to other DC // skip records with Reachability.Reachable, and skip records related to other DC
val newReachability = r.filterRecords( _reachability = r.filterRecords(
record => record =>
(record.status == Reachability.Unreachable || record.status == Reachability.Terminated) && (record.status == Reachability.Unreachable || record.status == Reachability.Terminated) &&
isInSelfDc(record.observer) && isInSelfDc(record.subject)) isInSelfDc(record.observer) && isInSelfDc(record.subject))
val oldReachability = _reachability
val changed =
if (oldReachability.records.size != newReachability.records.size)
true
else
oldReachability.records.map(r => r.observer -> r.subject).toSet !=
newReachability.records.map(r => r.observer -> r.subject).toSet
_reachability = newReachability
changed
} }
def seenBy: Set[Address] = def seenBy: Set[Address] =

View file

@ -294,16 +294,17 @@ import akka.pattern.pipe
val durationSinceLatestChange = (now - reachabilityChangedStats.latestChangeTimestamp).nanos val durationSinceLatestChange = (now - reachabilityChangedStats.latestChangeTimestamp).nanos
val durationSinceFirstChange = (now - reachabilityChangedStats.firstChangeTimestamp).nanos val durationSinceFirstChange = (now - reachabilityChangedStats.firstChangeTimestamp).nanos
if (durationSinceLatestChange > (stableAfter * 2)) { val downAllWhenUnstableEnabled = downAllWhenUnstable > Duration.Zero
log.debug("SBR no reachability changes within {} ms, resetting stats", (stableAfter * 2).toMillis) if (downAllWhenUnstableEnabled && durationSinceFirstChange > (stableAfter + downAllWhenUnstable)) {
resetReachabilityChangedStats()
} else if (downAllWhenUnstable > Duration.Zero &&
durationSinceFirstChange > (stableAfter + downAllWhenUnstable)) {
log.warning( log.warning(
ClusterLogMarker.sbrInstability, ClusterLogMarker.sbrInstability,
"SBR detected instability and will down all nodes: {}", "SBR detected instability and will down all nodes: {}",
reachabilityChangedStats) reachabilityChangedStats)
actOnDecision(DownAll) actOnDecision(DownAll)
} else if (!downAllWhenUnstableEnabled && durationSinceLatestChange > (stableAfter * 2)) {
// downAllWhenUnstable is disabled, but still reset the stats so that what gets logged stays meaningful
log.debug("SBR no reachability changes within {} ms, resetting stats", (stableAfter * 2).toMillis)
resetReachabilityChangedStats()
} }
} }
@ -470,7 +471,10 @@ import akka.pattern.pipe
log.debug("SBR unreachableMember [{}]", m) log.debug("SBR unreachableMember [{}]", m)
mutateMemberInfo(resetStable = true) { () => mutateMemberInfo(resetStable = true) { () =>
strategy.addUnreachable(m) strategy.addUnreachable(m)
updateReachabilityChangedStats()
resetReachabilityChangedStatsIfAllUnreachableDowned() resetReachabilityChangedStatsIfAllUnreachableDowned()
if (!reachabilityChangedStats.isEmpty)
log.debug("SBR noticed {}", reachabilityChangedStats)
} }
} }
} }
@ -480,19 +484,16 @@ import akka.pattern.pipe
log.debug("SBR reachableMember [{}]", m) log.debug("SBR reachableMember [{}]", m)
mutateMemberInfo(resetStable = true) { () => mutateMemberInfo(resetStable = true) { () =>
strategy.addReachable(m) strategy.addReachable(m)
updateReachabilityChangedStats()
resetReachabilityChangedStatsIfAllUnreachableDowned() resetReachabilityChangedStatsIfAllUnreachableDowned()
if (!reachabilityChangedStats.isEmpty)
log.debug("SBR noticed {}", reachabilityChangedStats)
} }
} }
} }
private[sbr] def reachabilityChanged(r: Reachability): Unit = { private[sbr] def reachabilityChanged(r: Reachability): Unit = {
if (strategy.setReachability(r)) { strategy.setReachability(r)
// resetStableDeadline is done from unreachableMember/reachableMember
updateReachabilityChangedStats()
// it may also change when members are removed and therefore the reset may be needed
resetReachabilityChangedStatsIfAllUnreachableDowned()
log.debug("SBR noticed {}", reachabilityChangedStats)
}
} }
private def updateReachabilityChangedStats(): Unit = { private def updateReachabilityChangedStats(): Unit = {

View file

@ -82,6 +82,12 @@ object SplitBrainResolverSpec {
} else if (!leader) } else if (!leader)
probe ! "down must only be done by leader" probe ! "down must only be done by leader"
} }
// Filters out redundant reachability events before delegating to the regular behaviour:
// an UnreachableMember for a node the strategy already regards as unreachable, and a
// ReachableMember for a node the strategy does not regard as unreachable, are both
// swallowed here. Every other message falls through to super.receive.
// NOTE(review): presumably needed so that tests which signal the same reachability change
// more than once do not have it counted twice in the reachability-changed stats - confirm.
override def receive: Receive =
({
case UnreachableMember(m) if strategy.unreachable(m.uniqueAddress) => // already unreachable
case ReachableMember(m) if !strategy.unreachable(m.uniqueAddress) => // already reachable
}: Receive).orElse(super.receive)
} }
} }
@ -1464,7 +1470,7 @@ class SplitBrainResolverSpec
stop() stop()
} }
"down all when unstable" in new SetupKeepMajority( "down all when unstable, scenario 1" in new SetupKeepMajority(
stableAfter = 2.seconds, stableAfter = 2.seconds,
downAllWhenUnstable = 1.second, downAllWhenUnstable = 1.second,
selfUniqueAddress = memberA.uniqueAddress, selfUniqueAddress = memberA.uniqueAddress,
@ -1494,6 +1500,48 @@ class SplitBrainResolverSpec
expectDownCalled(memberA, memberB, memberC, memberD, memberE) expectDownCalled(memberA, memberB, memberC, memberD, memberE)
} }
// Scenario 2: the set of unreachability observations keeps changing between ticks
// (observers of D come and go, and D is even reported reachable once), so the cluster
// never settles. With stableAfter = 2 seconds and downAllWhenUnstable = 500 millis the
// test expects no decision on the early ticks and a "down all" of A-E on the last tick.
// Every check is triggered by an explicit tick() call; tickInterval is set to 100 seconds,
// presumably so that no scheduled tick interferes - TODO confirm.
// The "ms has elapsed" notes below are approximate running totals of the waits in this test.
"down all when unstable, scenario 2" in new SetupKeepMajority(
stableAfter = 2.seconds,
downAllWhenUnstable = 500.millis,
selfUniqueAddress = memberA.uniqueAddress,
role = None,
tickInterval = 100.seconds) {
memberUp(memberA, memberB, memberC, memberD, memberE)
leader(memberA)
// E and D are unreachable
reachabilityChanged(memberA -> memberE, memberB -> memberD, memberC -> memberD)
tick()
expectNoDecision(100.millis)
Thread.sleep(500)
// E and D are still unreachable
// (C's observation of D is no longer part of the records)
reachabilityChanged(memberA -> memberE, memberB -> memberD)
tick()
expectNoDecision(100.millis)
// 600 ms has elapsed
Thread.sleep(500)
// only E remains in the records; D is reported reachable again
reachabilityChanged(memberA -> memberE)
reachable(memberD) // reset stableDeadline
tick()
expectNoDecision(100.millis)
// 1200 ms has elapsed
Thread.sleep(500)
// E and D are unreachable, reset stableDeadline
reachabilityChanged(memberA -> memberE, memberB -> memberD, memberC -> memberD)
tick()
expectNoDecision(100.millis)
// 1800 ms has elapsed
Thread.sleep(1000)
// E and D are still unreachable
reachabilityChanged(memberA -> memberE, memberB -> memberD)
tick()
// 2800 ms has elapsed and still no stability so downing all
expectDownCalled(memberA, memberB, memberC, memberD, memberE)
}
"not down all when becoming stable again" in new SetupKeepMajority( "not down all when becoming stable again" in new SetupKeepMajority(
stableAfter = 2.seconds, stableAfter = 2.seconds,
downAllWhenUnstable = 1.second, downAllWhenUnstable = 1.second,
@ -1594,46 +1642,4 @@ class SplitBrainResolverSpec
} }
} }
"Reachability changes" must {
val strategy = new KeepMajority(defaultDataCenter, None)
strategy.add(memberA)
strategy.add(memberB)
strategy.add(memberC)
val memberDInOtherDC = dcMember("otherDC", memberD)
val memberEInOtherDC = dcMember("otherDC", memberE)
"be noticed when records added" in {
strategy.setReachability(createReachability(List(memberA -> memberB)))
strategy.setReachability(createReachability(List(memberA -> memberB, memberA -> memberC))) should ===(true)
}
"be noticed when records removed" in {
strategy.setReachability(createReachability(List(memberA -> memberB, memberA -> memberC)))
strategy.setReachability(createReachability(List(memberA -> memberB))) should ===(true)
strategy.setReachability(Reachability.empty) should ===(true)
}
"be noticed when records change to Reachable" in {
val r = createReachability(List(memberA -> memberB, memberA -> memberC))
strategy.setReachability(r)
strategy.setReachability(r.reachable(memberA.uniqueAddress, memberC.uniqueAddress)) should ===(true)
}
"be noticed when records added and removed" in {
strategy.setReachability(createReachability(List(memberA -> memberB)))
strategy.setReachability(createReachability(List(memberC -> memberB))) should ===(true)
}
"be ignored when records for other DC added" in {
strategy.setReachability(createReachability(List(memberA -> memberB)))
strategy.setReachability(createReachability(List(memberA -> memberB, memberA -> memberDInOtherDC))) should ===(
false)
strategy.setReachability(createReachability(List(memberA -> memberB, memberDInOtherDC -> memberB))) should ===(
false)
strategy.setReachability(createReachability(List(memberA -> memberB, memberDInOtherDC -> memberEInOtherDC))) should ===(
false)
}
}
} }