Detect failure when no heartbeats sent, see #2907
* Subscribe to InstantMemberEvent and start heartbeating on InstantMemberUp. The same applies to metrics.
* Add the HeartbeatNodeRing data structure for bidirectional mapping of heartbeat senders and receivers. ConsistentHash is no longer used. Node addresses are hashed to ensure that neighbors are spread out.
* Send a HeartbeatRequest when a receiver detects that it has not received the expected heartbeats.
* Add a new test, InitialHeartbeatSpec, that simulates the problem.
* Add/remove some related conf properties.
* Add some more logging to be able to diagnose potential problems.
* Make the nr-of-end-heartbeats setting explicitly configurable.
This commit is contained in:
parent
c5685a0855
commit
8b4e903e7d
25 changed files with 466 additions and 146 deletions
|
|
@@ -351,7 +351,7 @@ object StressMultiJvmSpec extends MultiNodeConfig {
|
|||
nodes foreach { node ⇒
|
||||
val previous = phiByNode(node)
|
||||
val φ = fd.phi(node)
|
||||
if (φ > 0) {
|
||||
if (φ > 0 || fd.isMonitoring(node)) {
|
||||
val aboveOne = if (!φ.isInfinite && φ > 1.0) 1 else 0
|
||||
phiByNode += node -> PhiValue(node, previous.countAboveOne + aboveOne, previous.count + 1,
|
||||
math.max(previous.max, φ))
|
||||
|
|
@@ -861,7 +861,7 @@ abstract class StressSpec
|
|||
name = "master-" + myself.name)
|
||||
m ! Begin
|
||||
import system.dispatcher
|
||||
system.scheduler.scheduleOnce(highThroughputDuration) {
|
||||
system.scheduler.scheduleOnce(duration) {
|
||||
m.tell(End, testActor)
|
||||
}
|
||||
val workResult = awaitWorkResult
|
||||
|
|
@@ -931,7 +931,7 @@ abstract class StressSpec
|
|||
|
||||
"A cluster under stress" must {
|
||||
|
||||
"join seed nodes" taggedAs LongRunningTest in {
|
||||
"join seed nodes" taggedAs LongRunningTest in within(20 seconds) {
|
||||
|
||||
val otherNodesJoiningSeedNodes = roles.slice(numberOfSeedNodes, numberOfSeedNodes + numberOfNodesJoiningToSeedNodesInitially)
|
||||
val size = seedNodes.size + otherNodesJoiningSeedNodes.size
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue