Detect failure when no heartbeats are sent, see #2907

* Subscribe to InstantMemberEvent and start heartbeating when
  InstantMemberUp is received. Same for metrics.
* Add HeartbeatNodeRing data structure for bidirectional mapping
  between heartbeat senders and receivers. ConsistentHash is no longer
  used. Node addresses are hashed to ensure that neighbors are spread out.
* Send HeartbeatRequest when the receiver detects that it has not
  received the expected heartbeats.
* New test InitialHeartbeatSpec that simulates the problem
* Add/remove some related conf properties
* Add some more logging to be able to diagnose potential problems
* Explicit config of nr-of-end-heartbeats
This commit is contained in:
Patrik Nordwall 2013-01-15 09:35:07 +01:00
parent c5685a0855
commit 8b4e903e7d
25 changed files with 466 additions and 146 deletions

View file

@@ -351,7 +351,7 @@ object StressMultiJvmSpec extends MultiNodeConfig {
nodes foreach { node
val previous = phiByNode(node)
val φ = fd.phi(node)
if (φ > 0) {
if (φ > 0 || fd.isMonitoring(node)) {
val aboveOne = if (!φ.isInfinite && φ > 1.0) 1 else 0
phiByNode += node -> PhiValue(node, previous.countAboveOne + aboveOne, previous.count + 1,
math.max(previous.max, φ))
@@ -861,7 +861,7 @@ abstract class StressSpec
name = "master-" + myself.name)
m ! Begin
import system.dispatcher
system.scheduler.scheduleOnce(highThroughputDuration) {
system.scheduler.scheduleOnce(duration) {
m.tell(End, testActor)
}
val workResult = awaitWorkResult
@@ -931,7 +931,7 @@ abstract class StressSpec
"A cluster under stress" must {
"join seed nodes" taggedAs LongRunningTest in {
"join seed nodes" taggedAs LongRunningTest in within(20 seconds) {
val otherNodesJoiningSeedNodes = roles.slice(numberOfSeedNodes, numberOfSeedNodes + numberOfNodesJoiningToSeedNodesInitially)
val size = seedNodes.size + otherNodesJoiningSeedNodes.size