Merge pull request #722 from akka/wip-2251-doc-cluster-fd-patriknw

Describe cluster failure detector and phi, see #2251
2012-09-21 07:45:10 -07:00 · 2012-09-21 07:45:10 -07:00 · 4ff414bd3f
commit 4ff414bd3f
parent e7a10886fb f6f81d1fd0
7 changed files with 65 additions and 14 deletions
--- a/akka-cluster/src/main/resources/reference.conf
+++ b/akka-cluster/src/main/resources/reference.conf
@ -36,9 +36,6 @@ akka {
    # how often should the node send out gossip information?
    gossip-interval = 1s
    # how often should the node send out heartbeats?
    heartbeat-interval = 1s
    # how often should the leader perform maintenance tasks?
    leader-actions-interval = 1s
@ -76,6 +73,9 @@ akka {
      # akka.cluster.ClusterSettings parameters
      implementation-class = "akka.cluster.AccrualFailureDetector"
      # how often should the node send out heartbeats?
      heartbeat-interval = 1s
      # defines the failure detector threshold
      #     A low threshold is prone to generate many wrong suspicions but ensures
      #     a quick detection in the event of a real crash. Conversely, a high
--- a/akka-cluster/src/main/scala/akka/cluster/ClusterSettings.scala
+++ b/akka-cluster/src/main/scala/akka/cluster/ClusterSettings.scala
@ -23,6 +23,7 @@ class ClusterSettings(val config: Config, val systemName: String) {
    Duration(getMilliseconds("akka.cluster.failure-detector.min-std-deviation"), MILLISECONDS)
  final val FailureDetectorAcceptableHeartbeatPause: FiniteDuration =
    Duration(getMilliseconds("akka.cluster.failure-detector.acceptable-heartbeat-pause"), MILLISECONDS)
  final val HeartbeatInterval: FiniteDuration = Duration(getMilliseconds("akka.cluster.failure-detector.heartbeat-interval"), MILLISECONDS)
  final val SeedNodes: IndexedSeq[Address] = getStringList("akka.cluster.seed-nodes").asScala.map {
    case AddressFromURIString(addr) ⇒ addr
@ -30,7 +31,6 @@ class ClusterSettings(val config: Config, val systemName: String) {
  final val SeedNodeTimeout: FiniteDuration = Duration(getMilliseconds("akka.cluster.seed-node-timeout"), MILLISECONDS)
  final val PeriodicTasksInitialDelay: FiniteDuration = Duration(getMilliseconds("akka.cluster.periodic-tasks-initial-delay"), MILLISECONDS)
  final val GossipInterval: FiniteDuration = Duration(getMilliseconds("akka.cluster.gossip-interval"), MILLISECONDS)
  final val HeartbeatInterval: FiniteDuration = Duration(getMilliseconds("akka.cluster.heartbeat-interval"), MILLISECONDS)
  final val LeaderActionsInterval: FiniteDuration = Duration(getMilliseconds("akka.cluster.leader-actions-interval"), MILLISECONDS)
  final val UnreachableNodesReaperInterval: FiniteDuration = Duration(getMilliseconds("akka.cluster.unreachable-nodes-reaper-interval"), MILLISECONDS)
  final val PublishStatsInterval: FiniteDuration = Duration(getMilliseconds("akka.cluster.publish-stats-interval"), MILLISECONDS)
--- a/akka-cluster/src/multi-jvm/scala/akka/cluster/MultiNodeClusterSpec.scala
+++ b/akka-cluster/src/multi-jvm/scala/akka/cluster/MultiNodeClusterSpec.scala
@ -8,7 +8,7 @@ import com.typesafe.config.Config
 import com.typesafe.config.ConfigFactory
 import akka.actor.{ Address, ExtendedActorSystem }
 import akka.remote.testconductor.RoleName
-import akka.remote.testkit.{STMultiNodeSpec, MultiNodeSpec}
+import akka.remote.testkit.{ STMultiNodeSpec, MultiNodeSpec }
 import akka.testkit._
 import scala.concurrent.util.duration._
 import scala.concurrent.util.Duration
@ -35,11 +35,11 @@ object MultiNodeClusterSpec {
      auto-down                           = off
      jmx.enabled                         = off
      gossip-interval                     = 200 ms
      heartbeat-interval                = 400 ms
      leader-actions-interval             = 200 ms
      unreachable-nodes-reaper-interval   = 200 ms
      periodic-tasks-initial-delay        = 300 ms
      publish-stats-interval              = 0 s # always, when it happens
      failure-detector.heartbeat-interval = 400 ms
    }
    akka.remote.log-remote-lifecycle-events = off
    akka.test {
--- a/akka-docs/rst/cluster/cluster-usage.rst
+++ b/akka-docs/rst/cluster/cluster-usage.rst
@ -214,6 +214,57 @@ frontend nodes and 3 backend nodes::
 .. note:: The above example should probably be designed as two separate, frontend/backend, clusters, when there is a `cluster client for decoupling clusters <https://www.assembla.com/spaces/akka/tickets/1165>`_.
 Failure Detector
 ^^^^^^^^^^^^^^^^
 The nodes in the cluster monitor each other by sending heartbeats to detect if a node is
 unreachable from the rest of the cluster. The heartbeat arrival times are interpreted
 by an implementation of 
 `The Phi Accrual Failure Detector <http://ddg.jaist.ac.jp/pub/HDY+04.pdf>`_. 
 The suspicion level of failure is given by a value called *phi*.
 The basic idea of the phi failure detector is to express the value of *phi* on a scale that
 is dynamically adjusted to reflect current network conditions. 
 The value of *phi* is calculated as::
  phi = -log10(1 - F(timeSinceLastHeartbeat))
 where F is the cumulative distribution function of a normal distribution with mean
 and standard deviation estimated from historical heartbeat inter-arrival times.
 In the :ref:`cluster_configuration` you can adjust the ``akka.cluster.failure-detector.threshold`` 
 to define when a *phi* value is considered to be a failure. 
 A low ``threshold`` is prone to generate many false positives but ensures
 a quick detection in the event of a real crash. Conversely, a high ``threshold``
 generates fewer mistakes but needs more time to detect actual crashes. The
 default ``threshold`` is 8 and is appropriate for most situations. However in
 cloud environments, such as Amazon EC2, the value could be increased to 12 in
 order to account for network issues that sometimes occur on such platforms.
 The following chart illustrates how *phi* increases with increasing time since the 
 previous heartbeat. 
 .. image:: images/phi1.png
 Phi is calculated from the mean and standard deviation of historical
 inter-arrival times. The previous chart is an example for a standard deviation
 of 200 ms. If the heartbeats arrive with less deviation the curve becomes steeper, 
 i.e. it's possible to determine failure more quickly. The curve looks like this for
 a standard deviation of 100 ms.
 .. image:: images/phi2.png
 To be able to survive sudden abnormalities, such as garbage collection pauses and
 transient network failures, the failure detector is configured with a margin, 
 ``akka.cluster.failure-detector.acceptable-heartbeat-pause``. You may want to 
 adjust the :ref:`cluster_configuration` of this depending on your environment.
 This is what the curve looks like for ``acceptable-heartbeat-pause`` configured to
 3 seconds.
 .. image:: images/phi3.png
 Cluster Aware Routers
 ^^^^^^^^^^^^^^^^^^^^^
--- a/akka-docs/rst/cluster/images/phi1.png
+++ b/akka-docs/rst/cluster/images/phi1.png
--- a/akka-docs/rst/cluster/images/phi2.png
+++ b/akka-docs/rst/cluster/images/phi2.png
--- a/akka-docs/rst/cluster/images/phi3.png
+++ b/akka-docs/rst/cluster/images/phi3.png