/**
 * Copyright (C) 2009-2013 Typesafe Inc. <http://www.typesafe.com>
 */
package akka.cluster
import language.postfixOps
import scala.annotation.tailrec
import scala.collection.immutable
import scala.concurrent.duration._
import scala.concurrent.forkjoin.ThreadLocalRandom
import java.util.concurrent.atomic.AtomicReference
import org.scalatest.BeforeAndAfterEach
import com.typesafe.config.Config
import com.typesafe.config.ConfigFactory
import akka.actor.Actor
import akka.actor.ActorLogging
import akka.actor.ActorRef
import akka.actor.ActorSystem
import akka.actor.Address
import akka.actor.Deploy
import akka.actor.OneForOneStrategy
import akka.actor.Props
import akka.actor.RootActorPath
import akka.actor.SupervisorStrategy._
import akka.actor.Terminated
import akka.cluster.ClusterEvent.ClusterMetricsChanged
import akka.cluster.ClusterEvent.CurrentClusterState
import akka.cluster.ClusterEvent.CurrentInternalStats
import akka.cluster.ClusterEvent.MemberEvent
import akka.cluster.StandardMetrics.Cpu
import akka.cluster.StandardMetrics.HeapMemory
import akka.remote.DefaultFailureDetectorRegistry
import akka.remote.PhiAccrualFailureDetector
import akka.remote.RemoteScope
import akka.remote.testkit.MultiNodeConfig
import akka.remote.testkit.MultiNodeSpec
import akka.routing.FromConfig
import akka.testkit._
import akka.testkit.TestEvent._
import akka.actor.Identify
import akka.actor.ActorIdentity
import akka.util.Helpers.Requiring
import java.lang.management.ManagementFactory

/**
 * This test is intended to be used as a long running stress test
 * of cluster related features. Number of nodes and duration of
 * the test steps can be configured. The test scenario is organized as
 * follows:
 * 1. join nodes in various ways up to the configured total number of nodes
 * 2. while nodes are joining a few cluster aware routers are also working
 * 3. exercise concurrent joining and shutdown of nodes repeatedly
 * 4. exercise cluster aware routers, including high throughput
 * 5. exercise many actors in a tree structure
 * 6. exercise remote supervision
 * 7. gossip without any changes to the membership
 * 8. leave and shutdown nodes in various ways
 * 9. while nodes are removed remote death watch is also exercised
 * 10. while nodes are removed a few cluster aware routers are also working
 *
 * By default it uses 13 nodes.
 * Example of sbt command line parameters to double that:
 * `-DMultiJvm.akka.cluster.Stress.nrOfNodes=26 -Dmultinode.Dakka.test.cluster-stress-spec.nr-of-nodes-factor=2`
 */
private[cluster] object StressMultiJvmSpec extends MultiNodeConfig {

  val totalNumberOfNodes =
    System.getProperty("MultiJvm.akka.cluster.Stress.nrOfNodes") match {
      case null  ⇒ 13
      case value ⇒ value.toInt requiring (_ >= 10, "nrOfNodes must be >= 10")
    }
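  // `requiring` comes from akka.util.Helpers.Requiring: it validates the value with the
  // given predicate and returns it unchanged, otherwise the test setup fails with an
  // IllegalArgumentException carrying the supplied message.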

  for (n ← 1 to totalNumberOfNodes) role("node-" + n)

  // Note that this test uses default configuration,
  // not MultiNodeClusterSpec.clusterConfig
  commonConfig(ConfigFactory.parseString("""
    akka.test.cluster-stress-spec {
      infolog = off
      # scale the nr-of-nodes* settings with this factor
      nr-of-nodes-factor = 1
      # not scaled
      nr-of-seed-nodes = 3
      nr-of-nodes-joining-to-seed-initally = 2
      nr-of-nodes-joining-one-by-one-small = 2
      nr-of-nodes-joining-one-by-one-large = 2
      nr-of-nodes-joining-to-one = 2
      nr-of-nodes-leaving-one-by-one-small = 1
      nr-of-nodes-leaving-one-by-one-large = 2
      nr-of-nodes-leaving = 2
      nr-of-nodes-shutdown-one-by-one-small = 1
      nr-of-nodes-shutdown-one-by-one-large = 2
      nr-of-nodes-shutdown = 2
      nr-of-nodes-join-remove = 2
      # not scaled
      # scale the *-duration settings with this factor
      duration-factor = 1
      join-remove-duration = 90 s
      work-batch-size = 100
      work-batch-interval = 2 s
      payload-size = 1000
      normal-throughput-duration = 30 s
      high-throughput-duration = 10 s
      supervision-duration = 10 s
      supervision-one-iteration = 2.5 s
      idle-gossip-duration = 10 s
      expected-test-duration = 600 s
      # actors are created in a tree structure defined
      # by tree-width (number of children for each actor) and
      # tree-levels, total number of actors can be calculated by
      # (width * math.pow(width, levels) - 1) / (width - 1)
      tree-width = 4
      tree-levels = 4
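      # e.g. with the default tree-width = 4 and tree-levels = 4:
      # (4 * 4^4 - 1) / (4 - 1) = 1023 / 3 = 341 actors per tree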
      report-metrics-interval = 10 s
      # scale convergence within timeouts with this factor
      convergence-within-factor = 1.0
      # set to off to only test cluster membership
      exercise-actors = on
    }

    akka.actor.serialize-messages = off
    akka.actor.serialize-creators = off

    akka.actor.provider = akka.cluster.ClusterActorRefProvider
    akka.cluster {
      auto-down-unreachable-after = 1 s
      publish-stats-interval = 1 s
    }
    akka.loggers = ["akka.testkit.TestEventListener"]
    akka.loglevel = INFO
    akka.remote.log-remote-lifecycle-events = off
    akka.actor.default-dispatcher.fork-join-executor {
      parallelism-min = 8
      parallelism-max = 8
    }
    akka.actor.deployment {
      /master-node-1/workers {
        router = round-robin
        nr-of-instances = 100
        cluster {
          enabled = on
          max-nr-of-instances-per-node = 1
          allow-local-routees = off
        }
      }
      /master-node-2/workers {
        router = round-robin
        nr-of-instances = 100
        cluster {
          enabled = on
          routees-path = "/user/worker"
          allow-local-routees = off
        }
      }
      /master-node-3/workers = {
        router = adaptive
        nr-of-instances = 100
        cluster {
          enabled = on
          max-nr-of-instances-per-node = 1
          allow-local-routees = off
        }
      }
    }
    """))

  class Settings(conf: Config) {
    private val testConfig = conf.getConfig("akka.test.cluster-stress-spec")
    import testConfig._

    private def getDuration(name: String): FiniteDuration = Duration(getMilliseconds(name), MILLISECONDS)

    val infolog = getBoolean("infolog")
    val nFactor = getInt("nr-of-nodes-factor")
    val numberOfSeedNodes = getInt("nr-of-seed-nodes") // not scaled by nodes factor
    val numberOfNodesJoiningToSeedNodesInitially = getInt("nr-of-nodes-joining-to-seed-initally") * nFactor
    val numberOfNodesJoiningOneByOneSmall = getInt("nr-of-nodes-joining-one-by-one-small") * nFactor
    val numberOfNodesJoiningOneByOneLarge = getInt("nr-of-nodes-joining-one-by-one-large") * nFactor
    val numberOfNodesJoiningToOneNode = getInt("nr-of-nodes-joining-to-one") * nFactor
    // remaining will join to seed nodes
    val numberOfNodesJoiningToSeedNodes = (totalNumberOfNodes - numberOfSeedNodes -
      numberOfNodesJoiningToSeedNodesInitially - numberOfNodesJoiningOneByOneSmall -
      numberOfNodesJoiningOneByOneLarge - numberOfNodesJoiningToOneNode) requiring (_ >= 0,
      s"too many configured nr-of-nodes-joining-*, total must be <= ${totalNumberOfNodes}")
    val numberOfNodesLeavingOneByOneSmall = getInt("nr-of-nodes-leaving-one-by-one-small") * nFactor
    val numberOfNodesLeavingOneByOneLarge = getInt("nr-of-nodes-leaving-one-by-one-large") * nFactor
    val numberOfNodesLeaving = getInt("nr-of-nodes-leaving") * nFactor
    val numberOfNodesShutdownOneByOneSmall = getInt("nr-of-nodes-shutdown-one-by-one-small") * nFactor
    val numberOfNodesShutdownOneByOneLarge = getInt("nr-of-nodes-shutdown-one-by-one-large") * nFactor
    val numberOfNodesShutdown = getInt("nr-of-nodes-shutdown") * nFactor
    val numberOfNodesJoinRemove = getInt("nr-of-nodes-join-remove") // not scaled by nodes factor

    val workBatchSize = getInt("work-batch-size")
    val workBatchInterval = Duration(getMilliseconds("work-batch-interval"), MILLISECONDS)
    val payloadSize = getInt("payload-size")
    val dFactor = getInt("duration-factor")
    val joinRemoveDuration = getDuration("join-remove-duration") * dFactor
    val normalThroughputDuration = getDuration("normal-throughput-duration") * dFactor
    val highThroughputDuration = getDuration("high-throughput-duration") * dFactor
    val supervisionDuration = getDuration("supervision-duration") * dFactor
    val supervisionOneIteration = getDuration("supervision-one-iteration") * dFactor
    val idleGossipDuration = getDuration("idle-gossip-duration") * dFactor
    val expectedTestDuration = getDuration("expected-test-duration") * dFactor
    val treeWidth = getInt("tree-width")
    val treeLevels = getInt("tree-levels")
    val reportMetricsInterval = getDuration("report-metrics-interval")
    val convergenceWithinFactor = getDouble("convergence-within-factor")
    val exerciseActors = getBoolean("exercise-actors")

    require(numberOfSeedNodes + numberOfNodesJoiningToSeedNodesInitially + numberOfNodesJoiningOneByOneSmall +
      numberOfNodesJoiningOneByOneLarge + numberOfNodesJoiningToOneNode + numberOfNodesJoiningToSeedNodes <= totalNumberOfNodes,
      s"specified number of joining nodes <= ${totalNumberOfNodes}")

    // don't shutdown the 3 nodes hosting the master actors
    require(numberOfNodesLeavingOneByOneSmall + numberOfNodesLeavingOneByOneLarge + numberOfNodesLeaving +
      numberOfNodesShutdownOneByOneSmall + numberOfNodesShutdownOneByOneLarge + numberOfNodesShutdown <= totalNumberOfNodes - 3,
      s"specified number of leaving/shutdown nodes <= ${totalNumberOfNodes - 3}")
    require(numberOfNodesJoinRemove <= totalNumberOfNodes, s"nr-of-nodes-join-remove must be <= ${totalNumberOfNodes}")

    override def toString: String = {
      testConfig.withFallback(ConfigFactory.parseString(s"nrOfNodes=${totalNumberOfNodes}")).root.render
    }
  }

  implicit class FormattedDouble(val d: Double) extends AnyVal {
    def form: String = d.formatted("%.2f")
  }

  case class ClusterResult(
    address: Address,
    duration: Duration,
    clusterStats: GossipStats)

  case class AggregatedClusterResult(title: String, duration: Duration, clusterStats: GossipStats)

  /**
   * Central aggregator of cluster statistics and metrics.
   * Reports the result via log periodically and when all
   * expected results have been collected. It shuts itself
   * down when the expected results have been collected.
   */
  class ClusterResultAggregator(title: String, expectedResults: Int, settings: Settings) extends Actor with ActorLogging {
    import settings.reportMetricsInterval
    import settings.infolog

    val cluster = Cluster(context.system)
    var reportTo: Option[ActorRef] = None
    var results = Vector.empty[ClusterResult]
    var nodeMetrics = Set.empty[NodeMetrics]
    var phiValuesObservedByNode = {
      import akka.cluster.Member.addressOrdering
      immutable.SortedMap.empty[Address, immutable.SortedSet[PhiValue]]
    }
    var clusterStatsObservedByNode = {
      import akka.cluster.Member.addressOrdering
      immutable.SortedMap.empty[Address, CurrentInternalStats]
    }

    import context.dispatcher
    val reportMetricsTask = context.system.scheduler.schedule(
      reportMetricsInterval, reportMetricsInterval, self, ReportTick)

    // subscribe to ClusterMetricsChanged, re-subscribe when restart
    override def preStart(): Unit = cluster.subscribe(self, classOf[ClusterMetricsChanged])

    override def postStop(): Unit = {
      cluster.unsubscribe(self)
      reportMetricsTask.cancel()
      super.postStop()
    }

    def receive = {
      case ClusterMetricsChanged(clusterMetrics) ⇒ nodeMetrics = clusterMetrics
      case PhiResult(from, phiValues)            ⇒ phiValuesObservedByNode += from -> phiValues
      case StatsResult(from, stats)              ⇒ clusterStatsObservedByNode += from -> stats
      case ReportTick ⇒
        if (infolog)
          log.info(s"[${title}] in progress\n${formatMetrics}\n\n${formatPhi}\n\n${formatStats}")
      case r: ClusterResult ⇒
        results :+= r
        if (results.size == expectedResults) {
          val aggregated = AggregatedClusterResult(title, maxDuration, totalGossipStats)
          if (infolog)
            log.info(s"[${title}] completed in [${aggregated.duration.toMillis}] ms\n${aggregated.clusterStats}\n${formatMetrics}\n\n${formatPhi}\n\n${formatStats}")
          reportTo foreach { _ ! aggregated }
          context stop self
        }
      case _: CurrentClusterState ⇒
      case ReportTo(ref)          ⇒ reportTo = ref
    }

    def maxDuration = results.map(_.duration).max

    def totalGossipStats = results.foldLeft(GossipStats()) { _ :+ _.clusterStats }

    def formatMetrics: String = {
      import akka.cluster.Member.addressOrdering
      (formatMetricsHeader +: (nodeMetrics.toSeq.sortBy(_.address) map formatMetricsLine)).mkString("\n")
    }

    def formatMetricsHeader: String = "[Node]\t[Heap (MB)]\t[CPU (%)]\t[Load]"

    def formatMetricsLine(nodeMetrics: NodeMetrics): String = {
      val heap = nodeMetrics match {
        case HeapMemory(address, timestamp, used, committed, max) ⇒
          (used.doubleValue / 1024 / 1024).form
        case _ ⇒ ""
      }
      val cpuAndLoad = nodeMetrics match {
        case Cpu(address, timestamp, loadOption, cpuOption, processors) ⇒
          format(cpuOption) + "\t" + format(loadOption)
        case _ ⇒ "N/A\tN/A"
      }
      s"${nodeMetrics.address}\t${heap}\t${cpuAndLoad}"
    }

    def format(opt: Option[Double]) = opt match {
      case None    ⇒ "N/A"
      case Some(x) ⇒ x.form
    }

    def formatPhi: String = {
      if (phiValuesObservedByNode.isEmpty) ""
      else {
        import akka.cluster.Member.addressOrdering
        val lines =
          for {
            (monitor, phiValues) ← phiValuesObservedByNode
            phi ← phiValues
          } yield formatPhiLine(monitor, phi.address, phi)

        lines.mkString(formatPhiHeader + "\n", "\n", "")
      }
    }

    def formatPhiHeader: String = "[Monitor]\t[Subject]\t[count]\t[count phi > 1.0]\t[max phi]"

    def formatPhiLine(monitor: Address, subject: Address, phi: PhiValue): String =
      s"${monitor}\t${subject}\t${phi.count}\t${phi.countAboveOne}\t${phi.max.form}"

    def formatStats: String = {
      def f(stats: CurrentInternalStats) = {
        import stats.gossipStats._
        import stats.vclockStats._
        s"ClusterStats($receivedGossipCount, $mergeCount, $sameCount, $newerCount, $olderCount, $versionSize, $seenLatest)"
      }
      (clusterStatsObservedByNode map { case (monitor, stats) ⇒ s"${monitor}\t${f(stats)}" }).
        mkString("ClusterStats(gossip, merge, same, newer, older, vclockSize, seenLatest)\n", "\n", "")
    }
  }

  /**
   * Keeps cluster statistics and metrics reported by
   * ClusterResultAggregator. Logs the list of historical
   * results when a new AggregatedClusterResult is received.
   */
  class ClusterResultHistory extends Actor with ActorLogging {
    var history = Vector.empty[AggregatedClusterResult]

    def receive = {
      case result: AggregatedClusterResult ⇒
        history :+= result
        log.info("Cluster result history\n" + formatHistory)
    }

    def formatHistory: String =
      (formatHistoryHeader +: (history map formatHistoryLine)).mkString("\n")

    def formatHistoryHeader: String = "[Title]\t[Duration (ms)]\t[GossipStats(gossip, merge, same, newer, older)]"

    def formatHistoryLine(result: AggregatedClusterResult): String =
      s"${result.title}\t${result.duration.toMillis}\t${result.clusterStats}"
  }

  /**
   * Collect phi values of the failure detector and report to the
   * central ClusterResultAggregator.
   */
  class PhiObserver extends Actor with ActorLogging {
    val cluster = Cluster(context.system)
    var reportTo: Option[ActorRef] = None
    val emptyPhiByNode = Map.empty[Address, PhiValue].withDefault(address ⇒ PhiValue(address, 0, 0, 0.0))
    var phiByNode = emptyPhiByNode
    var nodes = Set.empty[Address]
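    // current phi value reported by the accrual failure detector for the given address,
    // 0.0 when the address is not monitored by a PhiAccrualFailureDetector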
    def phi(address: Address): Double = cluster.failureDetector match {
      case reg: DefaultFailureDetectorRegistry[Address] ⇒ reg.failureDetector(address) match {
        case Some(fd: PhiAccrualFailureDetector) ⇒ fd.phi
        case _                                   ⇒ 0.0
      }
      case _ ⇒ 0.0
    }

    import context.dispatcher
    val checkPhiTask = context.system.scheduler.schedule(
      1.second, 1.second, self, PhiTick)

    // subscribe to MemberEvent, re-subscribe when restart
    override def preStart(): Unit = cluster.subscribe(self, classOf[MemberEvent])

    override def postStop(): Unit = {
      cluster.unsubscribe(self)
      checkPhiTask.cancel()
      super.postStop()
    }

    def receive = {
      case PhiTick ⇒
        nodes foreach { node ⇒
          val previous = phiByNode(node)
          val φ = phi(node)
          if (φ > 0 || cluster.failureDetector.isMonitoring(node)) {
            val aboveOne = if (!φ.isInfinite && φ > 1.0) 1 else 0
            phiByNode += node -> PhiValue(node, previous.countAboveOne + aboveOne, previous.count + 1,
              math.max(previous.max, φ))
          }
        }
        val phiSet = immutable.SortedSet.empty[PhiValue] ++ phiByNode.values
        reportTo foreach { _ ! PhiResult(cluster.selfAddress, phiSet) }
      case state: CurrentClusterState ⇒ nodes = state.members.map(_.address)
      case memberEvent: MemberEvent   ⇒ nodes += memberEvent.member.address
      case ReportTo(ref) ⇒
        reportTo foreach context.unwatch
        reportTo = ref
        reportTo foreach context.watch
      case Terminated(ref) ⇒
        reportTo match {
          case Some(`ref`) ⇒ reportTo = None
          case _           ⇒
        }
      case Reset ⇒
        phiByNode = emptyPhiByNode
        nodes = Set.empty[Address]
        cluster.unsubscribe(self)
        cluster.subscribe(self, classOf[MemberEvent])
    }
  }

  class StatsObserver extends Actor {
    val cluster = Cluster(context.system)
    var reportTo: Option[ActorRef] = None
    var startStats: Option[GossipStats] = None

    override def preStart(): Unit = cluster.subscribe(self, classOf[CurrentInternalStats])

    override def postStop(): Unit = cluster.unsubscribe(self)

    def receive = {
      case CurrentInternalStats(gossipStats, vclockStats) ⇒
        val diff = startStats match {
          case None        ⇒ { startStats = Some(gossipStats); gossipStats }
          case Some(start) ⇒ gossipStats :- start
        }
        val res = StatsResult(cluster.selfAddress, CurrentInternalStats(diff, vclockStats))
        reportTo foreach { _ ! res }
      case ReportTo(ref) ⇒
        reportTo foreach context.unwatch
        reportTo = ref
        reportTo foreach context.watch
      case Terminated(ref) ⇒
        reportTo match {
          case Some(`ref`) ⇒ reportTo = None
          case _           ⇒
        }
      case Reset ⇒
        startStats = None
      case _: CurrentClusterState ⇒ // not interesting here
    }
  }

  /**
   * Master of routers
   *
   * Flow control, to not flood the consumers, is handled by scheduling a
   * batch of messages to be sent to the router when half of the number
   * of outstanding messages remains.
   *
   * It uses a simple message retry mechanism. If an ack of a sent message
   * is not received within a timeout, that message will be resent to the router,
   * an unlimited number of times.
   *
   * When it receives the `End` command it stops sending messages to the router,
   * continues to resend until all outstanding acks have been received, and then
   * finally replies with `WorkResult` to the sender of the `End` command and stops
   * itself.
   */
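  // With the default work-batch-size = 100 a fresh batch of 100 jobs is scheduled
  // as soon as the number of unacknowledged jobs drops to 50.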
  class Master(settings: StressMultiJvmSpec.Settings, batchInterval: FiniteDuration, tree: Boolean) extends Actor {
    val workers = context.actorOf(Props[Worker].withRouter(FromConfig), "workers")
    val payload = Array.fill(settings.payloadSize)(ThreadLocalRandom.current.nextInt(127).toByte)
    val retryTimeout = 5.seconds.dilated(context.system)
    val idCounter = Iterator from 0
    var sendCounter = 0L
    var ackCounter = 0L
    var outstanding = Map.empty[JobId, JobState]
    var startTime = 0L

    import context.dispatcher
    val resendTask = context.system.scheduler.schedule(3.seconds, 3.seconds, self, RetryTick)

    override def postStop(): Unit = {
      resendTask.cancel()
      super.postStop()
    }

    def receive = {
      case Begin ⇒
        startTime = System.nanoTime
        self ! SendBatch
        context.become(working)
      case RetryTick ⇒
    }

    def working: Receive = {
      case Ack(id) ⇒
        outstanding -= id
        ackCounter += 1
        if (outstanding.size == settings.workBatchSize / 2)
          if (batchInterval == Duration.Zero) self ! SendBatch
          else context.system.scheduler.scheduleOnce(batchInterval, self, SendBatch)
      case SendBatch ⇒ sendJobs()
      case RetryTick ⇒ resend()
      case End ⇒
        done(sender)
        context.become(ending(sender))
    }

    def ending(replyTo: ActorRef): Receive = {
      case Ack(id) ⇒
        outstanding -= id
        ackCounter += 1
        done(replyTo)
      case SendBatch ⇒
      case RetryTick ⇒ resend()
    }

    def done(replyTo: ActorRef): Unit =
      if (outstanding.isEmpty) {
        val duration = (System.nanoTime - startTime).nanos
        replyTo ! WorkResult(duration, sendCounter, ackCounter)
        context stop self
      }

    def sendJobs(): Unit = {
      0 until settings.workBatchSize foreach { _ ⇒
        send(createJob())
      }
    }

    def createJob(): Job = {
      if (tree) TreeJob(idCounter.next(), payload, ThreadLocalRandom.current.nextInt(settings.treeWidth),
        settings.treeLevels, settings.treeWidth)
      else SimpleJob(idCounter.next(), payload)
    }

    def resend(): Unit = {
      outstanding.values foreach { jobState ⇒
        if (jobState.deadline.isOverdue)
          send(jobState.job)
      }
    }

    def send(job: Job): Unit = {
      outstanding += job.id -> JobState(Deadline.now + retryTimeout, job)
      sendCounter += 1
      workers ! job
    }
  }

  /**
   * Used by Master as routee
   */
  class Worker extends Actor with ActorLogging {
    def receive = {
      case SimpleJob(id, payload) ⇒ sender ! Ack(id)
      case TreeJob(id, payload, idx, levels, width) ⇒
        // create the actors when first TreeJob message is received
        val totalActors = ((width * math.pow(width, levels) - 1) / (width - 1)).toInt
        log.debug("Creating [{}] actors in a tree structure of [{}] levels and each actor has [{}] children",
          totalActors, levels, width)
        val tree = context.actorOf(Props(classOf[TreeNode], levels, width), "tree")
        tree forward ((idx, SimpleJob(id, payload)))
        context.become(treeWorker(tree))
    }

    def treeWorker(tree: ActorRef): Receive = {
      case SimpleJob(id, payload) ⇒ sender ! Ack(id)
      case TreeJob(id, payload, idx, _, _) ⇒
        tree forward ((idx, SimpleJob(id, payload)))
    }
  }

  class TreeNode(level: Int, width: Int) extends Actor {
    require(level >= 1)
    def createChild(): Actor = if (level == 1) new Leaf else new TreeNode(level - 1, width)
    val indexedChildren =
      0 until width map { i ⇒ context.actorOf(Props(createChild()).withDeploy(Deploy.local), name = i.toString) } toVector

    def receive = {
      case (idx: Int, job: SimpleJob) if idx < width ⇒ indexedChildren(idx) forward ((idx, job))
    }
  }

  class Leaf extends Actor {
    def receive = {
      case (_: Int, job: SimpleJob) ⇒ sender ! Ack(job.id)
    }
  }

  /**
   * Used for remote death watch testing
   */
  class Watchee extends Actor {
    def receive = Actor.emptyBehavior
  }

  /**
   * Used for remote supervision testing
   */
  class Supervisor extends Actor {

    var restartCount = 0

    override val supervisorStrategy =
      OneForOneStrategy(maxNrOfRetries = 5, withinTimeRange = 1 minute) {
        case _: Exception ⇒
          restartCount += 1
          Restart
      }

    def receive = {
      case props: Props     ⇒ context.actorOf(props)
      case e: Exception     ⇒ context.children foreach { _ ! e }
      case GetChildrenCount ⇒ sender ! ChildrenCount(context.children.size, restartCount)
      case Reset ⇒
        require(context.children.isEmpty,
          s"ResetChildrenCount not allowed when children exists, [${context.children.size}]")
        restartCount = 0
    }
  }

  /**
   * Child of Supervisor for remote supervision testing
   */
  class RemoteChild extends Actor {
    def receive = {
      case e: Exception ⇒ throw e
    }
  }

  case object Begin
  case object End
  case object RetryTick
  case object ReportTick
  case object PhiTick
  case class PhiResult(from: Address, phiValues: immutable.SortedSet[PhiValue])
  case class PhiValue(address: Address, countAboveOne: Int, count: Int, max: Double) extends Ordered[PhiValue] {
    import akka.cluster.Member.addressOrdering
    def compare(that: PhiValue) = addressOrdering.compare(this.address, that.address)
  }
  case class ReportTo(ref: Option[ActorRef])
  case class StatsResult(from: Address, stats: CurrentInternalStats)

  type JobId = Int
  trait Job { def id: JobId }
  case class SimpleJob(id: JobId, payload: Any) extends Job
  case class TreeJob(id: JobId, payload: Any, idx: Int, levels: Int, width: Int) extends Job
  case class Ack(id: JobId)
  case class JobState(deadline: Deadline, job: Job)
  case class WorkResult(duration: Duration, sendCount: Long, ackCount: Long) {
    def retryCount: Long = sendCount - ackCount
    def jobsPerSecond: Double = ackCount * 1000.0 / duration.toMillis
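    // e.g. 3000 acks over a 10 s run gives 3000 * 1000.0 / 10000 = 300.0 jobs/s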
}
  case object SendBatch
  case class CreateTree(levels: Int, width: Int)

  case object GetChildrenCount
  case class ChildrenCount(numberOfChildren: Int, numberOfChildRestarts: Int)
  case object Reset
}
class StressMultiJvmNode1 extends StressSpec
class StressMultiJvmNode2 extends StressSpec
class StressMultiJvmNode3 extends StressSpec
class StressMultiJvmNode4 extends StressSpec
class StressMultiJvmNode5 extends StressSpec
class StressMultiJvmNode6 extends StressSpec
class StressMultiJvmNode7 extends StressSpec
class StressMultiJvmNode8 extends StressSpec
class StressMultiJvmNode9 extends StressSpec
class StressMultiJvmNode10 extends StressSpec
class StressMultiJvmNode11 extends StressSpec
class StressMultiJvmNode12 extends StressSpec
class StressMultiJvmNode13 extends StressSpec
abstract class StressSpec
  extends MultiNodeSpec(StressMultiJvmSpec)
  with MultiNodeClusterSpec with BeforeAndAfterEach with ImplicitSender {

  import StressMultiJvmSpec._
  import ClusterEvent._

  val settings = new Settings(system.settings.config)
  import settings._

  val identifyProbe = TestProbe()

  var step = 0
  var nbrUsedRoles = 0

  override def beforeEach(): Unit = { step += 1 }

  override def expectedTestDuration = settings.expectedTestDuration

  override def shutdownTimeout: FiniteDuration = 30.seconds.dilated

  override def muteLog(sys: ActorSystem = system): Unit = {
    super.muteLog(sys)
    sys.eventStream.publish(Mute(EventFilter[RuntimeException](pattern = ".*Simulated exception.*")))
    muteDeadLetters(classOf[SimpleJob], classOf[AggregatedClusterResult], SendBatch.getClass,
      classOf[StatsResult], classOf[PhiResult], RetryTick.getClass)(sys)
  }

  def jvmInfo(): String = {
    val runtime = ManagementFactory.getRuntimeMXBean
    val os = ManagementFactory.getOperatingSystemMXBean
    val threads = ManagementFactory.getThreadMXBean
    val mem = ManagementFactory.getMemoryMXBean
    val heap = mem.getHeapMemoryUsage

    val sb = new StringBuilder
    sb.append("Operating system: ").append(os.getName).append(", ").append(os.getArch).append(", ").append(os.getVersion)
    sb.append("\n")
    sb.append("JVM: ").append(runtime.getVmName).append(" ").append(runtime.getVmVendor).
      append(" ").append(runtime.getVmVersion)
    sb.append("\n")
    sb.append("Processors: ").append(os.getAvailableProcessors)
    sb.append("\n")
    sb.append("Load average: ").append(os.getSystemLoadAverage)
    sb.append("\n")
    sb.append("Thread count: ").append(threads.getThreadCount).append(" (").append(threads.getPeakThreadCount).append(")")
    sb.append("\n")
    sb.append("Heap: ").append((heap.getUsed.toDouble / 1024 / 1024).form).
      append(" (").append((heap.getInit.toDouble / 1024 / 1024).form).
      append(" - ").
      append((heap.getMax.toDouble / 1024 / 1024).form).
      append(")").append(" MB")
    sb.append("\n")

    import scala.collection.JavaConverters._
    val args = runtime.getInputArguments.asScala.filterNot(_.contains("classpath")).mkString("\n  ")
    sb.append("Args:\n  ").append(args)
    sb.append("\n")

    sb.toString
  }

  val seedNodes = roles.take(numberOfSeedNodes)

  def latestGossipStats = cluster.readView.latestStats.gossipStats

  override def cluster: Cluster = {
    createWorker
    super.cluster
  }

  // always create one worker when the cluster is started
  lazy val createWorker: Unit =
    system.actorOf(Props[Worker], "worker")

  def createResultAggregator(title: String, expectedResults: Int, includeInHistory: Boolean): Unit = {
    runOn(roles.head) {
      val aggregator = system.actorOf(Props(classOf[ClusterResultAggregator], title, expectedResults, settings).withDeploy(Deploy.local),
        name = "result" + step)
      if (includeInHistory && infolog) aggregator ! ReportTo(Some(clusterResultHistory))
      else aggregator ! ReportTo(None)
    }
    enterBarrier("result-aggregator-created-" + step)
    runOn(roles.take(nbrUsedRoles): _*) {
      phiObserver ! ReportTo(clusterResultAggregator)
      statsObserver ! Reset
      statsObserver ! ReportTo(clusterResultAggregator)
    }
  }

  def clusterResultAggregator: Option[ActorRef] = {
    system.actorSelection(node(roles.head) / "user" / ("result" + step)).tell(Identify(step), identifyProbe.ref)
    identifyProbe.expectMsgType[ActorIdentity].ref
  }

  lazy val clusterResultHistory =
    if (settings.infolog) system.actorOf(Props[ClusterResultHistory], "resultHistory")
    else system.deadLetters

  lazy val phiObserver = system.actorOf(Props[PhiObserver], "phiObserver")

  lazy val statsObserver = system.actorOf(Props[StatsObserver], "statsObserver")

  def awaitClusterResult(): Unit = {
    runOn(roles.head) {
      clusterResultAggregator match {
        case Some(r) ⇒
          watch(r)
          expectMsgPF(remaining) { case Terminated(a) if a.path == r.path ⇒ true }
        case None ⇒ // ok, already terminated
      }
    }
    enterBarrier("cluster-result-done-" + step)
  }

  def joinOneByOne(numberOfNodes: Int): Unit = {
    0 until numberOfNodes foreach { _ ⇒
      joinOne()
      nbrUsedRoles += 1
      step += 1
    }
  }

  def convergenceWithin(base: FiniteDuration, nodes: Int): FiniteDuration =
    (base.toMillis * convergenceWithinFactor * nodes).millis
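  // e.g. convergenceWithin(2.seconds, 13) with the default convergence-within-factor = 1.0
  // allows 2 * 1.0 * 13 = 26 seconds for the cluster to converge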

  def joinOne(): Unit = within(5.seconds + convergenceWithin(2.seconds, nbrUsedRoles + 1)) {
    val currentRoles = roles.take(nbrUsedRoles + 1)
    val title = s"join one to ${nbrUsedRoles} nodes cluster"
    createResultAggregator(title, expectedResults = currentRoles.size, includeInHistory = true)
    runOn(currentRoles: _*) {
      reportResult {
        runOn(currentRoles.last) {
          cluster.join(roles.head)
        }
        awaitMembersUp(currentRoles.size, timeout = remaining)
      }
    }
    awaitClusterResult()
    enterBarrier("join-one-" + step)
  }

  def joinSeveral(numberOfNodes: Int, toSeedNodes: Boolean): Unit =
    within(10.seconds + convergenceWithin(3.seconds, nbrUsedRoles + numberOfNodes)) {
      val currentRoles = roles.take(nbrUsedRoles + numberOfNodes)
      val joiningRoles = currentRoles.takeRight(numberOfNodes)
      val title = s"join ${numberOfNodes} to ${if (toSeedNodes) "seed nodes" else "one node"}, in ${nbrUsedRoles} nodes cluster"
      createResultAggregator(title, expectedResults = currentRoles.size, includeInHistory = true)
      runOn(currentRoles: _*) {
        reportResult {
          runOn(joiningRoles: _*) {
            if (toSeedNodes) cluster.joinSeedNodes(seedNodes.toIndexedSeq map address)
            else cluster.join(roles.head)
          }
          awaitMembersUp(currentRoles.size, timeout = remaining)
        }
      }
      awaitClusterResult()
      enterBarrier("join-several-" + step)
    }

  def removeOneByOne(numberOfNodes: Int, shutdown: Boolean): Unit = {
    0 until numberOfNodes foreach { _ ⇒
      removeOne(shutdown)
      nbrUsedRoles -= 1
      step += 1
    }
  }

  def removeOne(shutdown: Boolean): Unit = within(25.seconds + convergenceWithin(3.seconds, nbrUsedRoles - 1)) {
    val currentRoles = roles.take(nbrUsedRoles - 1)
    val title = s"${if (shutdown) "shutdown" else "remove"} one from ${nbrUsedRoles} nodes cluster"
    createResultAggregator(title, expectedResults = currentRoles.size, includeInHistory = true)
    val removeRole = roles(nbrUsedRoles - 1)
    val removeAddress = address(removeRole)
    runOn(removeRole) {
      system.actorOf(Props[Watchee], "watchee")
      if (!shutdown) cluster.leave(myself)
    }
    enterBarrier("watchee-created-" + step)

    runOn(roles.head) {
      system.actorSelection(node(removeRole) / "user" / "watchee").tell(Identify("watchee"), identifyProbe.ref)
      val watchee = identifyProbe.expectMsgType[ActorIdentity].ref.get
      watch(watchee)
    }
    enterBarrier("watch-established-" + step)

    runOn(currentRoles: _*) {
      reportResult {
        runOn(roles.head) {
          if (shutdown) {
            if (infolog)
              log.info("Shutting down [{}]", removeAddress)
            testConductor.exit(removeRole, 0).await
          }
        }
        awaitMembersUp(currentRoles.size, timeout = remaining)
        awaitAllReachable()
      }
    }

    runOn(roles.head) {
      val expectedPath = RootActorPath(removeAddress) / "user" / "watchee"
      expectMsgPF(remaining) {
        case Terminated(a) if a.path == expectedPath ⇒ true
      }
    }
    enterBarrier("watch-verified-" + step)
    awaitClusterResult()
    enterBarrier("remove-one-" + step)
  }

  def removeSeveral(numberOfNodes: Int, shutdown: Boolean): Unit =
    within(25.seconds + convergenceWithin(5.seconds, nbrUsedRoles - numberOfNodes)) {
      val currentRoles = roles.take(nbrUsedRoles - numberOfNodes)
      val removeRoles = roles.slice(currentRoles.size, nbrUsedRoles)
      val title = s"${if (shutdown) "shutdown" else "leave"} ${numberOfNodes} in ${nbrUsedRoles} nodes cluster"
      createResultAggregator(title, expectedResults = currentRoles.size, includeInHistory = true)
      runOn(removeRoles: _*) {
        if (!shutdown) cluster.leave(myself)
      }
      runOn(currentRoles: _*) {
        reportResult {
          runOn(roles.head) {
            if (shutdown) removeRoles.foreach { r ⇒
              if (infolog)
                log.info("Shutting down [{}]", address(r))
              testConductor.exit(r, 0).await
            }
          }
          awaitMembersUp(currentRoles.size, timeout = remaining)
          awaitAllReachable()
        }
      }
      awaitClusterResult()
      enterBarrier("remove-several-" + step)
    }

  def reportResult[T](thunk: ⇒ T): T = {
    val startTime = System.nanoTime
    val startStats = clusterView.latestStats.gossipStats

    val returnValue = thunk

    clusterResultAggregator foreach {
      _ ! ClusterResult(cluster.selfAddress, (System.nanoTime - startTime).nanos, latestGossipStats :- startStats)
    }

    returnValue
  }

  def exerciseJoinRemove(title: String, duration: FiniteDuration): Unit = {
    val activeRoles = roles.take(numberOfNodesJoinRemove)
    val loopDuration = 10.seconds + convergenceWithin(4.seconds, nbrUsedRoles + activeRoles.size)
    val rounds = ((duration - loopDuration).toMillis / loopDuration.toMillis).max(1).toInt
    val usedRoles = roles.take(nbrUsedRoles)
    val usedAddresses = usedRoles.map(address(_)).toSet

    @tailrec def loop(counter: Int, previousAS: Option[ActorSystem], allPreviousAddresses: Set[Address]): Option[ActorSystem] = {
      if (counter > rounds) previousAS
      else {
        val t = title + " round " + counter
        runOn(usedRoles: _*) {
          phiObserver ! Reset
          statsObserver ! Reset
        }
        createResultAggregator(t, expectedResults = nbrUsedRoles, includeInHistory = true)
        val (nextAS, nextAddresses) = within(loopDuration) {
          reportResult {
            val nextAS =
              if (activeRoles contains myself) {
                previousAS foreach { as ⇒ TestKit.shutdownActorSystem(as) }
                val sys = ActorSystem(system.name, system.settings.config)
                muteLog(sys)
                Cluster(sys).joinSeedNodes(seedNodes.toIndexedSeq map address)
                Some(sys)
              } else previousAS
            runOn(usedRoles: _*) {
              awaitMembersUp(
                nbrUsedRoles + activeRoles.size,
                canNotBePartOfMemberRing = allPreviousAddresses,
                timeout = remaining)
              awaitAllReachable()
            }
            val nextAddresses = clusterView.members.map(_.address) -- usedAddresses
            runOn(usedRoles: _*) {
              nextAddresses.size must be(numberOfNodesJoinRemove)
            }
            enterBarrier("join-remove-" + step)
            (nextAS, nextAddresses)
          }
        }
        awaitClusterResult()

        step += 1
        loop(counter + 1, nextAS, nextAddresses)
      }
    }

    loop(1, None, Set.empty) foreach { as ⇒ TestKit.shutdownActorSystem(as) }
    within(loopDuration) {
      runOn(usedRoles: _*) {
        awaitMembersUp(nbrUsedRoles, timeout = remaining)
        awaitAllReachable()
        phiObserver ! Reset
        statsObserver ! Reset
      }
    }
    enterBarrier("join-remove-shutdown-" + step)
  }

  def masterName: String = "master-" + myself.name

  def master: Option[ActorRef] = {
    system.actorSelection("/user/" + masterName).tell(Identify("master"), identifyProbe.ref)
    identifyProbe.expectMsgType[ActorIdentity].ref
  }

  def exerciseRouters(title: String, duration: FiniteDuration, batchInterval: FiniteDuration,
                      expectDroppedMessages: Boolean, tree: Boolean): Unit =
    within(duration + 10.seconds) {
      nbrUsedRoles must be(totalNumberOfNodes)
      createResultAggregator(title, expectedResults = nbrUsedRoles, includeInHistory = false)

      val (masterRoles, otherRoles) = roles.take(nbrUsedRoles).splitAt(3)
      runOn(masterRoles: _*) {
        reportResult {
          val m = system.actorOf(Props(classOf[Master], settings, batchInterval, tree).withDeploy(Deploy.local),
            name = "master-" + myself.name)
          m ! Begin
          import system.dispatcher
          system.scheduler.scheduleOnce(duration) {
            m.tell(End, testActor)
          }
          val workResult = awaitWorkResult
          workResult.sendCount must be > (0L)
          workResult.ackCount must be > (0L)
          if (!expectDroppedMessages)
            workResult.retryCount must be(0)

          enterBarrier("routers-done-" + step)
        }
      }
      runOn(otherRoles: _*) {
        reportResult {
          enterBarrier("routers-done-" + step)
        }
      }

      awaitClusterResult()
    }

  def awaitWorkResult: WorkResult = {
    val workResult = expectMsgType[WorkResult]
    if (settings.infolog)
      log.info("{} result, [{}] jobs/s, retried [{}] of [{}] msg", masterName,
        workResult.jobsPerSecond.form,
        workResult.retryCount, workResult.sendCount)
    master match {
      case Some(m) ⇒
        watch(m)
        expectMsgPF(remaining) { case Terminated(a) if a.path == m.path ⇒ true }
      case None ⇒ // ok, already terminated
    }
    workResult
  }

  def exerciseSupervision(title: String, duration: FiniteDuration, oneIteration: Duration): Unit =
    within(duration + 10.seconds) {
      val rounds = (duration.toMillis / oneIteration.toMillis).max(1).toInt
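      // with the defaults (supervision-duration = 10 s, supervision-one-iteration = 2.5 s)
      // this gives 10000 / 2500 = 4 rounds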
      val supervisor = system.actorOf(Props[Supervisor], "supervisor")
      for (count ← 0 until rounds) {
        createResultAggregator(title, expectedResults = nbrUsedRoles, includeInHistory = false)

        val (masterRoles, otherRoles) = roles.take(nbrUsedRoles).splitAt(3)
        runOn(masterRoles: _*) {
          reportResult {
            roles.take(nbrUsedRoles) foreach { r ⇒
              supervisor ! Props[RemoteChild].withDeploy(Deploy(scope = RemoteScope(address(r))))
            }
            supervisor ! GetChildrenCount
            expectMsgType[ChildrenCount] must be(ChildrenCount(nbrUsedRoles, 0))

            1 to 5 foreach { _ ⇒ supervisor ! new RuntimeException("Simulated exception") }
            awaitAssert {
              supervisor ! GetChildrenCount
              val c = expectMsgType[ChildrenCount]
              c must be(ChildrenCount(nbrUsedRoles, 5 * nbrUsedRoles))
            }

            // after 5 restart attempts the children should be stopped
            supervisor ! new RuntimeException("Simulated exception")
            awaitAssert {
              supervisor ! GetChildrenCount
              val c = expectMsgType[ChildrenCount]
              // zero children
              c must be(ChildrenCount(0, 6 * nbrUsedRoles))
            }
            supervisor ! Reset
          }
          enterBarrier("supervision-done-" + step)
        }

        runOn(otherRoles: _*) {
          reportResult {
            enterBarrier("supervision-done-" + step)
          }
        }

        awaitClusterResult()
        step += 1
      }
    }

  def idleGossip(title: String): Unit = {
    createResultAggregator(title, expectedResults = nbrUsedRoles, includeInHistory = true)
    reportResult {
      clusterView.members.size must be(nbrUsedRoles)
      Thread.sleep(idleGossipDuration.toMillis)
      clusterView.members.size must be(nbrUsedRoles)
    }
    awaitClusterResult()
  }

  "A cluster under stress" must {

    "log settings" taggedAs LongRunningTest in {
      if (infolog) {
        log.info("StressSpec JVM:\n{}", jvmInfo)
        runOn(roles.head) {
          log.info("StressSpec settings:\n{}", settings)
        }
      }
      enterBarrier("after-" + step)
    }

    "join seed nodes" taggedAs LongRunningTest in within(30 seconds) {

      val otherNodesJoiningSeedNodes = roles.slice(numberOfSeedNodes, numberOfSeedNodes + numberOfNodesJoiningToSeedNodesInitially)
      val size = seedNodes.size + otherNodesJoiningSeedNodes.size

      createResultAggregator("join seed nodes", expectedResults = size, includeInHistory = true)

      runOn((seedNodes ++ otherNodesJoiningSeedNodes): _*) {
        reportResult {
          cluster.joinSeedNodes(seedNodes.toIndexedSeq map address)
          awaitMembersUp(size, timeout = remaining)
        }
      }

      awaitClusterResult()

      nbrUsedRoles += size
      enterBarrier("after-" + step)
    }

    "start routers that are running while nodes are joining" taggedAs LongRunningTest in {
      runOn(roles.take(3): _*) {
        system.actorOf(Props(classOf[Master], settings, settings.workBatchInterval, false).withDeploy(Deploy.local),
          name = "master-" + myself.name) ! Begin
      }
    }

    "join nodes one-by-one to small cluster" taggedAs LongRunningTest in {
      joinOneByOne(numberOfNodesJoiningOneByOneSmall)
      enterBarrier("after-" + step)
    }

    "join several nodes to one node" taggedAs LongRunningTest in {
      joinSeveral(numberOfNodesJoiningToOneNode, toSeedNodes = false)
      nbrUsedRoles += numberOfNodesJoiningToOneNode
      enterBarrier("after-" + step)
    }

    "join several nodes to seed nodes" taggedAs LongRunningTest in {
      if (numberOfNodesJoiningToSeedNodes > 0) {
        joinSeveral(numberOfNodesJoiningToSeedNodes, toSeedNodes = true)
        nbrUsedRoles += numberOfNodesJoiningToSeedNodes
      }
      enterBarrier("after-" + step)
    }

    "join nodes one-by-one to large cluster" taggedAs LongRunningTest in {
      joinOneByOne(numberOfNodesJoiningOneByOneLarge)
      enterBarrier("after-" + step)
    }

    "end routers that are running while nodes are joining" taggedAs LongRunningTest in within(30.seconds) {
      if (exerciseActors) {
        runOn(roles.take(3): _*) {
          val m = master
          m must not be (None)
          m.get.tell(End, testActor)
          val workResult = awaitWorkResult
          workResult.retryCount must be(0)
          workResult.sendCount must be > (0L)
          workResult.ackCount must be > (0L)
        }
      }
      enterBarrier("after-" + step)
    }
"use routers with normal throughput" taggedAs LongRunningTest in {
2013-04-23 15:05:27 +02:00
if ( exerciseActors ) {
exerciseRouters ( "use routers with normal throughput" , normalThroughputDuration ,
batchInterval = workBatchInterval , expectDroppedMessages = false , tree = false )
}
2013-05-02 20:23:14 +02:00
enterBarrier ( "after-" + step )
2012-12-12 11:49:20 +01:00
}
"use routers with high throughput" taggedAs LongRunningTest in {
2013-04-23 15:05:27 +02:00
if ( exerciseActors ) {
exerciseRouters ( "use routers with high throughput" , highThroughputDuration ,
batchInterval = Duration . Zero , expectDroppedMessages = false , tree = false )
}
2013-05-02 20:23:14 +02:00
enterBarrier ( "after-" + step )
2012-12-12 11:49:20 +01:00
}
"use many actors with normal throughput" taggedAs LongRunningTest in {
2013-04-23 15:05:27 +02:00
if ( exerciseActors ) {
exerciseRouters ( "use many actors with normal throughput" , normalThroughputDuration ,
batchInterval = workBatchInterval , expectDroppedMessages = false , tree = true )
}
2013-05-02 20:23:14 +02:00
enterBarrier ( "after-" + step )
2012-12-12 11:49:20 +01:00
}
"use many actors with high throughput" taggedAs LongRunningTest in {
2013-04-23 15:05:27 +02:00
if ( exerciseActors ) {
exerciseRouters ( "use many actors with high throughput" , highThroughputDuration ,
batchInterval = Duration . Zero , expectDroppedMessages = false , tree = true )
}
2013-05-02 20:23:14 +02:00
enterBarrier ( "after-" + step )
2012-12-12 11:49:20 +01:00
}
2013-02-26 08:14:09 +01:00
"exercise join/remove/join/remove" taggedAs LongRunningTest in {
exerciseJoinRemove ( "exercise join/remove" , joinRemoveDuration )
2012-12-12 11:49:20 +01:00
enterBarrier ( "after-" + step )
}
2013-02-26 08:14:09 +01:00
"exercise supervision" taggedAs LongRunningTest in {
2013-04-23 15:05:27 +02:00
if ( exerciseActors ) {
exerciseSupervision ( "exercise supervision" , supervisionDuration , supervisionOneIteration )
}
2013-05-02 20:23:14 +02:00
enterBarrier ( "after-" + step )
2013-04-23 15:05:27 +02:00
}
"gossip when idle" taggedAs LongRunningTest in {
idleGossip ( "idle gossip" )
2012-12-12 11:49:20 +01:00
enterBarrier ( "after-" + step )
}
"start routers that are running while nodes are removed" taggedAs LongRunningTest in {
2013-04-23 15:05:27 +02:00
if ( exerciseActors ) {
runOn ( roles . take ( 3 ) : _ * ) {
2013-05-29 16:13:10 +02:00
system . actorOf ( Props ( classOf [ Master ] , settings , settings . workBatchInterval , false ) . withDeploy ( Deploy . local ) ,
2013-04-23 15:05:27 +02:00
name = "master-" + myself . name ) ! Begin
}
2012-12-12 11:49:20 +01:00
}
2013-05-02 20:23:14 +02:00
enterBarrier ( "after-" + step )
2012-12-12 11:49:20 +01:00
}
"leave nodes one-by-one from large cluster" taggedAs LongRunningTest in {
removeOneByOne ( numberOfNodesLeavingOneByOneLarge , shutdown = false )
enterBarrier ( "after-" + step )
}
"shutdown nodes one-by-one from large cluster" taggedAs LongRunningTest in {
removeOneByOne ( numberOfNodesShutdownOneByOneLarge , shutdown = true )
enterBarrier ( "after-" + step )
}
"leave several nodes" taggedAs LongRunningTest in {
removeSeveral ( numberOfNodesLeaving , shutdown = false )
nbrUsedRoles -= numberOfNodesLeaving
enterBarrier ( "after-" + step )
}
"shutdown several nodes" taggedAs LongRunningTest in {
removeSeveral ( numberOfNodesShutdown , shutdown = true )
nbrUsedRoles -= numberOfNodesShutdown
enterBarrier ( "after-" + step )
}
2013-04-23 15:05:27 +02:00
"shutdown nodes one-by-one from small cluster" taggedAs LongRunningTest in {
removeOneByOne ( numberOfNodesShutdownOneByOneSmall , shutdown = true )
2012-12-12 11:49:20 +01:00
enterBarrier ( "after-" + step )
}
2013-04-23 15:05:27 +02:00
"leave nodes one-by-one from small cluster" taggedAs LongRunningTest in {
removeOneByOne ( numberOfNodesLeavingOneByOneSmall , shutdown = false )
2012-12-12 11:49:20 +01:00
enterBarrier ( "after-" + step )
}
"end routers that are running while nodes are removed" taggedAs LongRunningTest in within ( 30. seconds ) {
2013-04-23 15:05:27 +02:00
if ( exerciseActors ) {
runOn ( roles . take ( 3 ) : _ * ) {
val m = master
m must not be ( None )
m . get . tell ( End , testActor )
val workResult = awaitWorkResult
workResult . sendCount must be > ( 0L )
workResult . ackCount must be > ( 0L )
}
}
2013-05-02 20:23:14 +02:00
enterBarrier ( "after-" + step )
2013-04-23 15:05:27 +02:00
}
"log jvm info" taggedAs LongRunningTest in {
if ( infolog ) {
log . info ( "StressSpec JVM:\n{}" , jvmInfo )
2012-12-12 11:49:20 +01:00
}
2013-05-02 20:23:14 +02:00
enterBarrier ( "after-" + step )
2012-12-12 11:49:20 +01:00
}
}
}