recovery timeout for persistent actors #20698
This commit is contained in:
parent
32e72f8208
commit
896ea53dd3
6 changed files with 161 additions and 44 deletions
|
|
@ -65,11 +65,13 @@ abstract class QuickRestartSpec
|
|||
runOn(second) {
|
||||
restartingSystem =
|
||||
if (restartingSystem == null)
|
||||
ActorSystem(system.name,
|
||||
ActorSystem(
|
||||
system.name,
|
||||
ConfigFactory.parseString(s"akka.cluster.roles = [round-$n]")
|
||||
.withFallback(system.settings.config))
|
||||
else
|
||||
ActorSystem(system.name,
|
||||
ActorSystem(
|
||||
system.name,
|
||||
ConfigFactory.parseString(s"""
|
||||
akka.cluster.roles = [round-$n]
|
||||
akka.remote.netty.tcp.port = ${Cluster(restartingSystem).selfAddress.port.get}""") // same port
|
||||
|
|
|
|||
|
|
@ -114,6 +114,12 @@ akka.persistence {
|
|||
# as it has accumulated since the last write.
|
||||
max-message-batch-size = 200
|
||||
|
||||
# If there is more time in between individual events gotten from the journal
|
||||
# recovery than this the recovery will fail.
|
||||
# Note that it also affects reading the snapshot before replaying events on
|
||||
# top of it, even though it is configured for the journal.
|
||||
recovery-event-timeout = 30s
|
||||
|
||||
circuit-breaker {
|
||||
max-failures = 10
|
||||
call-timeout = 10s
|
||||
|
|
|
|||
|
|
@ -9,11 +9,13 @@ import java.util.UUID
|
|||
|
||||
import scala.collection.immutable
|
||||
import scala.util.control.NonFatal
|
||||
import akka.actor.DeadLetter
|
||||
import akka.actor.StashOverflowException
|
||||
import akka.actor.{ DeadLetter, ReceiveTimeout, StashOverflowException }
|
||||
import akka.util.Helpers.ConfigOps
|
||||
import akka.event.Logging
|
||||
import akka.event.LoggingAdapter
|
||||
|
||||
import scala.concurrent.duration.{ Duration, FiniteDuration }
|
||||
|
||||
/**
|
||||
* INTERNAL API
|
||||
*/
|
||||
|
|
@ -461,6 +463,10 @@ private[persistence] trait Eventsourced extends Snapshotter with PersistenceStas
|
|||
*/
|
||||
private def recoveryStarted(replayMax: Long) = new State {
|
||||
|
||||
// protect against replay stalling forever because of journal overloaded and such
|
||||
private val previousRecieveTimeout = context.receiveTimeout
|
||||
context.setReceiveTimeout(extension.journalConfigFor(journalPluginId).getMillisDuration("recovery-event-timeout"))
|
||||
|
||||
private val recoveryBehavior: Receive = {
|
||||
val _receiveRecover = receiveRecover
|
||||
|
||||
|
|
@ -471,6 +477,7 @@ private[persistence] trait Eventsourced extends Snapshotter with PersistenceStas
|
|||
_receiveRecover(s)
|
||||
case RecoveryCompleted if _receiveRecover.isDefinedAt(RecoveryCompleted) ⇒
|
||||
_receiveRecover(RecoveryCompleted)
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -485,8 +492,13 @@ private[persistence] trait Eventsourced extends Snapshotter with PersistenceStas
|
|||
// Since we are recovering we can ignore the receive behavior from the stack
|
||||
Eventsourced.super.aroundReceive(recoveryBehavior, SnapshotOffer(metadata, snapshot))
|
||||
}
|
||||
changeState(recovering(recoveryBehavior))
|
||||
changeState(recovering(recoveryBehavior, previousRecieveTimeout))
|
||||
journal ! ReplayMessages(lastSequenceNr + 1L, toSnr, replayMax, persistenceId, self)
|
||||
case ReceiveTimeout ⇒
|
||||
try onRecoveryFailure(
|
||||
new RecoveryTimedOut(s"Recovery timed out, didn't get snapshot within ${context.receiveTimeout.toSeconds}s"),
|
||||
event = None)
|
||||
finally context.stop(self)
|
||||
case other ⇒
|
||||
stashInternally(other)
|
||||
}
|
||||
|
|
@ -502,8 +514,10 @@ private[persistence] trait Eventsourced extends Snapshotter with PersistenceStas
|
|||
*
|
||||
* All incoming messages are stashed.
|
||||
*/
|
||||
private def recovering(recoveryBehavior: Receive) = new State {
|
||||
private def recovering(recoveryBehavior: Receive, previousReceiveTimeout: Duration) =
|
||||
new State {
|
||||
override def toString: String = "replay started"
|
||||
|
||||
override def recoveryRunning: Boolean = true
|
||||
|
||||
override def stateReceive(receive: Receive, message: Any) = message match {
|
||||
|
|
@ -516,6 +530,7 @@ private[persistence] trait Eventsourced extends Snapshotter with PersistenceStas
|
|||
try onRecoveryFailure(t, Some(p.payload)) finally context.stop(self)
|
||||
}
|
||||
case RecoverySuccess(highestSeqNr) ⇒
|
||||
resetRecieveTimeout()
|
||||
onReplaySuccess() // callback for subclass implementation
|
||||
changeState(processingCommands)
|
||||
sequenceNr = highestSeqNr
|
||||
|
|
@ -523,10 +538,20 @@ private[persistence] trait Eventsourced extends Snapshotter with PersistenceStas
|
|||
internalStash.unstashAll()
|
||||
Eventsourced.super.aroundReceive(recoveryBehavior, RecoveryCompleted)
|
||||
case ReplayMessagesFailure(cause) ⇒
|
||||
resetRecieveTimeout()
|
||||
try onRecoveryFailure(cause, event = None) finally context.stop(self)
|
||||
case ReceiveTimeout ⇒
|
||||
try onRecoveryFailure(
|
||||
new RecoveryTimedOut(s"Recovery timed out, didn't get event within ${context.receiveTimeout.toSeconds}s, highest sequence number seen ${sequenceNr}"),
|
||||
event = None)
|
||||
finally context.stop(self)
|
||||
case other ⇒
|
||||
stashInternally(other)
|
||||
}
|
||||
|
||||
private def resetRecieveTimeout(): Unit = {
|
||||
context.setReceiveTimeout(previousReceiveTimeout)
|
||||
}
|
||||
}
|
||||
|
||||
private def flushBatch() {
|
||||
|
|
|
|||
|
|
@ -6,15 +6,18 @@ package akka.persistence
|
|||
|
||||
import java.util.concurrent.atomic.AtomicReference
|
||||
import java.util.function.Consumer
|
||||
|
||||
import akka.actor._
|
||||
import akka.event.{ Logging, LoggingAdapter }
|
||||
import akka.persistence.journal.{ EventAdapters, IdentityEventAdapters }
|
||||
import akka.util.Collections.EmptyImmutableSeq
|
||||
import akka.util.Helpers.ConfigOps
|
||||
import com.typesafe.config.Config
|
||||
|
||||
import scala.annotation.tailrec
|
||||
import scala.concurrent.duration._
|
||||
import akka.util.Reflect
|
||||
|
||||
import scala.util.control.NonFatal
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -4,11 +4,14 @@
|
|||
package akka.persistence
|
||||
|
||||
import java.lang.{ Iterable ⇒ JIterable }
|
||||
|
||||
import akka.actor._
|
||||
import akka.japi.Procedure
|
||||
import akka.japi.Util
|
||||
import com.typesafe.config.Config
|
||||
|
||||
import scala.util.control.NoStackTrace
|
||||
|
||||
abstract class RecoveryCompleted
|
||||
/**
|
||||
* Sent to a [[PersistentActor]] when the journal replay has been finished.
|
||||
|
|
@ -98,6 +101,8 @@ object Recovery {
|
|||
val none: Recovery = Recovery(toSequenceNr = 0L)
|
||||
}
|
||||
|
||||
final class RecoveryTimedOut(message: String) extends RuntimeException(message) with NoStackTrace
|
||||
|
||||
/**
|
||||
* This defines how to handle the current received message which failed to stash, when the size of
|
||||
* Stash exceeding the capacity of Stash.
|
||||
|
|
|
|||
|
|
@ -0,0 +1,76 @@
|
|||
package akka.persistence
|
||||
|
||||
import akka.actor.Status.Failure
|
||||
import akka.actor.{ Actor, ActorRef, Props }
|
||||
import akka.persistence.journal.SteppingInmemJournal
|
||||
import akka.testkit.{ AkkaSpec, ImplicitSender, TestProbe }
|
||||
import com.typesafe.config.ConfigFactory
|
||||
|
||||
import scala.concurrent.duration._
|
||||
|
||||
object PersistentActorRecoveryTimeoutSpec {
|
||||
val journalId = "persistent-actor-recovery-timeout-spec"
|
||||
|
||||
def config =
|
||||
SteppingInmemJournal.config(PersistentActorRecoveryTimeoutSpec.journalId).withFallback(
|
||||
ConfigFactory.parseString(
|
||||
"""
|
||||
|akka.persistence.journal.stepping-inmem.recovery-event-timeout=100ms
|
||||
""".stripMargin)).withFallback(PersistenceSpec.config("stepping-inmem", "PersistentActorRecoveryTimeoutSpec"))
|
||||
|
||||
class TestActor(probe: ActorRef) extends NamedPersistentActor("recovery-timeout-actor") {
|
||||
override def receiveRecover: Receive = Actor.emptyBehavior
|
||||
|
||||
override def receiveCommand: Receive = {
|
||||
case x ⇒ persist(x) { _ ⇒
|
||||
sender() ! x
|
||||
}
|
||||
}
|
||||
|
||||
override protected def onRecoveryFailure(cause: Throwable, event: Option[Any]): Unit = {
|
||||
probe ! Failure(cause)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
class PersistentActorRecoveryTimeoutSpec extends AkkaSpec(PersistentActorRecoveryTimeoutSpec.config) with ImplicitSender {
|
||||
|
||||
import PersistentActorRecoveryTimeoutSpec.journalId
|
||||
|
||||
"The recovery timeout" should {
|
||||
|
||||
"fail recovery if timeout is not met when recovering" in {
|
||||
val probe = TestProbe()
|
||||
val persisting = system.actorOf(Props(classOf[PersistentActorRecoveryTimeoutSpec.TestActor], probe.ref))
|
||||
|
||||
awaitAssert(SteppingInmemJournal.getRef(journalId), 3.seconds)
|
||||
val journal = SteppingInmemJournal.getRef(journalId)
|
||||
|
||||
// initial read highest
|
||||
SteppingInmemJournal.step(journal)
|
||||
|
||||
persisting ! "A"
|
||||
SteppingInmemJournal.step(journal)
|
||||
expectMsg("A")
|
||||
|
||||
watch(persisting)
|
||||
system.stop(persisting)
|
||||
expectTerminated(persisting)
|
||||
|
||||
// now replay, but don't give the journal any tokens to replay events
|
||||
// so that we cause the timeout to trigger
|
||||
val replaying = system.actorOf(Props(classOf[PersistentActorRecoveryTimeoutSpec.TestActor], probe.ref))
|
||||
watch(replaying)
|
||||
|
||||
// initial read highest
|
||||
SteppingInmemJournal.step(journal)
|
||||
|
||||
probe.expectMsgType[Failure].cause shouldBe a[RecoveryTimedOut]
|
||||
expectTerminated(replaying)
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue