=clu #18577 Option to stop cluster client after no receptionist contact timeout

2016-01-07 19:54:55 +01:00 · 2016-01-07 19:54:55 +01:00 · 19c7017482
commit 19c7017482
parent 673174a64a
6 changed files with 227 additions and 26 deletions
--- a/akka-cluster-tools/src/main/resources/reference.conf
+++ b/akka-cluster-tools/src/main/resources/reference.conf
@ -112,6 +112,15 @@ akka.cluster.client {
  # immediately if the location of the singleton is unknown.
  # Maximum allowed buffer size is 10000.
  buffer-size = 1000
  # If connection to the receptionist is lost and the client has not been
  # able to acquire a new connection for this long the client will stop itself.
  # This duration makes it possible to watch the cluster client and react on a more permanent
  # loss of connection with the cluster, for example by accessing some kind of
  # service registry for an updated set of initial contacts to start a new cluster client with.
  # If this is not wanted it can be set to "off" to disable the timeout and retry
  # forever.
  reconnect-timeout = off
 }
 # //#cluster-client-config
--- a/akka-cluster-tools/src/main/scala/akka/cluster/client/ClusterClient.scala
+++ b/akka-cluster-tools/src/main/scala/akka/cluster/client/ClusterClient.scala
@ -58,7 +58,11 @@ object ClusterClientSettings {
      refreshContactsInterval = config.getDuration("refresh-contacts-interval", MILLISECONDS).millis,
      heartbeatInterval = config.getDuration("heartbeat-interval", MILLISECONDS).millis,
      acceptableHeartbeatPause = config.getDuration("acceptable-heartbeat-pause", MILLISECONDS).millis,
-      bufferSize = config.getInt("buffer-size"))
+      bufferSize = config.getInt("buffer-size"),
      reconnectTimeout = config.getString("reconnect-timeout") match {
        case "off" ⇒ None
        case _     ⇒ Some(config.getDuration("reconnect-timeout", MILLISECONDS).millis)
      })
  }
  /**
@ -96,6 +100,10 @@ object ClusterClientSettings {
 *   When the buffer is full old messages will be dropped when new messages are sent via the
 *   client. Use 0 to disable buffering, i.e. messages will be dropped immediately if the
 *   location of the receptionist is unavailable.
 * @param reconnectTimeout If the connection to the receptionist is lost and cannot
 *   be re-established within this duration the cluster client will be stopped. This makes it possible
 *   to watch it from another actor and possibly acquire a new list of initialContacts from some
 *   external service registry
 */
 final class ClusterClientSettings(
  val initialContacts: Set[ActorPath],
@ -103,10 +111,24 @@ final class ClusterClientSettings(
  val refreshContactsInterval: FiniteDuration,
  val heartbeatInterval: FiniteDuration,
  val acceptableHeartbeatPause: FiniteDuration,
-  val bufferSize: Int) extends NoSerializationVerificationNeeded {
+  val bufferSize: Int,
  val reconnectTimeout: Option[FiniteDuration]) extends NoSerializationVerificationNeeded {
  require(bufferSize >= 0 && bufferSize <= 10000, "bufferSize must be >= 0 and <= 10000")
  /**
   * For binary/source compatibility: keeps the six-parameter constructor (the one
   * without `reconnectTimeout`) available. It delegates to the primary constructor
   * and passes `None` as `reconnectTimeout`.
   */
  def this(
    initialContacts: Set[ActorPath],
    establishingGetContactsInterval: FiniteDuration,
    refreshContactsInterval: FiniteDuration,
    heartbeatInterval: FiniteDuration,
    acceptableHeartbeatPause: FiniteDuration,
    bufferSize: Int) =
    this(initialContacts, establishingGetContactsInterval, refreshContactsInterval, heartbeatInterval,
      acceptableHeartbeatPause, bufferSize, None)
  /**
   * Scala API
   */
@ -135,15 +157,19 @@ final class ClusterClientSettings(
  /**
   * Returns a copy of these settings with `bufferSize` replaced. The copy is built through
   * the class constructor, which requires `bufferSize` to be >= 0 and <= 10000.
   */
  def withBufferSize(bufferSize: Int): ClusterClientSettings =
    copy(bufferSize = bufferSize)
  /**
   * Returns a copy of these settings with `reconnectTimeout` replaced.
   *
   * @param reconnectTimeout `Some(duration)` makes the client schedule a `ReconnectTimeout`
   *   message to itself when it starts establishing contact with a receptionist, and stop
   *   itself if that message arrives before contact is made; `None` schedules nothing.
   */
  def withReconnectTimeout(reconnectTimeout: Option[FiniteDuration]): ClusterClientSettings =
    copy(reconnectTimeout = reconnectTimeout)
  // Builds a new ClusterClientSettings in which every parameter defaults to the current
  // value, so each `withXxx` method only has to name the single field it changes.
  private def copy(
    initialContacts: Set[ActorPath] = initialContacts,
    establishingGetContactsInterval: FiniteDuration = establishingGetContactsInterval,
    refreshContactsInterval: FiniteDuration = refreshContactsInterval,
    heartbeatInterval: FiniteDuration = heartbeatInterval,
    acceptableHeartbeatPause: FiniteDuration = acceptableHeartbeatPause,
-    bufferSize: Int = bufferSize): ClusterClientSettings =
+    bufferSize: Int = bufferSize,
    reconnectTimeout: Option[FiniteDuration] = reconnectTimeout): ClusterClientSettings =
    new ClusterClientSettings(initialContacts, establishingGetContactsInterval, refreshContactsInterval,
-      heartbeatInterval, acceptableHeartbeatPause, bufferSize)
+      heartbeatInterval, acceptableHeartbeatPause, bufferSize, reconnectTimeout)
 }
 object ClusterClient {
@ -172,6 +198,7 @@ object ClusterClient {
  // Messages the cluster client uses internally (visibility restricted to the akka package).
  private[akka] object Internal {
    // NOTE(review): presumably a periodic self-message that triggers a contact refresh;
    // its handler is not visible in this excerpt — confirm.
    case object RefreshContactsTick
    // Handled by the client while establishing; NOTE(review): the handler body is not
    // visible in this excerpt — confirm what it does on each tick.
    case object HeartbeatTick
    // Scheduled once to `self` with the configured `reconnectTimeout` when the client enters
    // the establishing behavior; on receipt there the client logs a warning and stops itself.
    case object ReconnectTimeout
  }
 }
@ -257,6 +284,11 @@ final class ClusterClient(settings: ClusterClientSettings) extends Actor with Ac
  def receive = establishing
  def establishing: Actor.Receive = {
    val connectTimerCancelable = settings.reconnectTimeout.map { timeout ⇒
      context.system.scheduler.scheduleOnce(timeout, self, ReconnectTimeout)
    }
    {
      case Contacts(contactPoints) ⇒
        if (contactPoints.nonEmpty) {
          contacts = contactPoints.map(context.actorSelection)
@ -267,6 +299,7 @@ final class ClusterClient(settings: ClusterClientSettings) extends Actor with Ac
        scheduleRefreshContactsTick(refreshContactsInterval)
        sendBuffered(receptionist)
        context.become(active(receptionist))
        connectTimerCancelable.foreach(_.cancel())
        failureDetector.heartbeat()
      case ActorIdentity(_, None) ⇒ // ok, use another instead
      case HeartbeatTick ⇒
@ -278,6 +311,10 @@ final class ClusterClient(settings: ClusterClientSettings) extends Actor with Ac
        buffer(DistributedPubSubMediator.SendToAll(path, msg))
      case Publish(topic, msg) ⇒
        buffer(DistributedPubSubMediator.Publish(topic, msg))
      case ReconnectTimeout ⇒
        log.warning("Receptionist reconnect not successful within {} stopping cluster client", settings.reconnectTimeout)
        context.stop(self)
    }
  }
  def active(receptionist: ActorRef): Actor.Receive = {
--- a/akka-cluster-tools/src/multi-jvm/scala/akka/cluster/client/ClusterClientStopSpec.scala
+++ b/akka-cluster-tools/src/multi-jvm/scala/akka/cluster/client/ClusterClientStopSpec.scala
@ -0,0 +1,111 @@
 /**
 * Copyright (C) 2009-2016 Typesafe Inc. <http://www.typesafe.com>
 */
 package akka.cluster.client
 import akka.actor.{ Actor, Props, Terminated }
 import akka.cluster.Cluster
 import akka.cluster.pubsub.{ DistributedPubSub, DistributedPubSubMediator }
 import akka.remote.testconductor.RoleName
 import akka.remote.testkit.{ STMultiNodeSpec, MultiNodeSpec, MultiNodeConfig }
 import akka.testkit.{ EventFilter, ImplicitSender }
 import com.typesafe.config.ConfigFactory
 import scala.concurrent.Await
 import scala.concurrent.duration._
 /**
  * Multi-node configuration for [[ClusterClientStopSpec]]: three roles (one client and two
  * cluster nodes) sharing a config that sets `reconnect-timeout = 3s` for the cluster client.
  */
 object ClusterClientStopSpec extends MultiNodeConfig {
  // Roles are registered in this order; keep it stable.
  val client: RoleName = role("client")
  val first: RoleName = role("first")
  val second: RoleName = role("second")

  commonConfig(ConfigFactory.parseString("""
    akka.loglevel = INFO
    akka.actor.provider = "akka.cluster.ClusterActorRefProvider"
    akka.remote.log-remote-lifecycle-events = off
    akka.cluster.client {
      heartbeat-interval = 1s
      acceptable-heartbeat-pause = 1s
      reconnect-timeout = 3s
      receptionist.number-of-contacts = 1
    }
    akka.test.filter-leeway = 10s
  """))

  /** Echo actor: replies to the sender with whatever message it received. */
  class Service extends Actor {
    override def receive: Receive = {
      case message ⇒ sender() ! message
    }
  }
 }
 // One concrete spec class per participating node (three roles are declared and
 // `initialParticipants` is 3).
 // NOTE(review): the `MultiJvmNodeN` suffix is presumably what the multi-JVM test runner
 // uses to start each class in its own JVM — confirm against the build setup.
 class ClusterClientStopMultiJvmNode1 extends ClusterClientStopSpec
 class ClusterClientStopMultiJvmNode2 extends ClusterClientStopSpec
 class ClusterClientStopMultiJvmNode3 extends ClusterClientStopSpec
 /**
  * Verifies that a cluster client configured with a `reconnect-timeout` stops itself, and
  * logs a warning, when it cannot re-establish contact with a receptionist in time.
  *
  * Two nodes (`first`, `second`) form the cluster and run receptionists; the `client` node
  * talks to them through a `ClusterClient` and then watches it terminate once both cluster
  * nodes have shut down their actor systems.
  */
 class ClusterClientStopSpec extends MultiNodeSpec(ClusterClientStopSpec) with STMultiNodeSpec with ImplicitSender {
  import ClusterClientStopSpec._

  override def initialParticipants: Int = 3

  /** On node `from`: joins the cluster at `to` and starts the receptionist; then all nodes sync on a barrier. */
  def join(from: RoleName, to: RoleName): Unit = {
    runOn(from) {
      Cluster(system) join node(to).address
      ClusterClientReceptionist(system)
    }
    enterBarrier(from.name + "-joined")
  }

  /** Retries until the pub-sub mediator answers a `Count` request with `expected`. */
  def awaitCount(expected: Int): Unit = {
    awaitAssert {
      DistributedPubSub(system).mediator ! DistributedPubSubMediator.Count
      expectMsgType[Int] should ===(expected)
    }
  }

  /** Receptionist paths on the two cluster nodes, used as the client's initial contacts. */
  def initialContacts = Set(first, second).map { r ⇒
    node(r) / "system" / "receptionist"
  }

  "A Cluster Client" should {

    "startup cluster" in within(30.seconds) {
      join(first, first)
      join(second, first)
      runOn(first) {
        val service = system.actorOf(Props(classOf[Service]), "testService")
        ClusterClientReceptionist(system).registerService(service)
      }
      runOn(first, second) {
        awaitCount(1)
      }

      enterBarrier("cluster-started")
    }

    "stop if re-establish fails for too long time" in within(20.seconds) {
      runOn(client) {
        val c = system.actorOf(ClusterClient.props(
          ClusterClientSettings(system).withInitialContacts(initialContacts)), "client1")
        c ! ClusterClient.Send("/user/testService", "hello", localAffinity = true)
        expectMsgType[String](3.seconds) should be("hello")
        enterBarrier("was-in-contact")

        watch(c)

        // An EventFilter only checks anything when code runs inside `intercept`; merely
        // constructing it (as before) is a no-op. The warning is logged just before the
        // client stops, so it must be observed while we wait for the termination.
        EventFilter.warning(start = "Receptionist reconnect not successful within", occurrences = 1).intercept {
          expectTerminated(c, 10.seconds)
        }
      }

      runOn(first, second) {
        enterBarrier("was-in-contact")
        // Take the whole cluster down so the client can no longer reach any receptionist.
        Await.ready(system.terminate(), 10.seconds)
      }
    }
  }
 }
--- a/akka-docs/rst/java/cluster-client.rst
+++ b/akka-docs/rst/java/cluster-client.rst
@ -146,3 +146,25 @@ a parameter to the ``ClusterClient.props`` factory method, i.e. each client can
 with different settings if needed.
 .. includecode:: ../../../akka-cluster-tools/src/main/resources/reference.conf#cluster-client-config
 Failure handling
 ----------------
 When the cluster client is started it must be provided with a list of initial contacts which are cluster
 nodes where receptionists are running. It will then repeatedly (with an interval configurable
 by ``establishing-get-contacts-interval``) try to contact those until it gets in contact with one of them.
 While running, the list of contacts is continuously updated with data from the receptionists (again, with an
 interval configurable with ``refresh-contacts-interval``), so that if there are more receptionists in the cluster
 than the initial contacts provided to the client the client will learn about them.
 While the client is running it will detect failures in its connection to the receptionist by heartbeats.
 If more than a configurable number of heartbeats are missed the client will try to reconnect to its known
 set of contacts to find a receptionist it can access.
 When the cluster cannot be reached at all
 -----------------------------------------
 It is possible to make the cluster client stop entirely if it cannot find a receptionist it can talk to
 within a configurable interval. This is configured with the ``reconnect-timeout``, which defaults to ``off``.
 This can be useful when initial contacts are provided from some kind of service registry, cluster node addresses
 are entirely dynamic and the entire cluster might shut down or crash and be restarted on new addresses. Since the
 client will be stopped in that case a monitoring actor can watch it and upon ``Terminated`` a new set of initial
 contacts can be fetched and a new cluster client started.
--- a/akka-docs/rst/scala/cluster-client.rst
+++ b/akka-docs/rst/scala/cluster-client.rst
@ -146,3 +146,25 @@ a parameter to the ``ClusterClient.props`` factory method, i.e. each client can
 with different settings if needed.
 .. includecode:: ../../../akka-cluster-tools/src/main/resources/reference.conf#cluster-client-config
 Failure handling
 ----------------
 When the cluster client is started it must be provided with a list of initial contacts which are cluster
 nodes where receptionists are running. It will then repeatedly (with an interval configurable
 by ``establishing-get-contacts-interval``) try to contact those until it gets in contact with one of them.
 While running, the list of contacts is continuously updated with data from the receptionists (again, with an
 interval configurable with ``refresh-contacts-interval``), so that if there are more receptionists in the cluster
 than the initial contacts provided to the client the client will learn about them.
 While the client is running it will detect failures in its connection to the receptionist by heartbeats.
 If more than a configurable number of heartbeats are missed the client will try to reconnect to its known
 set of contacts to find a receptionist it can access.
 When the cluster cannot be reached at all
 -----------------------------------------
 It is possible to make the cluster client stop entirely if it cannot find a receptionist it can talk to
 within a configurable interval. This is configured with the ``reconnect-timeout``, which defaults to ``off``.
 This can be useful when initial contacts are provided from some kind of service registry, cluster node addresses
 are entirely dynamic and the entire cluster might shut down or crash and be restarted on new addresses. Since the
 client will be stopped in that case a monitoring actor can watch it and upon ``Terminated`` a new set of initial
 contacts can be fetched and a new cluster client started.
--- a/akka-persistence-query/src/test/scala/akka/persistence/query/journal/leveldb/EventsByPersistenceIdSpec.scala
+++ b/akka-persistence-query/src/test/scala/akka/persistence/query/journal/leveldb/EventsByPersistenceIdSpec.scala
@ -120,7 +120,7 @@ class EventsByPersistenceIdSpec extends AkkaSpec(EventsByPersistenceIdSpec.confi
      expectMsg(s"${2L}-deleted")
      val src = queries.currentEventsByPersistenceId("h", 0L, Long.MaxValue)
-      src.map(_.event).runWith(TestSink.probe[Any]).request(1).expectNext("h-3") expectComplete()
+      src.map(_.event).runWith(TestSink.probe[Any]).request(1).expectNext("h-3").expectComplete()
    }
    "return empty stream for empty journal" in {