From d5fb35eee19321c50f486e473b92f9223e19c156 Mon Sep 17 00:00:00 2001 From: phaller Date: Thu, 31 May 2012 13:53:03 +0200 Subject: [PATCH 01/39] Correcting example in ScalaDoc for Stash --- akka-actor/src/main/scala/akka/actor/Stash.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/akka-actor/src/main/scala/akka/actor/Stash.scala b/akka-actor/src/main/scala/akka/actor/Stash.scala index 386bc0f070..2415b38618 100644 --- a/akka-actor/src/main/scala/akka/actor/Stash.scala +++ b/akka-actor/src/main/scala/akka/actor/Stash.scala @@ -15,7 +15,8 @@ import akka.AkkaException * class ActorWithProtocol extends Actor with Stash { * def receive = { * case "open" ⇒ - * unstashAll { + * unstashAll() + * context.become { * case "write" ⇒ // do writing... * case "close" ⇒ * unstashAll() From f385380cc277e40d323e086896251faa34f29d3b Mon Sep 17 00:00:00 2001 From: phaller Date: Tue, 12 Jun 2012 15:51:54 +0200 Subject: [PATCH 02/39] Adding Stash section in Actors docs. Including example added to ActorDocSpec. --- akka-docs/scala/actors.rst | 42 +++++++++++++++++++ .../scala/code/docs/actor/ActorDocSpec.scala | 20 +++++++++ 2 files changed, 62 insertions(+) diff --git a/akka-docs/scala/actors.rst b/akka-docs/scala/actors.rst index 4a556cf6c2..15ec6c9054 100644 --- a/akka-docs/scala/actors.rst +++ b/akka-docs/scala/actors.rst @@ -627,6 +627,48 @@ Here's how you use the ``unbecome`` method: } +Stash +===== + +The `Stash` trait enables an actor to temporarily stash away messages +that can not or should not be handled using the actor's current +behavior. Upon changing the actor's message handler, i.e., right +before invoking ``context.become`` or ``context.unbecome``, all +stashed messages can be "unstashed" by prepending them to the actor's +mailbox. This way, the stashed messages can be processed in the same +order as they have been received originally. + +.. warning:: + + Please note that the ``Stash`` can only be used together with actors + that have a deque-based mailbox. For this, configure the + ``mailbox-type`` of the dispatcher to be a deque-based mailbox, such as + ``akka.dispatch.UnboundedDequeBasedMailbox``. + +Here is an example of the ``Stash`` in action: + +.. includecode:: code/docs/actor/ActorDocSpec.scala#stash + +Invoking ``stash()`` adds the current message (the message that the +actor received last) to the actor's stash. It is typically invoked +when handling the default case in the actor's message handler to stash +messages that aren't handled by the other cases. It is illegal to +stash the same message twice; to do so results in a +``IllegalStateException`` being thrown. The stash may also be bounded +in which case invoking ``stash()`` may lead to a capacity violation, +which results in a ``StashOverflowException``. The capacity of the +stash can be configured using the ``stash-capacity`` setting (an ``Int``) of the +dispatcher's configuration. + +Invoking ``unstashAll()`` enqueues messages from the stash to the +actor's mailbox until the capacity of the mailbox (if any) has been +reached (note that messages from the stash are prepended to the +mailbox). In case a bounded mailbox overflows, a +``MessageQueueAppendFailedException`` is thrown. +The stash is guaranteed to be empty after calling ``unstashAll()``. 
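+
+For reference, a dispatcher with a deque-based mailbox could be declared
+roughly as follows (a sketch: the dispatcher name ``my-stash-dispatcher``
+is made up for illustration, only the ``mailbox-type`` and
+``stash-capacity`` settings are the ones named above)::
+
+  my-stash-dispatcher {
+    mailbox-type = "akka.dispatch.UnboundedDequeBasedMailbox"
+    stash-capacity = 1000
+  }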
+ + + Killing an Actor ================ diff --git a/akka-docs/scala/code/docs/actor/ActorDocSpec.scala b/akka-docs/scala/code/docs/actor/ActorDocSpec.scala index ee05e95d42..108aba33b2 100644 --- a/akka-docs/scala/code/docs/actor/ActorDocSpec.scala +++ b/akka-docs/scala/code/docs/actor/ActorDocSpec.scala @@ -300,6 +300,26 @@ class ActorDocSpec extends AkkaSpec(Map("akka.loglevel" -> "INFO")) { val actor = system.actorOf(Props(new HotSwapActor), name = "hot") } + "using Stash" in { + //#stash + import akka.actor.Stash + class ActorWithProtocol extends Actor with Stash { + def receive = { + case "open" ⇒ + unstashAll() + context.become { + case "write" ⇒ // do writing... + case "close" ⇒ + unstashAll() + context.unbecome() + case msg ⇒ stash() + } + case msg ⇒ stash() + } + } + //#stash + } + "using watch" in { //#watch import akka.actor.{ Actor, Props, Terminated } From 11eaac6d66f9757ad35e31cfbc105922ad5fe76e Mon Sep 17 00:00:00 2001 From: phaller Date: Fri, 22 Jun 2012 17:50:45 +0200 Subject: [PATCH 03/39] Stash: add warning about mix-in order --- akka-docs/scala/actors.rst | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/akka-docs/scala/actors.rst b/akka-docs/scala/actors.rst index 15ec6c9054..93d7b45678 100644 --- a/akka-docs/scala/actors.rst +++ b/akka-docs/scala/actors.rst @@ -634,7 +634,7 @@ The `Stash` trait enables an actor to temporarily stash away messages that can not or should not be handled using the actor's current behavior. Upon changing the actor's message handler, i.e., right before invoking ``context.become`` or ``context.unbecome``, all -stashed messages can be "unstashed" by prepending them to the actor's +stashed messages can be "unstashed", thereby prepending them to the actor's mailbox. This way, the stashed messages can be processed in the same order as they have been received originally. @@ -667,6 +667,12 @@ mailbox). In case a bounded mailbox overflows, a ``MessageQueueAppendFailedException`` is thrown. The stash is guaranteed to be empty after calling ``unstashAll()``. +.. warning:: + + Note that the ``Stash`` trait must be mixed into (a subclass of) the + ``Actor`` trait before any trait/class that overrides the ``preRestart`` + callback. This means it's not possible to write + ``Actor with MyActor with Stash`` if ``MyActor`` overrides ``preRestart``. 
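+
+For illustration, assuming a hypothetical trait ``MyActor`` that
+overrides ``preRestart``, the rule reads roughly as follows::
+
+  // legal according to the rule above: Stash comes first
+  class Works extends Actor with Stash with MyActor
+
+  // not possible: MyActor overrides preRestart before Stash is mixed in
+  // class Breaks extends Actor with MyActor with Stash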
Killing an Actor From e18d591647837dfd43aa9c4b73853edad9971e18 Mon Sep 17 00:00:00 2001 From: phaller Date: Fri, 22 Jun 2012 18:20:29 +0200 Subject: [PATCH 04/39] Stash: add Java docs --- .../docs/actor/UntypedActorDocTestBase.java | 29 +++++++++++++ akka-docs/java/untyped-actors.rst | 42 +++++++++++++++++++ 2 files changed, 71 insertions(+) diff --git a/akka-docs/java/code/docs/actor/UntypedActorDocTestBase.java b/akka-docs/java/code/docs/actor/UntypedActorDocTestBase.java index c82ce30661..c2fb455cfb 100644 --- a/akka-docs/java/code/docs/actor/UntypedActorDocTestBase.java +++ b/akka-docs/java/code/docs/actor/UntypedActorDocTestBase.java @@ -50,6 +50,10 @@ import java.util.concurrent.TimeUnit; import java.util.ArrayList; //#import-askPipe +//#import-stash +import akka.actor.UntypedActorWithStash; +//#import-stash + import akka.actor.Props; import akka.actor.UntypedActor; import akka.actor.UntypedActorFactory; @@ -346,6 +350,31 @@ public class UntypedActorDocTestBase { //#hot-swap-actor + //#stash + public static class ActorWithProtocol extends UntypedActorWithStash { + private Boolean isOpen = false; + public void onReceive(Object msg) { + if (isOpen) { + if (msg.equals("write")) { + // do writing... + } else if (msg.equals("close")) { + unstashAll(); + isOpen = false; + } else { + stash(); + } + } else { + if (msg.equals("open")) { + unstashAll(); + isOpen = true; + } else { + stash(); + } + } + } + } + //#stash + //#watch public static class WatchActor extends UntypedActor { final ActorRef child = this.getContext().actorOf(Props.empty(), "child"); diff --git a/akka-docs/java/untyped-actors.rst b/akka-docs/java/untyped-actors.rst index 31a0df9674..c99b5f2984 100644 --- a/akka-docs/java/untyped-actors.rst +++ b/akka-docs/java/untyped-actors.rst @@ -565,6 +565,48 @@ well. Use the ``getContext().unbecome`` method from within the Actor. if (message.equals("revert")) getContext().unbecome(); } + +Stash +===== + +The ``UntypedActorWithStash`` class enables an actor to temporarily stash away messages +that can not or should not be handled using the actor's current +behavior. Upon changing the actor's message handler, i.e., right +before invoking ``getContext().become()`` or ``getContext().unbecome()``, all +stashed messages can be "unstashed", thereby prepending them to the actor's +mailbox. This way, the stashed messages can be processed in the same +order as they have been received originally. + +.. warning:: + + Please note that the stash can only be used together with actors + that have a deque-based mailbox. For this, configure the + ``mailbox-type`` of the dispatcher to be a deque-based mailbox, such as + ``akka.dispatch.UnboundedDequeBasedMailbox``. + +Here is an example of the ``UntypedActorWithStash`` class in action: + +.. includecode:: code/docs/actor/UntypedActorDocTestBase.java#stash + +Invoking ``stash()`` adds the current message (the message that the +actor received last) to the actor's stash. It is typically invoked +when handling the default case in the actor's message handler to stash +messages that aren't handled by the other cases. It is illegal to +stash the same message twice; to do so results in a +``IllegalStateException`` being thrown. The stash may also be bounded +in which case invoking ``stash()`` may lead to a capacity violation, +which results in a ``StashOverflowException``. The capacity of the +stash can be configured using the ``stash-capacity`` setting (an ``Int``) of the +dispatcher's configuration. 
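+
+As a sketch, such an actor would be created with a dispatcher that is
+configured with the deque-based mailbox required above (the dispatcher
+name ``my-stash-dispatcher`` is made up and assumed to be configured
+elsewhere):
+
+.. code-block:: java
+
+  ActorRef proto = system.actorOf(
+    new Props(ActorWithProtocol.class).withDispatcher("my-stash-dispatcher"),
+    "proto");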
+
+Invoking ``unstashAll()`` enqueues messages from the stash to the
+actor's mailbox until the capacity of the mailbox (if any) has been
+reached (note that messages from the stash are prepended to the
+mailbox). In case a bounded mailbox overflows, a
+``MessageQueueAppendFailedException`` is thrown.
+The stash is guaranteed to be empty after calling ``unstashAll()``.
+
+
 Killing an Actor
 ================

From be74eb835b4cf425d08c36265f14f9ebe6750a05 Mon Sep 17 00:00:00 2001
From: Roland
Date: Mon, 25 Jun 2012 19:30:13 +0200
Subject: [PATCH 05/39] stashin commit so Iulian can play with it

---
 .gitignore                                    |   1 +
 akka-actor/src/main/java/akka/japi/JAPI.java  |  11 +
 .../src/main/scala/akka/japi/JavaAPI.scala    |  47 ++
 .../code/docs/testkit/TestKitDocTest.java     | 131 ++++
 akka-docs/java/testing.rst                    | 701 +++++++++++++++++-
 .../code/docs/testkit/TestkitDocSpec.scala    |   7 +-
 akka-docs/scala/testing.rst                   |  54 +-
 .../scala/akka/testkit/TestActorRef.scala     |   5 +
 .../src/main/scala/akka/testkit/TestKit.scala |  17 +
 project/AkkaBuild.scala                       |   6 +-
 10 files changed, 935 insertions(+), 45 deletions(-)
 create mode 100644 akka-actor/src/main/java/akka/japi/JAPI.java
 create mode 100644 akka-docs/java/code/docs/testkit/TestKitDocTest.java

diff --git a/.gitignore b/.gitignore
index dfeca38166..9699e18be5 100755
--- a/.gitignore
+++ b/.gitignore
@@ -64,3 +64,4 @@ mongoDB/
 redis/
 beanstalk/
 .scalastyle
+bin/

diff --git a/akka-actor/src/main/java/akka/japi/JAPI.java b/akka-actor/src/main/java/akka/japi/JAPI.java
new file mode 100644
index 0000000000..4808b3e725
--- /dev/null
+++ b/akka-actor/src/main/java/akka/japi/JAPI.java
@@ -0,0 +1,11 @@
+package akka.japi;
+
+import scala.collection.Seq;
+
+public class JAPI {
+
+  public static <T> Seq<T> seq(T... ts) {
+    return Util.arrayToSeq(ts);
+  }
+
+}

diff --git a/akka-actor/src/main/scala/akka/japi/JavaAPI.scala b/akka-actor/src/main/scala/akka/japi/JavaAPI.scala
index b0db141aee..bc8c3c6ff9 100644
--- a/akka-actor/src/main/scala/akka/japi/JavaAPI.scala
+++ b/akka-actor/src/main/scala/akka/japi/JavaAPI.scala
@@ -5,6 +5,7 @@ package akka.japi
 
 import scala.Some
+import scala.util.control.NoStackTrace
 
 /**
  * A Function interface. Used to create first-class-functions in Java.
@@ -44,6 +45,50 @@ trait Creator[T] {
   def create(): T
 }
 
+object PurePartialFunction {
+  case object NoMatch extends RuntimeException with NoStackTrace
+}
+
+/**
+ * Helper for implementing a *pure* partial function: it will possibly be
+ * invoked multiple times for a single “application”, because its only abstract
+ * method is used for both isDefinedAt() and apply(); the former is mapped to
+ * `isCheck == true` and the latter to `isCheck == false` for those cases where
+ * this is important to know.
+ *
+ * {{{
+ * new PurePartialFunction<Object, String>() {
+ *   public String apply(Object in, boolean isCheck) {
+ *     if (in instanceof TheThing) {
+ *       if (isCheck) return null; // to spare the expensive or side-effecting code
+ *       return doSomethingWithTheThing((TheThing) in);
+ *     } else {
+ *       throw noMatch();
+ *     }
+ *   }
+ * }
+ * }}}
+ *
+ * The typical use of partial functions from Akka looks like the following:
+ *
+ * {{{
+ * if (pf.isDefinedAt(x)) pf.apply(x)
+ * }}}
+ *
+ * i.e. it will first call `PurePartialFunction.apply(x, true)` and if that
+ * does not throw `noMatch()` it will continue with calling
+ * `PurePartialFunction.apply(x, false)`.
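+ *
+ * A sketch of the consequence for side effects (the handler and the
+ * `log` call are hypothetical, for illustration only): any effect must
+ * be guarded by `isCheck`, because the function runs once for the
+ * definedness check and once for the actual application:
+ *
+ * {{{
+ * new PurePartialFunction<Object, String>() {
+ *   public String apply(Object in, boolean isCheck) {
+ *     if (!(in instanceof String)) throw noMatch();
+ *     if (isCheck) return null; // no side effects during isDefinedAt()
+ *     log((String) in);         // effect happens exactly once
+ *     return (String) in;
+ *   }
+ * }
+ * }}}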
+ */
+abstract class PurePartialFunction[A, B] extends scala.runtime.AbstractFunction1[A, B] with PartialFunction[A, B] {
+  import PurePartialFunction._
+
+  def apply(x: A, isCheck: Boolean): B
+
+  final def isDefinedAt(x: A): Boolean = try { apply(x, true); true } catch { case NoMatch ⇒ false }
+  final def apply(x: A): B = try apply(x, false) catch { case NoMatch ⇒ throw new MatchError(x) }
+  final def noMatch(): RuntimeException = NoMatch
+}
+
 /**
  * This class represents optional values. Instances of Option
  * are either instances of case class Some or it is case
@@ -117,4 +162,6 @@ object Util {
    * Given a Class returns a Scala Manifest of that Class
    */
  def manifest[T](clazz: Class[T]): Manifest[T] = Manifest.classType(clazz)
+
+  def arrayToSeq[T](arr: Array[T]): Seq[T] = arr.toSeq
 }

diff --git a/akka-docs/java/code/docs/testkit/TestKitDocTest.java b/akka-docs/java/code/docs/testkit/TestKitDocTest.java
new file mode 100644
index 0000000000..d71f086104
--- /dev/null
+++ b/akka-docs/java/code/docs/testkit/TestKitDocTest.java
@@ -0,0 +1,131 @@
+/*
+ *
+ */
+package docs.testkit;
+
+import static org.junit.Assert.*;
+
+import org.junit.AfterClass;
+import org.junit.Test;
+
+import akka.actor.ActorSystem;
+import akka.actor.Props;
+import akka.actor.UntypedActor;
+import akka.dispatch.Await;
+import akka.dispatch.Future;
+import akka.japi.JAPI;
+import akka.japi.PurePartialFunction;
+import akka.testkit.TestActorRef;
+import akka.testkit.TestKit;
+import akka.util.Duration;
+
+public class TestKitDocTest {
+
+  //#test-actor-ref
+  static class MyActor extends UntypedActor {
+    public void onReceive(Object o) throws Exception {
+      if (o.equals("say42")) {
+        getSender().tell(42, getSelf());
+      } else if (o instanceof Exception) {
+        throw (Exception) o;
+      }
+    }
+    public boolean testMe() { return true; }
+  }
+  //#test-actor-ref
+
+  private static ActorSystem system;
+
+  public TestKitDocTest() {
+    system = ActorSystem.create();
+  }
+
+  @AfterClass
+  public static void cleanup() {
+    system.shutdown();
+  }
+
+  //#test-actor-ref
+  @Test
+  public void demonstrateTestActorRef() {
+    final Props props = new Props(MyActor.class);
+    final TestActorRef<MyActor> ref = TestActorRef.apply(props, system);
+    final MyActor actor = ref.underlyingActor();
+    assertTrue(actor.testMe());
+  }
+  //#test-actor-ref
+
+  //#test-behavior
+  @Test
+  public void demonstrateAsk() throws Exception {
+    final Props props = new Props(MyActor.class);
+    final TestActorRef<MyActor> ref = TestActorRef.apply(props, system);
+    final Future<Object> future = akka.pattern.Patterns.ask(ref, "say42", 3000);
+    assertTrue(future.isCompleted());
+    assertEquals(42, Await.result(future, Duration.Zero()));
+  }
+  //#test-behavior
+
+  //#test-expecting-exceptions
+  @Test
+  public void demonstrateExceptions() {
+    final Props props = new Props(MyActor.class);
+    final TestActorRef<MyActor> ref = TestActorRef.apply(props, system);
+    try {
+      ref.receive(new Exception("expected"));
+      fail("expected an exception to be thrown");
+    } catch (Exception e) {
+      assertEquals("expected", e.getMessage());
+    }
+  }
+  //#test-expecting-exceptions
+
+  //#test-within
+  @Test
+  public void demonstrateWithin() {
+    new TestKit(system) {{
+      testActor().tell(42);
+      new Within(Duration.parse("1 second")) {
+        // do not put code outside this method, will run afterwards
+        public void run() {
+          assertEquals((Integer) 42, expectMsgClass(Integer.class));
+        }
+      };
+    }};
+  }
+  //#test-within
+
+  @Test
+  public void demonstrateExpectMsgPF() {
+    new TestKit(system) {{
+      testActor().tell(42);
+      //#test-expect-pf
+      final String out = expectMsgPF(Duration.parse("1 second"), "forty-two",
+        new PurePartialFunction<Object, String>() {
+          public String apply(Object in, boolean isCheck) {
+            if (Integer.valueOf(42).equals(in)) {
+              return "match";
+            } else {
+              throw noMatch();
+            }
+          }
+        }
+      );
+      assertEquals("match", out);
+      //#test-expect-pf
+      testActor().tell("world");
+      //#test-expect-anyof
+      final String any = expectMsgAnyOf(remaining(), JAPI.seq("hello", "world"));
+      //#test-expect-anyof
+      assertEquals("world", any);
+      testActor().tell("world");
+      //#test-expect-anyclassof
+      @SuppressWarnings("unchecked")
+      final String anyClass = expectMsgAnyClassOf(remaining(), JAPI.<Class<? extends String>> seq(String.class));
+      //#test-expect-anyclassof
+      assertEquals("world", anyClass);
+    }};
+  }
+
+}

diff --git a/akka-docs/java/testing.rst b/akka-docs/java/testing.rst
index d49ba2512f..6aa31ff633 100644
--- a/akka-docs/java/testing.rst
+++ b/akka-docs/java/testing.rst
@@ -4,11 +4,696 @@
 Testing Actor Systems (Java)
 ##############################
 
-Due to the conciseness of test DSLs available for Scala, it may be a good idea
-to write the test suite in that language even if the main project is written in
-Java. If that is not desirable, you can also use :class:`TestKit` and friends
-from Java, albeit with more verbose syntax Munish Gupta has `published a nice
-post `_
-showing several patterns you may find useful, and for reference documentation
-please refer to :ref:`akka-testkit` until that section has been ported over to
-cover Java in full.
+As with any piece of software, automated tests are a very important part of the
+development cycle. The actor model presents a different view on how units of
+code are delimited and how they interact, which has an influence on how to
+perform tests.
+
+.. note::
+
+  Due to the conciseness of test DSLs available for Scala (`ScalaTest`_,
+  `Specs2`_, `ScalaCheck`_), it may be a good idea to write the test suite in
+  that language even if the main project is written in Java. If that is not
+  desirable, you can also use :class:`TestKit` and friends from Java, albeit
+  with more verbose syntax which is covered below. Munish Gupta has `published
+  a nice post `_ showing
+  several patterns you may find useful.
+
+.. _ScalaTest: http://scalatest.org/
+.. _Specs2: http://specs2.org/
+.. _ScalaCheck: http://code.google.com/p/scalacheck/
+
+Akka comes with a dedicated module :mod:`akka-testkit` for supporting tests at
+different levels, which fall into two clearly distinct categories:
+
+ - Testing isolated pieces of code without involving the actor model, meaning
+   without multiple threads; this implies completely deterministic behavior
+   concerning the ordering of events and no concurrency concerns and will be
+   called **Unit Testing** in the following.
+ - Testing (multiple) encapsulated actors including multi-threaded scheduling;
+   this implies non-deterministic order of events but shielding from
+   concurrency concerns by the actor model and will be called **Integration
+   Testing** in the following.
+
+There are of course variations on the granularity of tests in both categories,
+where unit testing reaches down to white-box tests and integration testing can
+encompass functional tests of complete actor networks. The important
+distinction lies in whether concurrency concerns are part of the test or not.
+The tools offered are described in detail in the following sections.
+
+.. note::
+
+  Be sure to add the module :mod:`akka-testkit` to your dependencies.
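+
+For reference, with sbt this amounts to something like the following
+(the version number is a placeholder for the Akka release you are
+using)::
+
+  libraryDependencies +=
+    "com.typesafe.akka" % "akka-testkit" % "2.0.2" % "test"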
+ +Unit Testing with :class:`TestActorRef` +======================================= + +Testing the business logic inside :class:`Actor` classes can be divided into +two parts: first, each atomic operation must work in isolation, then sequences +of incoming events must be processed correctly, even in the presence of some +possible variability in the ordering of events. The former is the primary use +case for single-threaded unit testing, while the latter can only be verified in +integration tests. + +Normally, the :class:`ActorRef` shields the underlying :class:`Actor` instance +from the outside, the only communications channel is the actor's mailbox. This +restriction is an impediment to unit testing, which led to the inception of the +:class:`TestActorRef`. This special type of reference is designed specifically +for test purposes and allows access to the actor in two ways: either by +obtaining a reference to the underlying actor instance, or by invoking or +querying the actor's behaviour (:meth:`receive`). Each one warrants its own +section below. + +Obtaining a Reference to an :class:`Actor` +------------------------------------------ + +Having access to the actual :class:`Actor` object allows application of all +traditional unit testing techniques on the contained methods. Obtaining a +reference is done like this: + +.. includecode:: code/docs/testkit/TestKitDocTest.java#test-actor-ref + +Since :class:`TestActorRef` is generic in the actor type it returns the +underlying actor with its proper static type. From this point on you may bring +any unit testing tool to bear on your actor as usual. + +Testing the Actor's Behavior +---------------------------- + +When the dispatcher invokes the processing behavior of an actor on a message, +it actually calls :meth:`apply` on the current behavior registered for the +actor. This starts out with the return value of the declared :meth:`receive` +method, but it may also be changed using :meth:`become` and :meth:`unbecome` in +response to external messages. All of this contributes to the overall actor +behavior and it does not lend itself to easy testing on the :class:`Actor` +itself. Therefore the :class:`TestActorRef` offers a different mode of +operation to complement the :class:`Actor` testing: it supports all operations +also valid on normal :class:`ActorRef`. Messages sent to the actor are +processed synchronously on the current thread and answers may be sent back as +usual. This trick is made possible by the :class:`CallingThreadDispatcher` +described below (see `CallingThreadDispatcher`_); this dispatcher is set +implicitly for any actor instantiated into a :class:`TestActorRef`. + +.. includecode:: code/docs/testkit/TestKitDocTest.java#test-behavior + +As the :class:`TestActorRef` is a subclass of :class:`LocalActorRef` with a few +special extras, also aspects like supervision and restarting work properly, but +beware that execution is only strictly synchronous as long as all actors +involved use the :class:`CallingThreadDispatcher`. As soon as you add elements +which include more sophisticated scheduling you leave the realm of unit testing +as you then need to think about asynchronicity again (in most cases the problem +will be to wait until the desired effect had a chance to happen). + +One more special aspect which is overridden for single-threaded tests is the +:meth:`receiveTimeout`, as including that would entail asynchronous queuing of +:obj:`ReceiveTimeout` messages, violating the synchronous contract. + +.. 
warning:: + + To summarize: :class:`TestActorRef` overwrites two fields: it sets the + dispatcher to :obj:`CallingThreadDispatcher.global` and it sets the + :obj:`receiveTimeout` to None. + +The Way In-Between: Expecting Exceptions +---------------------------------------- + +If you want to test the actor behavior, including hotswapping, but without +involving a dispatcher and without having the :class:`TestActorRef` swallow +any thrown exceptions, then there is another mode available for you: just use +the :meth:`receive` method :class:`TestActorRef`, which will be forwarded to the +underlying actor: + +.. includecode:: code/docs/testkit/TestKitDocTest.java#test-expecting-exceptions + +Use Cases +--------- + +You may of course mix and match both modi operandi of :class:`TestActorRef` as +suits your test needs: + + - one common use case is setting up the actor into a specific internal state + before sending the test message + - another is to verify correct internal state transitions after having sent + the test message + +Feel free to experiment with the possibilities, and if you find useful +patterns, don't hesitate to let the Akka forums know about them! Who knows, +common operations might even be worked into nice DSLs. + +Integration Testing with :class:`TestKit` +========================================= + +When you are reasonably sure that your actor's business logic is correct, the +next step is verifying that it works correctly within its intended environment +(if the individual actors are simple enough, possibly because they use the +:mod:`FSM` module, this might also be the first step). The definition of the +environment depends of course very much on the problem at hand and the level at +which you intend to test, ranging for functional/integration tests to full +system tests. The minimal setup consists of the test procedure, which provides +the desired stimuli, the actor under test, and an actor receiving replies. +Bigger systems replace the actor under test with a network of actors, apply +stimuli at varying injection points and arrange results to be sent from +different emission points, but the basic principle stays the same in that a +single procedure drives the test. + +The :class:`TestKit` class contains a collection of tools which makes this +common task easy. + +.. includecode:: code/docs/testkit/PlainWordTest.java#plain-spec + +The :class:`TestKit` contains an actor named :obj:`testActor` which is the +entry point for messages to be examined with the various ``expectMsg...`` +assertions detailed below. When mixing in the trait ``ImplicitSender`` this +test actor is implicitly used as sender reference when dispatching messages +from the test procedure. The :obj:`testActor` may also be passed to +other actors as usual, usually subscribing it as notification listener. There +is a whole set of examination methods, e.g. receiving all consecutive messages +matching certain criteria, receiving a whole sequence of fixed messages or +classes, receiving nothing for some time, etc. + +The ActorSystem passed in to the constructor of TestKit is accessible via the +:meth:`system()` method. Remember to shut down the actor system after the test +is finished (also in case of failure) so that all actors—including the test +actor—are stopped. + +Built-In Assertions +------------------- + +The above mentioned :meth:`expectMsg` is not the only method for formulating +assertions concerning received messages. 
+Here is the full list:
+
+ * :meth:`<T> T expectMsg(Duration d, T msg)`
+
+   The given message object must be received within the specified time; the
+   object will be returned.
+
+ * :meth:`<T> T expectMsgPF(Duration d, PartialFunction<Object, T> pf)`
+
+   Within the given time period, a message must be received and the given
+   partial function must be defined for that message; the result from applying
+   the partial function to the received message is returned.
+
+   .. includecode:: code/docs/testkit/TestKitDocTest.java#test-expect-pf
+
+ * :meth:`<T> T expectMsgClass(Duration d, Class<T> c)`
+
+   An object which is an instance of the given :class:`Class` must be received
+   within the allotted time frame; the object will be returned. Note that this
+   does a conformance check; if you need the class to be equal, have a look at
+   :meth:`expectMsgAllClassOf` with a single given class argument.
+
+ * :meth:`<T> T expectMsgAnyOf(Duration d, Seq<T> obj)`
+
+   An object must be received within the given time, and it must be equal
+   (compared with ``equals()``) to at least one of the passed reference
+   objects; the received object will be returned.
+
+   .. includecode:: code/docs/testkit/TestKitDocTest.java#test-expect-anyof
+
+ * :meth:`<T> T expectMsgAnyClassOf(Duration d, Seq<Class<? extends T>> classes)`
+
+   An object must be received within the given time, and it must be an
+   instance of at least one of the supplied :class:`Class` objects; the
+   received object will be returned.
+
+   .. includecode:: code/docs/testkit/TestKitDocTest.java#test-expect-anyclassof
+
+ * :meth:`expectMsgAllOf[T](d: Duration, obj: T*): Seq[T]`
+
+   A number of objects matching the size of the supplied object array must be
+   received within the given time, and for each of the given objects there
+   must exist at least one among the received ones which equals (compared with
+   ``==``) it. The full sequence of received objects is returned.
+
+ * :meth:`expectMsgAllClassOf[T](d: Duration, c: Class[_ <: T]*): Seq[T]`
+
+   A number of objects matching the size of the supplied :class:`Class` array
+   must be received within the given time, and for each of the given classes
+   there must exist at least one among the received objects whose class equals
+   (compared with ``==``) it (this is *not* a conformance check). The full
+   sequence of received objects is returned.
+
+ * :meth:`expectMsgAllConformingOf[T](d: Duration, c: Class[_ <: T]*): Seq[T]`
+
+   A number of objects matching the size of the supplied :class:`Class` array
+   must be received within the given time, and for each of the given classes
+   there must exist at least one among the received objects which is an
+   instance of this class. The full sequence of received objects is returned.
+
+ * :meth:`expectNoMsg(d: Duration)`
+
+   No message must be received within the given time. This also fails if a
+   message has been received before calling this method which has not been
+   removed from the queue using one of the other methods.
+
+ * :meth:`receiveN(n: Int, d: Duration): Seq[AnyRef]`
+
+   ``n`` messages must be received within the given time; the received
+   messages are returned.
+
+ * :meth:`fishForMessage(max: Duration, hint: String)(pf: PartialFunction[Any, Boolean]): Any`
+
+   Keep receiving messages as long as the time is not used up and the partial
+   function matches and returns ``false``. Returns the message received for
+   which it returned ``true`` or throws an exception, which will include the
+   provided hint for easier debugging.
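+
+As an illustration of driving one of these assertions from Java, here is a
+rough sketch of ``fishForMessage`` using the ``PurePartialFunction`` helper
+introduced in this patch (the message values are made up for the example):
+
+.. code-block:: java
+
+  Object lastMsg = fishForMessage(remaining(), "expected \"done\"",
+    new PurePartialFunction<Object, Boolean>() {
+      public Boolean apply(Object in, boolean isCheck) {
+        if ("done".equals(in)) return true;      // stop fishing, return this message
+        if (in instanceof String) return false;  // swallow and keep fishing
+        throw noMatch();                         // anything else fails the assertion
+      }
+    });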
+
+In addition to message reception assertions there are also methods which help
+with message flows:
+
+ * :meth:`receiveOne(d: Duration): AnyRef`
+
+   Tries to receive one message for at most the given time interval and
+   returns ``null`` in case of failure. If the given Duration is zero, the
+   call is non-blocking (polling mode).
+
+ * :meth:`receiveWhile[T](max: Duration, idle: Duration, messages: Int)(pf: PartialFunction[Any, T]): Seq[T]`
+
+   Collect messages as long as
+
+   * they are matching the given partial function
+   * the given time interval is not used up
+   * the next message is received within the idle timeout
+   * the number of messages has not yet reached the maximum
+
+   All collected messages are returned. The maximum duration defaults to the
+   time remaining in the innermost enclosing :ref:`within ` block
+   and the idle duration defaults to infinity (thereby disabling the
+   idle timeout feature). The number of expected messages defaults to
+   ``Int.MaxValue``, which effectively disables this limit.
+
+ * :meth:`awaitCond(p: => Boolean, max: Duration, interval: Duration)`
+
+   Poll the given condition every :obj:`interval` until it returns ``true`` or
+   the :obj:`max` duration is used up. The interval defaults to 100 ms and the
+   maximum defaults to the time remaining in the innermost enclosing
+   :ref:`within ` block.
+
+ * :meth:`ignoreMsg(pf: PartialFunction[AnyRef, Boolean])`
+
+   :meth:`ignoreNoMsg`
+
+   The internal :obj:`testActor` contains a partial function for ignoring
+   messages: it will only enqueue messages which do not match the function or
+   for which the function returns ``false``. This function can be set and
+   reset using the methods given above; each invocation replaces the previous
+   function, they are not composed.
+
+   This feature is useful e.g. when testing a logging system, where you want
+   to ignore regular messages and are only interested in your specific ones.
+
+Expecting Exceptions
+--------------------
+
+Since an integration test does not allow access to the internal processing of
+the participating actors, verifying expected exceptions cannot be done
+directly. Instead, use the logging system for this purpose: replacing the
+normal event handler with the :class:`TestEventListener` and using an
+:class:`EventFilter` allows assertions on log messages, including those which
+are generated by exceptions:
+
+.. includecode:: code/docs/testkit/TestKitDocTest.java#event-filter
+
+.. _TestKit.within:
+
+Timing Assertions
+-----------------
+
+Another important part of functional testing concerns timing: certain events
+must not happen immediately (like a timer), others need to happen before a
+deadline. Therefore, all examination methods accept an upper time limit within
+which the positive or negative result must be obtained. Lower time limits need
+to be checked external to the examination, which is facilitated by a new
+construct for managing time constraints:
+
+.. code-block:: scala
+
+  within([min, ]max) {
+    ...
+  }
+
+The block given to :meth:`within` must complete after a :ref:`Duration` which
+is between :obj:`min` and :obj:`max`, where the former defaults to zero. The
+deadline calculated by adding the :obj:`max` parameter to the block's start
+time is implicitly available within the block to all examination methods; if
+you do not specify it, it is inherited from the innermost enclosing
+:meth:`within` block.
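+
+As a sketch (using the Java ``Within`` helper added elsewhere in this patch),
+an inner assertion without an explicit timeout inherits the enclosing block's
+deadline:
+
+.. code-block:: java
+
+  new Within(Duration.parse("2 seconds")) {
+    public void run() {
+      // inherits the remaining time of the enclosing block
+      expectMsgClass(Integer.class);
+    }
+  };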
+It should be noted that if the last message-receiving assertion of the block
+is :meth:`expectNoMsg` or :meth:`receiveWhile`, the final check of the
+:meth:`within` is skipped in order to avoid false positives due to wake-up
+latencies. This means that while individual contained assertions still use the
+maximum time bound, the overall block may take arbitrarily longer in this case.
+
+.. includecode:: code/docs/testkit/TestKitDocTest.java#test-within
+
+.. note::
+
+  All times are measured using ``System.nanoTime``, meaning that they describe
+  wall time, not CPU time.
+
+Ray Roestenburg has written a great article on using the TestKit:
+``_.
+His full example is also available :ref:`here `.
+
+Accounting for Slow Test Systems
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The tight timeouts you use during testing on your lightning-fast notebook will
+invariably lead to spurious test failures on the heavily loaded Jenkins server
+(or similar). To account for this situation, all maximum durations are
+internally scaled by a factor taken from the :ref:`configuration`,
+``akka.test.timefactor``, which defaults to 1.
+
+You can scale other durations with the same factor by using the implicit
+conversion in the ``akka.testkit`` package object to add a ``dilated`` function
+to :class:`Duration`.
+
+.. includecode:: code/docs/testkit/TestKitDocTest.java#duration-dilation
+
+Resolving Conflicts with Implicit ActorRef
+------------------------------------------
+
+If you want the sender of messages inside your TestKit-based tests to be the
+``testActor``, simply mix ``ImplicitSender`` into your test.
+
+.. includecode:: code/docs/testkit/PlainWordSpec.scala#implicit-sender
+
+Using Multiple Probe Actors
+---------------------------
+
+When the actors under test are supposed to send various messages to different
+destinations, it may be difficult distinguishing the message streams arriving
+at the :obj:`testActor` when using the :class:`TestKit` as a mixin. Another
+approach is to use it for creation of simple probe actors to be inserted in the
+message flows. To make this more powerful and convenient, there is a concrete
+implementation called :class:`TestProbe`. The functionality is best explained
+using a small example:
+
+.. includecode:: code/docs/testkit/TestKitDocTest.java
+   :include: imports-test-probe,my-double-echo,test-probe
+
+Here the system under test is simulated by :class:`MyDoubleEcho`, which is
+supposed to mirror its input to two outputs. Attaching two test probes enables
+verification of the (simplistic) behavior. Another example would be two actors
+A and B which collaborate by A sending messages to B. In order to verify this
+message flow, a :class:`TestProbe` could be inserted as target of A, using the
+forwarding capabilities or auto-pilot described below to include a real B in
+the test setup.
+
+Probes may also be equipped with custom assertions to make your test code even
+more concise and clear:
+
+.. includecode:: code/docs/testkit/TestKitDocTest.java
+   :include: test-special-probe
+
+You have complete flexibility here in mixing and matching the :class:`TestKit`
+facilities with your own checks and choosing an intuitive name for it. In real
+life your code will probably be a bit more complicated than the example given
+above; just use the power!
+
+Replying to Messages Received by Probes
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The probes keep track of the communications channel for replies, if possible,
+so they can also reply:
+
+..
includecode:: code/docs/testkit/TestKitDocTest.java#test-probe-reply + +Forwarding Messages Received by Probes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Given a destination actor ``dest`` which in the nominal actor network would +receive a message from actor ``source``. If you arrange for the message to be +sent to a :class:`TestProbe` ``probe`` instead, you can make assertions +concerning volume and timing of the message flow while still keeping the +network functioning: + +.. includecode:: code/docs/testkit/TestKitDocTest.java + :include: test-probe-forward-actors,test-probe-forward + +The ``dest`` actor will receive the same message invocation as if no test probe +had intervened. + +Auto-Pilot +^^^^^^^^^^ + +Receiving messages in a queue for later inspection is nice, but in order to +keep a test running and verify traces later you can also install an +:class:`AutoPilot` in the participating test probes (actually in any +:class:`TestKit`) which is invoked before enqueueing to the inspection queue. +This code can be used to forward messages, e.g. in a chain ``A --> Probe --> +B``, as long as a certain protocol is obeyed. + +.. includecode:: ../../akka-testkit/src/test/scala/akka/testkit/TestProbeSpec.scala#autopilot + +The :meth:`run` method must return the auto-pilot for the next message, wrapped +in an :class:`Option`; setting it to :obj:`None` terminates the auto-pilot. + +Caution about Timing Assertions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The behavior of :meth:`within` blocks when using test probes might be perceived +as counter-intuitive: you need to remember that the nicely scoped deadline as +described :ref:`above ` is local to each probe. Hence, probes +do not react to each other's deadlines or to the deadline set in an enclosing +:class:`TestKit` instance:: + + class SomeTest extends TestKit(_system: ActorSystem) with ImplicitSender { + + val probe = TestProbe() + + within(100 millis) { + probe.expectMsg("hallo") // Will hang forever! + } + } + +This test will hang indefinitely, because the :meth:`expectMsg` call does not +see any deadline. Currently, the only option is to use ``probe.within`` in the +above code to make it work; later versions may include lexically scoped +deadlines using implicit arguments. + +.. _TestCallingThreadDispatcherRef: + +CallingThreadDispatcher +======================= + +The :class:`CallingThreadDispatcher` serves good purposes in unit testing, as +described above, but originally it was conceived in order to allow contiguous +stack traces to be generated in case of an error. As this special dispatcher +runs everything which would normally be queued directly on the current thread, +the full history of a message's processing chain is recorded on the call stack, +so long as all intervening actors run on this dispatcher. + +How to use it +------------- + +Just set the dispatcher as you normally would: + +.. includecode:: code/docs/testkit/TestKitDocTest.java#calling-thread-dispatcher + +How it works +------------ + +When receiving an invocation, the :class:`CallingThreadDispatcher` checks +whether the receiving actor is already active on the current thread. The +simplest example for this situation is an actor which sends a message to +itself. In this case, processing cannot continue immediately as that would +violate the actor model, so the invocation is queued and will be processed when +the active invocation on that actor finishes its processing; thus, it will be +processed on the calling thread, but simply after the actor finishes its +previous work. 
+In the other case, the invocation is simply processed immediately on the
+current thread. Futures scheduled via this dispatcher are also executed
+immediately.
+
+This scheme makes the :class:`CallingThreadDispatcher` work like a general
+purpose dispatcher for any actors which never block on external events.
+
+In the presence of multiple threads it may happen that two invocations of an
+actor running on this dispatcher happen on two different threads at the same
+time. In this case, both will be processed directly on their respective
+threads, where both compete for the actor's lock and the loser has to wait.
+Thus, the actor model is left intact, but the price is loss of concurrency due
+to limited scheduling. In a sense this is equivalent to traditional mutex style
+concurrency.
+
+The other remaining difficulty is correct handling of suspend and resume: when
+an actor is suspended, subsequent invocations will be queued in thread-local
+queues (the same ones used for queuing in the normal case). The call to
+:meth:`resume`, however, is done by one specific thread, and all other threads
+in the system will probably not be executing this specific actor, which leads
+to the problem that the thread-local queues cannot be emptied by their native
+threads. Hence, the thread calling :meth:`resume` will collect all currently
+queued invocations from all threads into its own queue and process them.
+
+Limitations
+-----------
+
+If an actor's behavior blocks on something which would normally be affected
+by the calling actor after having sent the message, this will obviously
+dead-lock when using this dispatcher. This is a common scenario in actor tests
+based on :class:`CountDownLatch` for synchronization:
+
+.. code-block:: scala
+
+  val latch = new CountDownLatch(1)
+  actor ! startWorkAfter(latch) // actor will call latch.await() before proceeding
+  doSomeSetupStuff()
+  latch.countDown()
+
+The example would hang indefinitely within the message processing initiated on
+the second line and never reach the fourth line, which would unblock it on a
+normal dispatcher.
+
+Thus, keep in mind that the :class:`CallingThreadDispatcher` is not a
+general-purpose replacement for the normal dispatchers. On the other hand it
+may be quite useful to run your actor network on it for testing, because if it
+runs without dead-locking chances are very high that it will not dead-lock in
+production.
+
+.. warning::
+
+  The above sentence is unfortunately not a strong guarantee, because your
+  code might directly or indirectly change its behavior when running on a
+  different dispatcher. If you are looking for a tool to help you debug
+  dead-locks, the :class:`CallingThreadDispatcher` may help with certain error
+  scenarios, but keep in mind that it may give false negatives as well as
+  false positives.
+
+Benefits
+--------
+
+To summarize, these are the features which the :class:`CallingThreadDispatcher`
+has to offer:
+
+ - Deterministic execution of single-threaded tests while retaining nearly full
+   actor semantics
+ - Full message processing history leading up to the point of failure in
+   exception stack traces
+ - Exclusion of certain classes of dead-lock scenarios
+
+.. _actor.logging:
+
+Tracing Actor Invocations
+=========================
+
+The testing facilities described up to this point were aiming at formulating
+assertions about a system's behavior. If a test fails, it is usually your job
+to find the cause, fix it and verify the test again.
This process is supported +by debuggers as well as logging, where the Akka toolkit offers the following +options: + +* *Logging of exceptions thrown within Actor instances* + + This is always on; in contrast to the other logging mechanisms, this logs at + ``ERROR`` level. + +* *Logging of message invocations on certain actors* + + This is enabled by a setting in the :ref:`configuration` — namely + ``akka.actor.debug.receive`` — which enables the :meth:`loggable` + statement to be applied to an actor’s :meth:`receive` function: + +.. includecode:: code/docs/testkit/TestKitDocTest.java#logging-receive + +. + If the abovementioned setting is not given in the :ref:`configuration`, this method will + pass through the given :class:`Receive` function unmodified, meaning that + there is no runtime cost unless actually enabled. + + The logging feature is coupled to this specific local mark-up because + enabling it uniformly on all actors is not usually what you need, and it + would lead to endless loops if it were applied to :class:`EventHandler` + listeners. + +* *Logging of special messages* + + Actors handle certain special messages automatically, e.g. :obj:`Kill`, + :obj:`PoisonPill`, etc. Tracing of these message invocations is enabled by + the setting ``akka.actor.debug.autoreceive``, which enables this on all + actors. + +* *Logging of the actor lifecycle* + + Actor creation, start, restart, monitor start, monitor stop and stop may be traced by + enabling the setting ``akka.actor.debug.lifecycle``; this, too, is enabled + uniformly on all actors. + +All these messages are logged at ``DEBUG`` level. To summarize, you can enable +full logging of actor activities using this configuration fragment:: + + akka { + loglevel = DEBUG + actor { + debug { + receive = on + autoreceive = on + lifecycle = on + } + } + } + +Different Testing Frameworks +============================ + +Akka’s own test suite is written using `ScalaTest`_, +which also shines through in documentation examples. However, the TestKit and +its facilities do not depend on that framework, you can essentially use +whichever suits your development style best. + +This section contains a collection of known gotchas with some other frameworks, +which is by no means exhaustive and does not imply endorsement or special +support. + +When you need it to be a trait +------------------------------ + +If for some reason it is a problem to inherit from :class:`TestKit` due to it +being a concrete class instead of a trait, there’s :class:`TestKitBase`: + +.. includecode:: code/docs/testkit/TestKitDocTest.java + :include: test-kit-base + :exclude: put-your-test-code-here + +The ``implicit lazy val system`` must be declared exactly like that (you can of +course pass arguments to the actor system factory as needed) because trait +:class:`TestKitBase` needs the system during its construction. + +.. warning:: + + Use of the trait is discouraged because of potential issues with binary + backwards compatibility in the future, use at own risk. + +Specs2 +------ + +Some `Specs2`_ users have contributed examples of how to work around some clashes which may arise: + +* Mixing TestKit into :class:`org.specs2.mutable.Specification` results in a + name clash involving the ``end`` method (which is a private variable in + TestKit and an abstract method in Specification); if mixing in TestKit first, + the code may compile but might then fail at runtime. 
The work-around—which is + actually beneficial also for the third point—is to apply the TestKit together + with :class:`org.specs2.specification.Scope`. +* The Specification traits provide a :class:`Duration` DSL which uses partly + the same method names as :class:`akka.util.Duration`, resulting in ambiguous + implicits if ``akka.util.duration._`` is imported. There are two work-arounds: + + * either use the Specification variant of Duration and supply an implicit + conversion to the Akka Duration. This conversion is not supplied with the + Akka distribution because that would mean that our JAR files would dependon + Specs2, which is not justified by this little feature. + + * or mix :class:`org.specs2.time.NoTimeConversions` into the Specification. + +* Specifications are by default executed concurrently, which requires some care + when writing the tests or alternatively the ``sequential`` keyword. + +You can use the following two examples as guidelines: + +.. includecode:: code/docs/testkit/Specs2DemoSpec.scala + +.. includecode:: code/docs/testkit/Specs2DemoAcceptance.scala + + + diff --git a/akka-docs/scala/code/docs/testkit/TestkitDocSpec.scala b/akka-docs/scala/code/docs/testkit/TestkitDocSpec.scala index 564b7929ce..fca85cf85f 100644 --- a/akka-docs/scala/code/docs/testkit/TestkitDocSpec.scala +++ b/akka-docs/scala/code/docs/testkit/TestkitDocSpec.scala @@ -125,7 +125,10 @@ class TestkitDocSpec extends AkkaSpec with DefaultTimeout with ImplicitSender { val actorRef = TestActorRef(new MyActor) // hypothetical message stimulating a '42' answer - val result = Await.result((actorRef ? Say42), 5 seconds).asInstanceOf[Int] + val future = actorRef ? Say42 + val result = future.value.get match { + case Right(x: Int) ⇒ x + } result must be(42) //#test-behavior } @@ -146,7 +149,7 @@ class TestkitDocSpec extends AkkaSpec with DefaultTimeout with ImplicitSender { val actorRef = TestActorRef(new Actor { def receive = { - case boom ⇒ throw new IllegalArgumentException("boom") + case "hello" ⇒ throw new IllegalArgumentException("boom") } }) intercept[IllegalArgumentException] { actorRef.receive("hello") } diff --git a/akka-docs/scala/testing.rst b/akka-docs/scala/testing.rst index 0835db18e7..7a6415492d 100644 --- a/akka-docs/scala/testing.rst +++ b/akka-docs/scala/testing.rst @@ -67,15 +67,6 @@ Since :class:`TestActorRef` is generic in the actor type it returns the underlying actor with its proper static type. From this point on you may bring any unit testing tool to bear on your actor as usual. -Expecting Exceptions --------------------- - -Testing that an expected exception is thrown while processing a message sent to -the actor under test can be done by using a :class:`TestActorRef` :meth:`receive` based -invocation: - -.. includecode:: code/docs/testkit/TestkitDocSpec.scala#test-expecting-exceptions - .. _TestFSMRef: Testing Finite State Machines @@ -111,8 +102,8 @@ operation to complement the :class:`Actor` testing: it supports all operations also valid on normal :class:`ActorRef`. Messages sent to the actor are processed synchronously on the current thread and answers may be sent back as usual. This trick is made possible by the :class:`CallingThreadDispatcher` -described below; this dispatcher is set implicitly for any actor instantiated -into a :class:`TestActorRef`. +described below (see `CallingThreadDispatcher`_); this dispatcher is set +implicitly for any actor instantiated into a :class:`TestActorRef`. .. 
includecode:: code/docs/testkit/TestkitDocSpec.scala#test-behavior @@ -134,8 +125,8 @@ One more special aspect which is overridden for single-threaded tests is the dispatcher to :obj:`CallingThreadDispatcher.global` and it sets the :obj:`receiveTimeout` to None. -The Way In-Between ------------------- +The Way In-Between: Expecting Exceptions +---------------------------------------- If you want to test the actor behavior, including hotswapping, but without involving a dispatcher and without having the :class:`TestActorRef` swallow @@ -143,10 +134,7 @@ any thrown exceptions, then there is another mode available for you: just use the :meth:`receive` method :class:`TestActorRef`, which will be forwarded to the underlying actor: -.. includecode:: code/docs/testkit/TestkitDocSpec.scala#test-unhandled - -The above sample assumes the default behavior for unhandled messages, i.e. -that the actor doesn't swallow all messages and doesn't override :meth:`unhandled`. +.. includecode:: code/docs/testkit/TestkitDocSpec.scala#test-expecting-exceptions Use Cases --------- @@ -205,12 +193,12 @@ Built-In Assertions The above mentioned :meth:`expectMsg` is not the only method for formulating assertions concerning received messages. Here is the full list: - * :meth:`expectMsg[T](d: Duration, msg: T): T` + * :meth:`expectMsg[T](d: Duration, msg: T): T` The given message object must be received within the specified time; the object will be returned. - * :meth:`expectMsgPF[T](d: Duration)(pf: PartialFunction[Any, T]): T` + * :meth:`expectMsgPF[T](d: Duration)(pf: PartialFunction[Any, T]): T` Within the given time period, a message must be received and the given partial function must be defined for that message; the result from applying @@ -219,40 +207,40 @@ assertions concerning received messages. Here is the full list: the deadline from the innermost enclosing :ref:`within ` block instead. - * :meth:`expectMsgClass[T](d: Duration, c: Class[T]): T` + * :meth:`expectMsgClass[T](d: Duration, c: Class[T]): T` An object which is an instance of the given :class:`Class` must be received within the allotted time frame; the object will be returned. Note that this does a conformance check; if you need the class to be equal, have a look at :meth:`expectMsgAllClassOf` with a single given class argument. - * :meth:`expectMsgType[T: Manifest](d: Duration)` + * :meth:`expectMsgType[T: Manifest](d: Duration)` An object which is an instance of the given type (after erasure) must be received within the allotted time frame; the object will be returned. This method is approximately equivalent to ``expectMsgClass(manifest[T].erasure)``. - * :meth:`expectMsgAnyOf[T](d: Duration, obj: T*): T` + * :meth:`expectMsgAnyOf[T](d: Duration, obj: T*): T` An object must be received within the given time, and it must be equal ( compared with ``==``) to at least one of the passed reference objects; the received object will be returned. - * :meth:`expectMsgAnyClassOf[T](d: Duration, obj: Class[_ <: T]*): T` + * :meth:`expectMsgAnyClassOf[T](d: Duration, obj: Class[_ <: T]*): T` An object must be received within the given time, and it must be an instance of at least one of the supplied :class:`Class` objects; the received object will be returned. 
- * :meth:`expectMsgAllOf[T](d: Duration, obj: T*): Seq[T]` + * :meth:`expectMsgAllOf[T](d: Duration, obj: T*): Seq[T]` A number of objects matching the size of the supplied object array must be received within the given time, and for each of the given objects there must exist at least one among the received ones which equals (compared with ``==``) it. The full sequence of received objects is returned. - * :meth:`expectMsgAllClassOf[T](d: Duration, c: Class[_ <: T]*): Seq[T]` + * :meth:`expectMsgAllClassOf[T](d: Duration, c: Class[_ <: T]*): Seq[T]` A number of objects matching the size of the supplied :class:`Class` array must be received within the given time, and for each of the given classes @@ -260,25 +248,25 @@ assertions concerning received messages. Here is the full list: (compared with ``==``) it (this is *not* a conformance check). The full sequence of received objects is returned. - * :meth:`expectMsgAllConformingOf[T](d: Duration, c: Class[_ <: T]*): Seq[T]` + * :meth:`expectMsgAllConformingOf[T](d: Duration, c: Class[_ <: T]*): Seq[T]` A number of objects matching the size of the supplied :class:`Class` array must be received within the given time, and for each of the given classes there must exist at least one among the received objects which is an instance of this class. The full sequence of received objects is returned. - * :meth:`expectNoMsg(d: Duration)` + * :meth:`expectNoMsg(d: Duration)` No message must be received within the given time. This also fails if a message has been received before calling this method which has not been removed from the queue using one of the other methods. - * :meth:`receiveN(n: Int, d: Duration): Seq[AnyRef]` + * :meth:`receiveN(n: Int, d: Duration): Seq[AnyRef]` ``n`` messages must be received within the given time; the received messages are returned. - * :meth:`fishForMessage(max: Duration, hint: String)(pf: PartialFunction[Any, Boolean]): Any` + * :meth:`fishForMessage(max: Duration, hint: String)(pf: PartialFunction[Any, Boolean]): Any` Keep receiving messages as long as the time is not used up and the partial function matches and returns ``false``. Returns the message received for @@ -288,13 +276,13 @@ assertions concerning received messages. Here is the full list: In addition to message reception assertions there are also methods which help with message flows: - * :meth:`receiveOne(d: Duration): AnyRef` + * :meth:`receiveOne(d: Duration): AnyRef` Tries to receive one message for at most the given time interval and returns ``null`` in case of failure. If the given Duration is zero, the call is non-blocking (polling mode). - * :meth:`receiveWhile[T](max: Duration, idle: Duration, messages: Int)(pf: PartialFunction[Any, T]): Seq[T]` + * :meth:`receiveWhile[T](max: Duration, idle: Duration, messages: Int)(pf: PartialFunction[Any, T]): Seq[T]` Collect messages as long as @@ -309,14 +297,14 @@ with message flows: idle timeout feature). The number of expected messages defaults to ``Int.MaxValue``, which effectively disables this limit. - * :meth:`awaitCond(p: => Boolean, max: Duration, interval: Duration)` + * :meth:`awaitCond(p: => Boolean, max: Duration, interval: Duration)` Poll the given condition every :obj:`interval` until it returns ``true`` or the :obj:`max` duration is used up. The interval defaults to 100 ms and the maximum defaults to the time remaining in the innermost enclosing :ref:`within ` block. 
- * :meth:`ignoreMsg(pf: PartialFunction[AnyRef, Boolean])` + * :meth:`ignoreMsg(pf: PartialFunction[AnyRef, Boolean])` :meth:`ignoreNoMsg` diff --git a/akka-testkit/src/main/scala/akka/testkit/TestActorRef.scala b/akka-testkit/src/main/scala/akka/testkit/TestActorRef.scala index f8efe4e2e5..b65d836f22 100644 --- a/akka-testkit/src/main/scala/akka/testkit/TestActorRef.scala +++ b/akka-testkit/src/main/scala/akka/testkit/TestActorRef.scala @@ -132,4 +132,9 @@ object TestActorRef { "\nOR try to change: 'actorOf(Props[MyActor]' to 'actorOf(Props(new MyActor)'.", exception) } }), name) + + /** + * Java API + */ + def create(props: Props, name: String, system: ActorSystem) = apply(props, name)(system) } diff --git a/akka-testkit/src/main/scala/akka/testkit/TestKit.scala b/akka-testkit/src/main/scala/akka/testkit/TestKit.scala index b3c1626536..c292cc238a 100644 --- a/akka-testkit/src/main/scala/akka/testkit/TestKit.scala +++ b/akka-testkit/src/main/scala/akka/testkit/TestKit.scala @@ -13,6 +13,7 @@ import scala.annotation.tailrec import akka.actor.ActorSystem import akka.util.Timeout import akka.util.BoxedType +import scala.annotation.varargs object TestActor { type Ignore = Option[PartialFunction[AnyRef, Boolean]] @@ -241,6 +242,22 @@ trait TestKitBase { */ def within[T](max: Duration)(f: ⇒ T): T = within(0 seconds, max)(f) + /** + * Java API for within(): + * + * {{{ + * new Within(Duration.parse("3 seconds")) { + * public void run() { + * // your test code here + * } + * } + * }}} + */ + abstract class Within(max: Duration) { + def run(): Unit + within(max)(run()) + } + /** * Same as `expectMsg(remaining, obj)`, but correctly treating the timeFactor. */ diff --git a/project/AkkaBuild.scala b/project/AkkaBuild.scala index 4213f65611..87725d1d5b 100644 --- a/project/AkkaBuild.scala +++ b/project/AkkaBuild.scala @@ -285,7 +285,8 @@ object AkkaBuild extends Build { settings = defaultSettings ++ Sphinx.settings ++ Seq( unmanagedSourceDirectories in Test <<= baseDirectory { _ ** "code" get }, libraryDependencies ++= Dependencies.docs, - unmanagedSourceDirectories in ScalariformKeys.format in Test <<= unmanagedSourceDirectories in Test + unmanagedSourceDirectories in ScalariformKeys.format in Test <<= unmanagedSourceDirectories in Test, + testOptions += Tests.Argument(TestFrameworks.JUnit, "-v") ) ) @@ -473,7 +474,7 @@ object Dependencies { val tutorials = Seq(Test.scalatest, Test.junit) - val docs = Seq(Test.scalatest, Test.junit, Test.specs2) + val docs = Seq(Test.scalatest, Test.junit, Test.specs2, Test.junitIntf) val zeroMQ = Seq(protobuf, Dependency.zeroMQ, Test.scalatest, Test.junit) } @@ -514,6 +515,7 @@ object Dependency { val scalatest = "org.scalatest" % "scalatest_2.9.1" % V.Scalatest % "test" // ApacheV2 val scalacheck = "org.scala-tools.testing" % "scalacheck_2.9.1" % "1.9" % "test" // New BSD val specs2 = "org.specs2" % "specs2_2.9.1" % "1.9" % "test" // Modified BSD / ApacheV2 + val junitIntf = "com.novocode" % "junit-interface" % "0.8" % "test" } } From 370c07b438a4e7193e759db56b12c13915d15ceb Mon Sep 17 00:00:00 2001 From: phaller Date: Thu, 31 May 2012 13:53:03 +0200 Subject: [PATCH 06/39] Correcting example in ScalaDoc for Stash --- akka-actor/src/main/scala/akka/actor/Stash.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/akka-actor/src/main/scala/akka/actor/Stash.scala b/akka-actor/src/main/scala/akka/actor/Stash.scala index 386bc0f070..2415b38618 100644 --- a/akka-actor/src/main/scala/akka/actor/Stash.scala +++ 
b/akka-actor/src/main/scala/akka/actor/Stash.scala @@ -15,7 +15,8 @@ import akka.AkkaException * class ActorWithProtocol extends Actor with Stash { * def receive = { * case "open" ⇒ - * unstashAll { + * unstashAll() + * context.become { * case "write" ⇒ // do writing... * case "close" ⇒ * unstashAll() From 8b17099f5000d2439706ffaa778b68f3284a5982 Mon Sep 17 00:00:00 2001 From: phaller Date: Tue, 12 Jun 2012 15:51:54 +0200 Subject: [PATCH 07/39] Adding Stash section in Actors docs (including docs for the Java API). Example code added to ActorDocSpec/UntypedActorDocTestBase. --- .../docs/actor/UntypedActorDocTestBase.java | 29 +++++++++++ akka-docs/java/untyped-actors.rst | 42 ++++++++++++++++ akka-docs/scala/actors.rst | 48 +++++++++++++++++++ .../scala/code/docs/actor/ActorDocSpec.scala | 20 ++++++++ 4 files changed, 139 insertions(+) diff --git a/akka-docs/java/code/docs/actor/UntypedActorDocTestBase.java b/akka-docs/java/code/docs/actor/UntypedActorDocTestBase.java index c82ce30661..c2fb455cfb 100644 --- a/akka-docs/java/code/docs/actor/UntypedActorDocTestBase.java +++ b/akka-docs/java/code/docs/actor/UntypedActorDocTestBase.java @@ -50,6 +50,10 @@ import java.util.concurrent.TimeUnit; import java.util.ArrayList; //#import-askPipe +//#import-stash +import akka.actor.UntypedActorWithStash; +//#import-stash + import akka.actor.Props; import akka.actor.UntypedActor; import akka.actor.UntypedActorFactory; @@ -346,6 +350,31 @@ public class UntypedActorDocTestBase { //#hot-swap-actor + //#stash + public static class ActorWithProtocol extends UntypedActorWithStash { + private Boolean isOpen = false; + public void onReceive(Object msg) { + if (isOpen) { + if (msg.equals("write")) { + // do writing... + } else if (msg.equals("close")) { + unstashAll(); + isOpen = false; + } else { + stash(); + } + } else { + if (msg.equals("open")) { + unstashAll(); + isOpen = true; + } else { + stash(); + } + } + } + } + //#stash + //#watch public static class WatchActor extends UntypedActor { final ActorRef child = this.getContext().actorOf(Props.empty(), "child"); diff --git a/akka-docs/java/untyped-actors.rst b/akka-docs/java/untyped-actors.rst index 57dbaa5604..a699cb7145 100644 --- a/akka-docs/java/untyped-actors.rst +++ b/akka-docs/java/untyped-actors.rst @@ -558,6 +558,48 @@ well. Use the ``getContext().unbecome`` method from within the Actor. if (message.equals("revert")) getContext().unbecome(); } + +Stash +===== + +The ``UntypedActorWithStash`` class enables an actor to temporarily stash away messages +that can not or should not be handled using the actor's current +behavior. Upon changing the actor's message handler, i.e., right +before invoking ``getContext().become()`` or ``getContext().unbecome()``, all +stashed messages can be "unstashed", thereby prepending them to the actor's +mailbox. This way, the stashed messages can be processed in the same +order as they have been received originally. + +.. warning:: + + Please note that the stash can only be used together with actors + that have a deque-based mailbox. For this, configure the + ``mailbox-type`` of the dispatcher to be a deque-based mailbox, such as + ``akka.dispatch.UnboundedDequeBasedMailbox`` (see :ref:`dispatchers-java`). + +Here is an example of the ``UntypedActorWithStash`` class in action: + +.. includecode:: code/docs/actor/UntypedActorDocTestBase.java#stash + +Invoking ``stash()`` adds the current message (the message that the +actor received last) to the actor's stash. 
It is typically invoked +when handling the default case in the actor's message handler to stash +messages that aren't handled by the other cases. It is illegal to +stash the same message twice; to do so results in an +``IllegalStateException`` being thrown. The stash may also be bounded +in which case invoking ``stash()`` may lead to a capacity violation, +which results in a ``StashOverflowException``. The capacity of the +stash can be configured using the ``stash-capacity`` setting (an ``Int``) of the +dispatcher's configuration. + +Invoking ``unstashAll()`` enqueues messages from the stash to the +actor's mailbox until the capacity of the mailbox (if any) has been +reached (note that messages from the stash are prepended to the +mailbox). In case a bounded mailbox overflows, a +``MessageQueueAppendFailedException`` is thrown. +The stash is guaranteed to be empty after calling ``unstashAll()``. + + Killing an Actor ================ diff --git a/akka-docs/scala/actors.rst b/akka-docs/scala/actors.rst index 47a2318e53..d3a53408e2 100644 --- a/akka-docs/scala/actors.rst +++ b/akka-docs/scala/actors.rst @@ -620,6 +620,54 @@ Here's how you use the ``unbecome`` method: } +Stash +===== + +The `Stash` trait enables an actor to temporarily stash away messages +that can not or should not be handled using the actor's current +behavior. Upon changing the actor's message handler, i.e., right +before invoking ``context.become`` or ``context.unbecome``, all +stashed messages can be "unstashed", thereby prepending them to the actor's +mailbox. This way, the stashed messages can be processed in the same +order as they have been received originally. + +.. warning:: + + Please note that the ``Stash`` can only be used together with actors + that have a deque-based mailbox. For this, configure the + ``mailbox-type`` of the dispatcher to be a deque-based mailbox, such as + ``akka.dispatch.UnboundedDequeBasedMailbox`` (see :ref:`dispatchers-scala`). + +Here is an example of the ``Stash`` in action: + +.. includecode:: code/docs/actor/ActorDocSpec.scala#stash + +Invoking ``stash()`` adds the current message (the message that the +actor received last) to the actor's stash. It is typically invoked +when handling the default case in the actor's message handler to stash +messages that aren't handled by the other cases. It is illegal to +stash the same message twice; to do so results in an +``IllegalStateException`` being thrown. The stash may also be bounded +in which case invoking ``stash()`` may lead to a capacity violation, +which results in a ``StashOverflowException``. The capacity of the +stash can be configured using the ``stash-capacity`` setting (an ``Int``) of the +dispatcher's configuration. + +Invoking ``unstashAll()`` enqueues messages from the stash to the +actor's mailbox until the capacity of the mailbox (if any) has been +reached (note that messages from the stash are prepended to the +mailbox). In case a bounded mailbox overflows, a +``MessageQueueAppendFailedException`` is thrown. +The stash is guaranteed to be empty after calling ``unstashAll()``. + +.. warning:: + + Note that the ``Stash`` trait must be mixed into (a subclass of) the + ``Actor`` trait before any trait/class that overrides the ``preRestart`` + callback. This means it's not possible to write + ``Actor with MyActor with Stash`` if ``MyActor`` overrides ``preRestart``. 
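+
+For reference, the deque-based mailbox and the stash capacity mentioned above
+are both configured on the dispatcher. The fragment below is only a sketch of
+what such a configuration might look like; the dispatcher name
+``my-dispatcher`` and the capacity value are illustrative and not part of
+this change::
+
+  my-dispatcher {
+    mailbox-type = "akka.dispatch.UnboundedDequeBasedMailbox"
+    # consulted only when the stash is bounded; example value
+    stash-capacity = 100
+  }
+
+An actor using it would then be created via
+``Props(new ActorWithProtocol).withDispatcher("my-dispatcher")``.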
+ + Killing an Actor ================ diff --git a/akka-docs/scala/code/docs/actor/ActorDocSpec.scala b/akka-docs/scala/code/docs/actor/ActorDocSpec.scala index ee05e95d42..108aba33b2 100644 --- a/akka-docs/scala/code/docs/actor/ActorDocSpec.scala +++ b/akka-docs/scala/code/docs/actor/ActorDocSpec.scala @@ -300,6 +300,26 @@ class ActorDocSpec extends AkkaSpec(Map("akka.loglevel" -> "INFO")) { val actor = system.actorOf(Props(new HotSwapActor), name = "hot") } + "using Stash" in { + //#stash + import akka.actor.Stash + class ActorWithProtocol extends Actor with Stash { + def receive = { + case "open" ⇒ + unstashAll() + context.become { + case "write" ⇒ // do writing... + case "close" ⇒ + unstashAll() + context.unbecome() + case msg ⇒ stash() + } + case msg ⇒ stash() + } + } + //#stash + } + "using watch" in { //#watch import akka.actor.{ Actor, Props, Terminated } From 8a88edc30269a16b94e29f355bce751e8bb105cc Mon Sep 17 00:00:00 2001 From: phaller Date: Wed, 27 Jun 2012 19:10:21 +0200 Subject: [PATCH 08/39] Stash docs: add paragraphs about performance and restarts --- akka-docs/java/untyped-actors.rst | 8 ++++++++ akka-docs/scala/actors.rst | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/akka-docs/java/untyped-actors.rst b/akka-docs/java/untyped-actors.rst index a699cb7145..b6c63ef15a 100644 --- a/akka-docs/java/untyped-actors.rst +++ b/akka-docs/java/untyped-actors.rst @@ -599,6 +599,14 @@ mailbox). In case a bounded mailbox overflows, a ``MessageQueueAppendFailedException`` is thrown. The stash is guaranteed to be empty after calling ``unstashAll()``. +The stash is backed by a ``scala.collection.immutable.Vector``. As a +result, even a very large number of messages may be stashed without a +major impact on performance. + +Note that the stash is not persisted across restarts of an actor, +unlike the actor's mailbox. Therefore, it should be managed like other +parts of the actor's state which have the same property. + Killing an Actor ================ diff --git a/akka-docs/scala/actors.rst b/akka-docs/scala/actors.rst index d3a53408e2..3c1da0cc55 100644 --- a/akka-docs/scala/actors.rst +++ b/akka-docs/scala/actors.rst @@ -660,6 +660,10 @@ mailbox). In case a bounded mailbox overflows, a ``MessageQueueAppendFailedException`` is thrown. The stash is guaranteed to be empty after calling ``unstashAll()``. +The stash is backed by a ``scala.collection.immutable.Vector``. As a +result, even a very large number of messages may be stashed without a +major impact on performance. + .. warning:: Note that the ``Stash`` trait must be mixed into (a subclass of) the @@ -667,6 +671,10 @@ The stash is guaranteed to be empty after calling ``unstashAll()``. callback. This means it's not possible to write ``Actor with MyActor with Stash`` if ``MyActor`` overrides ``preRestart``. +Note that the stash is not persisted across restarts of an actor, +unlike the actor's mailbox. Therefore, it should be managed like other +parts of the actor's state which have the same property. + Killing an Actor ================ From 535df04dc2aca548f117e5f7c16f240ca20894e8 Mon Sep 17 00:00:00 2001 From: Joshua Gao Date: Wed, 27 Jun 2012 16:16:20 -0700 Subject: [PATCH 09/39] Update polling for compatibility with ZMQ ZMQ 2.0 poll() accepts its duration in microseconds, but 3.0+ accepts milliseconds, causing poll to block for 1000 times as long as it should. 
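To illustrate the mismatch, here is a sketch (not part of this change) of
picking the argument to pass to poll(), assuming the jzmq binding exposes
ZMQ.getMajorVersion():

    import java.util.concurrent.TimeUnit;
    import org.zeromq.ZMQ;

    final class PollTimeouts {
        // ZMQ 2.x interprets the poll timeout in microseconds, 3.x+ in
        // milliseconds: a desired wait of 100 ms must be passed as 100000
        // on 2.x but as 100 on 3.x and later.
        static long pollArgument(long desiredMillis) {
            TimeUnit unit = ZMQ.getMajorVersion() >= 3
                    ? TimeUnit.MILLISECONDS
                    : TimeUnit.MICROSECONDS;
            return unit.convert(desiredMillis, TimeUnit.MILLISECONDS);
        }
    }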
--- .../src/main/scala/akka/zeromq/ConcurrentSocketActor.scala | 2 +- akka-zeromq/src/main/scala/akka/zeromq/ZeroMQExtension.scala | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/akka-zeromq/src/main/scala/akka/zeromq/ConcurrentSocketActor.scala b/akka-zeromq/src/main/scala/akka/zeromq/ConcurrentSocketActor.scala index e1b1ba4ddf..bc3a9c27df 100644 --- a/akka-zeromq/src/main/scala/akka/zeromq/ConcurrentSocketActor.scala +++ b/akka-zeromq/src/main/scala/akka/zeromq/ConcurrentSocketActor.scala @@ -176,7 +176,7 @@ private[zeromq] class ConcurrentSocketActor(params: Seq[SocketOption]) extends A val duration = fromConfig getOrElse ZeroMQExtension(context.system).DefaultPollTimeout if (duration > Duration.Zero) { (msg: PollMsg) ⇒ // for positive timeout values, do poll (i.e. block this thread) - poller.poll(duration.toMicros) + ZeroMQExtension(context.system).poll(poller, duration) self ! msg } else { val d = -duration diff --git a/akka-zeromq/src/main/scala/akka/zeromq/ZeroMQExtension.scala b/akka-zeromq/src/main/scala/akka/zeromq/ZeroMQExtension.scala index 4bf52a41e3..70ef399f53 100644 --- a/akka-zeromq/src/main/scala/akka/zeromq/ZeroMQExtension.scala +++ b/akka-zeromq/src/main/scala/akka/zeromq/ZeroMQExtension.scala @@ -4,6 +4,7 @@ package akka.zeromq import org.zeromq.{ ZMQ ⇒ JZMQ } +import org.zeromq.ZMQ.Poller import akka.actor._ import akka.dispatch.{ Await } import akka.pattern.ask @@ -47,6 +48,10 @@ class ZeroMQExtension(system: ActorSystem) extends Extension { val DefaultPollTimeout: Duration = Duration(system.settings.config.getMilliseconds("akka.zeromq.poll-timeout"), TimeUnit.MILLISECONDS) val NewSocketTimeout: Timeout = Timeout(Duration(system.settings.config.getMilliseconds("akka.zeromq.new-socket-timeout"), TimeUnit.MILLISECONDS)) + val poll = + if (version.major >= 3) (poller: Poller, duration: Duration) ⇒ poller.poll(duration.toMillis) + else (poller: Poller, duration: Duration) ⇒ poller.poll(duration.toMicros) + /** * The version of the ZeroMQ library * @return a [[akka.zeromq.ZeroMQVersion]] From feb6798771f821b2a45568e4aa520423e9e63649 Mon Sep 17 00:00:00 2001 From: Josh Gao Date: Thu, 28 Jun 2012 14:31:53 -0700 Subject: [PATCH 10/39] Cache time unit for ZMQ poll duration --- .../scala/akka/zeromq/ConcurrentSocketActor.scala | 11 +++++++---- .../src/main/scala/akka/zeromq/ZeroMQExtension.scala | 4 +--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/akka-zeromq/src/main/scala/akka/zeromq/ConcurrentSocketActor.scala b/akka-zeromq/src/main/scala/akka/zeromq/ConcurrentSocketActor.scala index bc3a9c27df..1a54cbeb29 100644 --- a/akka-zeromq/src/main/scala/akka/zeromq/ConcurrentSocketActor.scala +++ b/akka-zeromq/src/main/scala/akka/zeromq/ConcurrentSocketActor.scala @@ -172,12 +172,15 @@ private[zeromq] class ConcurrentSocketActor(params: Seq[SocketOption]) extends A // this is a “PollMsg=>Unit” which either polls or schedules Poll, depending on the sign of the timeout private val doPollTimeout = { + val ext = ZeroMQExtension(context.system) val fromConfig = params collectFirst { case PollTimeoutDuration(duration) ⇒ duration } - val duration = fromConfig getOrElse ZeroMQExtension(context.system).DefaultPollTimeout - if (duration > Duration.Zero) { (msg: PollMsg) ⇒ + val duration = (fromConfig getOrElse ext.DefaultPollTimeout) + if (duration > Duration.Zero) { // for positive timeout values, do poll (i.e. block this thread) - ZeroMQExtension(context.system).poll(poller, duration) - self ! 
msg + val pollLength = duration.toUnit(ext.pollTimeUnit).toLong + (msg: PollMsg) ⇒ + poller.poll(pollLength) + self ! msg } else { val d = -duration diff --git a/akka-zeromq/src/main/scala/akka/zeromq/ZeroMQExtension.scala b/akka-zeromq/src/main/scala/akka/zeromq/ZeroMQExtension.scala index 70ef399f53..b5a5e29310 100644 --- a/akka-zeromq/src/main/scala/akka/zeromq/ZeroMQExtension.scala +++ b/akka-zeromq/src/main/scala/akka/zeromq/ZeroMQExtension.scala @@ -48,9 +48,7 @@ class ZeroMQExtension(system: ActorSystem) extends Extension { val DefaultPollTimeout: Duration = Duration(system.settings.config.getMilliseconds("akka.zeromq.poll-timeout"), TimeUnit.MILLISECONDS) val NewSocketTimeout: Timeout = Timeout(Duration(system.settings.config.getMilliseconds("akka.zeromq.new-socket-timeout"), TimeUnit.MILLISECONDS)) - val poll = - if (version.major >= 3) (poller: Poller, duration: Duration) ⇒ poller.poll(duration.toMillis) - else (poller: Poller, duration: Duration) ⇒ poller.poll(duration.toMicros) + val pollTimeUnit = if (version.major >= 3) TimeUnit.MILLISECONDS else TimeUnit.MICROSECONDS /** * The version of the ZeroMQ library From d7bed79730c7659c4bfb375808d4959fe1249862 Mon Sep 17 00:00:00 2001 From: Roland Date: Fri, 29 Jun 2012 14:42:11 +0200 Subject: [PATCH 11/39] add JavaTestKit, see #1952 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - it’s a completely new-written thing in pure Java, so that “protected” modifiers actually work and no ghost errors appear wrt. inheriting from PartialFunction or similar - it also features integration with the EventFilter - all closure-based constructs are modeled as inner classes of the JavaTestKit, where the user needs to override a single method which will then be executed --- .../src/main/scala/akka/event/Logging.scala | 1 + .../src/main/scala/akka/japi/JavaAPI.scala | 17 +- .../code/docs/testkit/TestKitDocTest.java | 350 +++++++++++++-- .../code/docs/testkit/TestKitSampleTest.java | 95 ++++ akka-docs/java/dispatchers.rst | 2 +- akka-docs/java/testing.rst | 414 ++++++------------ .../code/docs/testkit/TestkitDocSpec.scala | 11 + akka-docs/scala/dispatchers.rst | 2 +- akka-docs/scala/fsm.rst | 2 +- akka-docs/scala/testing.rst | 45 +- .../main/java/akka/testkit/JavaTestKit.java | 329 ++++++++++++++ .../scala/akka/testkit/TestActorRef.scala | 2 +- .../src/main/scala/akka/testkit/TestKit.scala | 48 +- .../akka/testkit/TestActorRefJavaCompile.java | 5 +- .../scala/akka/testkit/TestProbeSpec.scala | 6 +- 15 files changed, 964 insertions(+), 365 deletions(-) create mode 100644 akka-docs/java/code/docs/testkit/TestKitSampleTest.java create mode 100644 akka-testkit/src/main/java/akka/testkit/JavaTestKit.java diff --git a/akka-actor/src/main/scala/akka/event/Logging.scala b/akka-actor/src/main/scala/akka/event/Logging.scala index 0777d9aef1..111d5d5dd8 100644 --- a/akka-actor/src/main/scala/akka/event/Logging.scala +++ b/akka-actor/src/main/scala/akka/event/Logging.scala @@ -586,6 +586,7 @@ object Logging { /** Null Object used for errors without cause Throwable */ object NoCause extends NoStackTrace } + def noCause = Error.NoCause /** * For WARNING Logging diff --git a/akka-actor/src/main/scala/akka/japi/JavaAPI.scala b/akka-actor/src/main/scala/akka/japi/JavaAPI.scala index bc8c3c6ff9..d3123153da 100644 --- a/akka-actor/src/main/scala/akka/japi/JavaAPI.scala +++ b/akka-actor/src/main/scala/akka/japi/JavaAPI.scala @@ -46,7 +46,9 @@ trait Creator[T] { } object PurePartialFunction { - case object NoMatch extends 
RuntimeException with NoStackTrace + sealed abstract class NoMatchException extends RuntimeException with NoStackTrace + case object NoMatch extends NoMatchException + final def noMatch(): RuntimeException = NoMatch } /** @@ -86,7 +88,16 @@ abstract class PurePartialFunction[A, B] extends scala.runtime.AbstractFunction1 final def isDefinedAt(x: A): Boolean = try { apply(x, true); true } catch { case NoMatch ⇒ false } final def apply(x: A): B = try apply(x, false) catch { case NoMatch ⇒ throw new MatchError } - final def noMatch(): RuntimeException = NoMatch +} + +abstract class CachingPartialFunction[A, B <: AnyRef] extends scala.runtime.AbstractFunction1[A, B] with PartialFunction[A, B] { + import PurePartialFunction._ + + def `match`(x: A): B + + var cache: B = _ + final def isDefinedAt(x: A): Boolean = try { cache = `match`(x); true } catch { case NoMatch ⇒ cache = null.asInstanceOf[B]; false } + final def apply(x: A): B = cache } /** @@ -164,4 +175,6 @@ object Util { def manifest[T](clazz: Class[T]): Manifest[T] = Manifest.classType(clazz) def arrayToSeq[T](arr: Array[T]): Seq[T] = arr.toSeq + + def arrayToSeq(classes: Array[Class[_]]): Seq[Class[_]] = classes.toSeq } diff --git a/akka-docs/java/code/docs/testkit/TestKitDocTest.java b/akka-docs/java/code/docs/testkit/TestKitDocTest.java index d71f086104..a5f85019ea 100644 --- a/akka-docs/java/code/docs/testkit/TestKitDocTest.java +++ b/akka-docs/java/code/docs/testkit/TestKitDocTest.java @@ -1,22 +1,31 @@ -/* - * +/** + * Copyright (C) 2009-2012 Typesafe Inc. */ package docs.testkit; import static org.junit.Assert.*; import org.junit.AfterClass; +import org.junit.BeforeClass; import org.junit.Test; +import com.typesafe.config.ConfigFactory; +import com.typesafe.config.Config; + +import akka.actor.ActorKilledException; +import akka.actor.ActorRef; import akka.actor.ActorSystem; +import akka.actor.Kill; import akka.actor.Props; import akka.actor.UntypedActor; +import akka.actor.UntypedActorFactory; import akka.dispatch.Await; import akka.dispatch.Future; -import akka.japi.JAPI; -import akka.japi.PurePartialFunction; +import akka.testkit.CallingThreadDispatcher; +import akka.testkit.TestActor; +import akka.testkit.TestActor.AutoPilot; import akka.testkit.TestActorRef; -import akka.testkit.TestKit; +import akka.testkit.JavaTestKit; import akka.util.Duration; public class TestKitDocTest { @@ -37,8 +46,11 @@ public class TestKitDocTest { private static ActorSystem system; - public TestKitDocTest() { - system = ActorSystem.create(); + @BeforeClass + public static void setup() { + final Config config = ConfigFactory.parseString( + "akka.event-handlers = [akka.testkit.TestEventListener]"); + system = ActorSystem.create("demoSystem", config); } @AfterClass @@ -50,82 +62,338 @@ public class TestKitDocTest { @Test public void demonstrateTestActorRef() { final Props props = new Props(MyActor.class); - final TestActorRef ref = TestActorRef.apply(props, system); + final TestActorRef ref = TestActorRef.create(system, props, "testA"); final MyActor actor = ref.underlyingActor(); assertTrue(actor.testMe()); } //#test-actor-ref - //#test-behavior @Test public void demonstrateAsk() throws Exception { + //#test-behavior final Props props = new Props(MyActor.class); - final TestActorRef ref = TestActorRef.apply(props, system); + final TestActorRef ref = TestActorRef.create(system, props, "testB"); final Future future = akka.pattern.Patterns.ask(ref, "say42", 3000); assertTrue(future.isCompleted()); assertEquals(42, Await.result(future, Duration.Zero())); + 
//#test-behavior } - //#test-behavior - //#test-expecting-exceptions @Test public void demonstrateExceptions() { + //#test-expecting-exceptions final Props props = new Props(MyActor.class); - final TestActorRef ref = TestActorRef.apply(props, system); + final TestActorRef ref = TestActorRef.create(system, props, "myActor"); try { ref.receive(new Exception("expected")); fail("expected an exception to be thrown"); } catch (Exception e) { assertEquals("expected", e.getMessage()); } + //#test-expecting-exceptions } - //#test-expecting-exceptions - //#test-within @Test public void demonstrateWithin() { - new TestKit(system) {{ - testActor().tell(42); - new Within(Duration.parse("1 second")) { + //#test-within + new JavaTestKit(system) {{ + getRef().tell(42); + new Within(Duration.Zero(), Duration.parse("1 second")) { // do not put code outside this method, will run afterwards public void run() { assertEquals((Integer) 42, expectMsgClass(Integer.class)); } }; }}; + //#test-within } - //#test-within @Test - public void demonstrateExpectMsgPF() { - new TestKit(system) {{ - testActor().tell(42); - //#test-expect-pf - final String out = expectMsgPF(Duration.parse("1 second"), "fourty-two", - new PurePartialFunction() { - public String apply(Object in, boolean isCheck) { - if (Integer.valueOf(42).equals(in)) { + public void demonstrateExpectMsg() { + //#test-expectmsg + new JavaTestKit(system) {{ + getRef().tell(42); + final String out = new ExpectMsg("match hint") { + // do not put code outside this method, will run afterwards + protected String match(Object in) { + if (in instanceof Integer) { return "match"; } else { throw noMatch(); } } - } - ); + }.get(); // this extracts the received message assertEquals("match", out); - //#test-expect-pf - testActor().tell("world"); - //#test-expect-anyof - final String any = expectMsgAnyOf(remaining(), JAPI.seq("hello", "world")); - //#test-expect-anyof - assertEquals("world", any); - testActor().tell("world"); - //#test-expect-anyclassof - @SuppressWarnings("unchecked") - final String anyClass = expectMsgAnyClassOf(remaining(), JAPI.>seq(String.class)); - //#test-expect-anyclassof - assertEquals("world", any); + }}; + //#test-expectmsg + } + + @Test + public void demonstrateReceiveWhile() { + //#test-receivewhile + new JavaTestKit(system) {{ + getRef().tell(42); + getRef().tell(43); + getRef().tell("hello"); + final String[] out = + new ReceiveWhile(String.class, duration("1 second")) { + // do not put code outside this method, will run afterwards + protected String match(Object in) { + if (in instanceof Integer) { + return in.toString(); + } else { + throw noMatch(); + } + } + }.get(); // this extracts the received messages + assertArrayEquals(new String[] {"42", "43"}, out); + expectMsgEquals("hello"); + }}; + //#test-receivewhile + new JavaTestKit(system) {{ + //#test-receivewhile-full + new ReceiveWhile( // type of array to be created must match ... + String.class, // ... 
this class which is needed to that end + duration("100 millis"), // maximum collect time + duration("50 millis"), // maximum time between messages + 12 // maximum number of messages to collect + ) { + //#match-elided + protected String match(Object in) { + throw noMatch(); + } + //#match-elided + }; + //#test-receivewhile-full }}; } + @Test + public void demonstrateAwaitCond() { + //#test-awaitCond + new JavaTestKit(system) {{ + getRef().tell(42); + new AwaitCond( + duration("1 second"), // maximum wait time + duration("100 millis") // interval at which to check the condition + ) { + // do not put code outside this method, will run afterwards + protected boolean cond() { + // typically used to wait for something to start up + return msgAvailable(); + } + }; + }}; + //#test-awaitCond + } + + @Test + @SuppressWarnings("unchecked") // due to generic varargs + public void demonstrateExpect() { + new JavaTestKit(system) {{ + getRef().tell("hello"); + getRef().tell("hello"); + getRef().tell("hello"); + getRef().tell("world"); + getRef().tell(42); + getRef().tell(42); + //#test-expect + final String hello = expectMsgEquals("hello"); + final Object any = expectMsgAnyOf("hello", "world"); + final Object[] all = expectMsgAllOf("hello", "world"); + final int i = expectMsgClass(Integer.class); + final Number j = expectMsgAnyClassOf(Integer.class, Long.class); + expectNoMsg(); + //#test-expect + assertEquals("hello", hello); + assertEquals("hello", any); + assertEquals(42, i); + assertEquals(42, j); + assertArrayEquals(new String[] {"hello", "world"}, all); + }}; + } + + @Test + public void demonstrateIgnoreMsg() { + //#test-ignoreMsg + new JavaTestKit(system) {{ + // ignore all Strings + new IgnoreMsg() { + protected boolean ignore(Object msg) { + return msg instanceof String; + } + }; + getRef().tell("hello"); + getRef().tell(42); + expectMsgEquals(42); + // remove message filter + ignoreNoMsg(); + getRef().tell("hello"); + expectMsgEquals("hello"); + }}; + //#test-ignoreMsg + } + + @Test + public void demonstrateDilated() { + //#duration-dilation + new JavaTestKit(system) {{ + final Duration original = duration("1 second"); + final Duration stretched = dilated(original); + assertTrue("dilated", stretched.gteq(original)); + }}; + //#duration-dilation + } + + @Test + public void demonstrateProbe() { + //#test-probe + // simple actor which just forwards messages + class Forwarder extends UntypedActor { + final ActorRef target; + public Forwarder(ActorRef target) { + this.target = target; + } + public void onReceive(Object msg) { + target.forward(msg, getContext()); + } + } + + new JavaTestKit(system) {{ + // create a test probe + final JavaTestKit probe = new JavaTestKit(system); + + // create a forwarder, injecting the probe’s testActor + final Props props = new Props(new UntypedActorFactory() { + private static final long serialVersionUID = 8927158735963950216L; + public UntypedActor create() { + return new Forwarder(probe.getRef()); + } + }); + final ActorRef forwarder = system.actorOf(props, "forwarder"); + + // verify correct forwarding + forwarder.tell(42, getRef()); + probe.expectMsgEquals(42); + assertEquals(getRef(), probe.getLastSender()); + }}; + //#test-probe + } + + @Test + public void demonstrateSpecialProbe() { + //#test-special-probe + new JavaTestKit(system) {{ + class MyProbe extends JavaTestKit { + public MyProbe() { + super(system); + } + public void assertHello() { + expectMsgEquals("hello"); + } + } + + final MyProbe probe = new MyProbe(); + probe.getRef().tell("hello"); + 
probe.assertHello(); + }}; + //#test-special-probe + } + + @Test + public void demonstrateReply() { + //#test-probe-reply + new JavaTestKit(system) {{ + final JavaTestKit probe = new JavaTestKit(system); + probe.getRef().tell("hello", getRef()); + probe.expectMsgEquals("hello"); + probe.reply("world"); + expectMsgEquals("world"); + assertEquals(probe.getRef(), getLastSender()); + }}; + //#test-probe-reply + } + + @Test + public void demonstrateForward() { + //#test-probe-forward + new JavaTestKit(system) {{ + final JavaTestKit probe = new JavaTestKit(system); + probe.getRef().tell("hello", getRef()); + probe.expectMsgEquals("hello"); + probe.forward(getRef()); + expectMsgEquals("hello"); + assertEquals(getRef(), getLastSender()); + }}; + //#test-probe-forward + } + + @Test + public void demonstrateWithinProbe() { + try { + //#test-within-probe + new JavaTestKit(system) {{ + final JavaTestKit probe = new JavaTestKit(system); + new Within(duration("1 second")) { + public void run() { + probe.expectMsgEquals("hello"); + } + }; + }}; + //#test-within-probe + } catch (AssertionError e) { + // expected to fail + } + } + + @Test + public void demonstrateAutoPilot() { + //#test-auto-pilot + new JavaTestKit(system) {{ + final JavaTestKit probe = new JavaTestKit(system); + // install auto-pilot + probe.setAutoPilot(new TestActor.AutoPilot() { + public AutoPilot run(ActorRef sender, Object msg) { + sender.tell(msg); + return noAutoPilot(); + } + }); + // first one is replied to directly ... + probe.getRef().tell("hello", getRef()); + expectMsgEquals("hello"); + // ... but then the auto-pilot switched itself off + probe.getRef().tell("world", getRef()); + expectNoMsg(); + }}; + //#test-auto-pilot + } + + // only compilation + public void demonstrateCTD() { + //#calling-thread-dispatcher + system.actorOf( + new Props(MyActor.class) + .withDispatcher(CallingThreadDispatcher.Id())); + //#calling-thread-dispatcher + } + + @Test + public void demonstrateEventFilter() { + //#test-event-filter + new JavaTestKit(system) {{ + assertEquals("demoSystem", system.name()); + final ActorRef victim = system.actorOf(Props.empty(), "victim"); + + final int result = new EventFilter(ActorKilledException.class) { + protected Integer run() { + victim.tell(Kill.getInstance()); + return 42; + } + }.from("akka://demoSystem/user/victim").occurrences(1).exec(); + + assertEquals(42, result); + }}; + //#test-event-filter + } + } diff --git a/akka-docs/java/code/docs/testkit/TestKitSampleTest.java b/akka-docs/java/code/docs/testkit/TestKitSampleTest.java new file mode 100644 index 0000000000..ba235fad15 --- /dev/null +++ b/akka-docs/java/code/docs/testkit/TestKitSampleTest.java @@ -0,0 +1,95 @@ +/** + * Copyright (C) 2009-2012 Typesafe Inc. 
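+ * Sample test demonstrating JavaTestKit usage: injected probes, Within
+ * blocks and AwaitCond.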
+ */ +package docs.testkit; + +//#fullsample +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import akka.actor.ActorRef; +import akka.actor.ActorSystem; +import akka.actor.Props; +import akka.actor.UntypedActor; +import akka.testkit.JavaTestKit; +import akka.util.Duration; + +public class TestKitSampleTest { + + public static class SomeActor extends UntypedActor { + ActorRef target = null; + + public void onReceive(Object msg) { + + if (msg.equals("hello")) { + getSender().tell("world"); + if (target != null) target.forward(msg, getContext()); + + } else if (msg instanceof ActorRef) { + target = (ActorRef) msg; + getSender().tell("done"); + } + } + } + + static ActorSystem system; + + @BeforeClass + public static void setup() { + system = ActorSystem.create(); + } + + @AfterClass + public static void teardown() { + system.shutdown(); + } + + @Test + public void testIt() { + /* + * Wrap the whole test procedure within a testkit constructor + * if you want to receive actor replies or use Within(), etc. + */ + new JavaTestKit(system) {{ + final Props props = new Props(SomeActor.class); + final ActorRef subject = system.actorOf(props); + + // can also use JavaTestKit “from the outside” + final JavaTestKit probe = new JavaTestKit(system); + // “inject” the probe by passing it to the test subject + // like a real resource would be passed in production + subject.tell(probe.getRef(), getRef()); + // await the correct response + expectMsgEquals(duration("1 second"), "done"); + + // the run() method needs to finish within 3 seconds + new Within(duration("3 seconds")) { + protected void run() { + + subject.tell("hello", getRef()); + + // This is a demo: would normally use expectMsgEquals(). + // Wait time is bounded by 3-second deadline above. + new AwaitCond() { + protected boolean cond() { + return probe.msgAvailable(); + } + }; + + // response must have been enqueued to us before probe + expectMsgEquals(Duration.Zero(), "world"); + // check that the probe we injected earlier got the msg + probe.expectMsgEquals(Duration.Zero(), "hello"); + Assert.assertEquals(getRef(), probe.getLastSender()); + + // Will wait for the rest of the 3 seconds + expectNoMsg(); + } + }; + }}; + } + +} +//#fullsample diff --git a/akka-docs/java/dispatchers.rst b/akka-docs/java/dispatchers.rst index 2723883e9c..023424e687 100644 --- a/akka-docs/java/dispatchers.rst +++ b/akka-docs/java/dispatchers.rst @@ -92,7 +92,7 @@ There are 4 different types of message dispatchers: * CallingThreadDispatcher - This dispatcher runs invocations on the current thread only. This dispatcher does not create any new threads, - but it can be used from different threads concurrently for the same actor. See :ref:`TestCallingThreadDispatcherRef` + but it can be used from different threads concurrently for the same actor. See :ref:`Java-CallingThreadDispatcher` for details and restrictions. - Sharability: Unlimited diff --git a/akka-docs/java/testing.rst b/akka-docs/java/testing.rst index 6aa31ff633..bab663b355 100644 --- a/akka-docs/java/testing.rst +++ b/akka-docs/java/testing.rst @@ -141,170 +141,152 @@ Feel free to experiment with the possibilities, and if you find useful patterns, don't hesitate to let the Akka forums know about them! Who knows, common operations might even be worked into nice DSLs. 
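+As a condensed reference, the three access patterns exercised in
+``TestKitDocTest`` above can be sketched as follows (only a sketch: the
+actor name is arbitrary and ``MyActor`` is the test actor from that file)::
+
+   final Props props = new Props(MyActor.class);
+   final TestActorRef ref = TestActorRef.create(system, props, "sketch");
+   final MyActor actor = ref.underlyingActor(); // reach into actor state
+   ref.receive("hello"); // runs onReceive synchronously on this thread
+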
-Integration Testing with :class:`TestKit`
-=========================================
+Integration Testing with :class:`JavaTestKit`
+=============================================
 
 When you are reasonably sure that your actor's business logic is correct, the
-next step is verifying that it works correctly within its intended environment
-(if the individual actors are simple enough, possibly because they use the
-:mod:`FSM` module, this might also be the first step). The definition of the
-environment depends of course very much on the problem at hand and the level at
-which you intend to test, ranging for functional/integration tests to full
-system tests. The minimal setup consists of the test procedure, which provides
-the desired stimuli, the actor under test, and an actor receiving replies.
-Bigger systems replace the actor under test with a network of actors, apply
-stimuli at varying injection points and arrange results to be sent from
-different emission points, but the basic principle stays the same in that a
-single procedure drives the test.
+next step is verifying that it works correctly within its intended environment.
+The definition of the environment depends of course very much on the problem at
+hand and the level at which you intend to test, ranging from
+functional/integration tests to full system tests. The minimal setup consists
+of the test procedure, which provides the desired stimuli, the actor under
+test, and an actor receiving replies. Bigger systems replace the actor under
+test with a network of actors, apply stimuli at varying injection points and
+arrange results to be sent from different emission points, but the basic
+principle stays the same in that a single procedure drives the test.
 
-The :class:`TestKit` class contains a collection of tools which makes this
+The :class:`JavaTestKit` class contains a collection of tools which makes this
 common task easy.
 
-.. includecode:: code/docs/testkit/PlainWordTest.java#plain-spec
+.. includecode:: code/docs/testkit/TestKitSampleTest.java#fullsample
 
-The :class:`TestKit` contains an actor named :obj:`testActor` which is the
+The :class:`JavaTestKit` contains an actor named :obj:`testActor` which is the
 entry point for messages to be examined with the various ``expectMsg...``
-assertions detailed below. When mixing in the trait ``ImplicitSender`` this
-test actor is implicitly used as sender reference when dispatching messages
-from the test procedure. The :obj:`testActor` may also be passed to
-other actors as usual, usually subscribing it as notification listener. There
-is a whole set of examination methods, e.g. receiving all consecutive messages
-matching certain criteria, receiving a whole sequence of fixed messages or
-classes, receiving nothing for some time, etc.
+assertions detailed below. The test actor’s reference is obtained using the
+:meth:`getRef()` method as demonstrated above. The :obj:`testActor` may also
+be passed to other actors as usual, usually subscribing it as a notification
+listener. There is a whole set of examination methods, e.g. receiving all
+consecutive messages matching certain criteria, receiving a whole sequence of
+fixed messages or classes, receiving nothing for some time, etc.
 
-The ActorSystem passed in to the constructor of TestKit is accessible via the
-:meth:`system()` method. Remember to shut down the actor system after the test
-is finished (also in case of failure) so that all actors—including the test
-actor—are stopped.
+The ActorSystem passed in to the constructor of JavaTestKit is accessible via the +:meth:`getSystem()` method. + +.. note:: + + Remember to shut down the actor system after the test is finished (also in + case of failure) so that all actors—including the test actor—are stopped. Built-In Assertions ------------------- -The above mentioned :meth:`expectMsg` is not the only method for formulating -assertions concerning received messages. Here is the full list: +The above mentioned :meth:`expectMsgEquals` is not the only method for +formulating assertions concerning received messages, the full set is this: - * :meth:` T expectMsg(Duration d, T msg): T` +.. includecode:: code/docs/testkit/TestKitDocTest.java#test-expect + +In these examples, the maximum durations you will find mentioned below are left +out, in which case they use the default value from configuration item +``akka.test.single-expect-default`` which itself defaults to 3 seconds (or they +obey the innermost enclosing :class:`Within` as detailed :ref:`below +`). The full signatures are: + + * :meth:`public  T expectMsgEquals(Duration max, T msg)` The given message object must be received within the specified time; the object will be returned. - * :meth:` T expectMsgPF(Duration d, PartialFunction pf)` + * :meth:`public Object expectMsgAnyOf(Duration max, Object... msg)` - Within the given time period, a message must be received and the given - partial function must be defined for that message; the result from applying - the partial function to the received message is returned. + An object must be received within the given time, and it must be equal + (compared with ``equals()``) to at least one of the passed reference + objects; the received object will be returned. - .. includecode:: code/docs/testkit/TestKitDocTest.java#test-expect-pf - - * :meth:` T expectMsgClass(Duration d, Class c)` - - An object which is an instance of the given :class:`Class` must be received - within the allotted time frame; the object will be returned. Note that this - does a conformance check; if you need the class to be equal, have a look at - :meth:`expectMsgAllClassOf` with a single given class argument. - - * :meth:` T expectMsgAnyOf(Duration d, Seq obj)` - - An object must be received within the given time, and it must be equal ( - compared with ``equals()``) to at least one of the passed reference objects; the - received object will be returned. - - .. includecode:: code/docs/testkit/TestKitDocTest.java#test-expect-anyof - - * :meth:` T expectMsgAnyClassOf(Duration d, Seq> classes)` - - An object must be received within the given time, and it must be an - instance of at least one of the supplied :class:`Class` objects; the - received object will be returned. - - .. includecode:: code/docs/testkit/TestKitDocTest.java#test-expect-anyclassof - - * :meth:`expectMsgAllOf[T](d: Duration, obj: T*): Seq[T]` + * :meth:`public Object[] expectMsgAllOf(Duration max, Object... msg)` A number of objects matching the size of the supplied object array must be received within the given time, and for each of the given objects there - must exist at least one among the received ones which equals (compared with - ``==``) it. The full sequence of received objects is returned. + must exist at least one among the received ones which equals it (compared + with ``equals()``). The full sequence of received objects is returned in + the order received. 
- * :meth:`expectMsgAllClassOf[T](d: Duration, c: Class[_ <: T]*): Seq[T]` + * :meth:`public  T expectMsgClass(Duration max, Class c)` - A number of objects matching the size of the supplied :class:`Class` array - must be received within the given time, and for each of the given classes - there must exist at least one among the received objects whose class equals - (compared with ``==``) it (this is *not* a conformance check). The full - sequence of received objects is returned. + An object which is an instance of the given :class:`Class` must be received + within the allotted time frame; the object will be returned. Note that this + does a conformance check, if you need the class to be equal you need to + verify that afterwards. - * :meth:`expectMsgAllConformingOf[T](d: Duration, c: Class[_ <: T]*): Seq[T]` + * :meth:`public  T expectMsgAnyClassOf(Duration max, Class... c)` - A number of objects matching the size of the supplied :class:`Class` array - must be received within the given time, and for each of the given classes - there must exist at least one among the received objects which is an - instance of this class. The full sequence of received objects is returned. + An object must be received within the given time, and it must be an + instance of at least one of the supplied :class:`Class` objects; the + received object will be returned. Note that this does a conformance check, + if you need the class to be equal you need to verify that afterwards. - * :meth:`expectNoMsg(d: Duration)` + .. note:: + + Because of a limitation in Java’s type system it may be necessary to add + ``@SuppressWarnings("unchecked")`` when using this method. + + * :meth:`public void expectNoMsg(Duration max)` No message must be received within the given time. This also fails if a message has been received before calling this method which has not been removed from the queue using one of the other methods. - * :meth:`receiveN(n: Int, d: Duration): Seq[AnyRef]` +For cases which require more refined conditions there are constructs which take +code blocks: - ``n`` messages must be received within the given time; the received - messages are returned. + * **ExpectMsg** - * :meth:`fishForMessage(max: Duration, hint: String)(pf: PartialFunction[Any, Boolean]): Any` + .. includecode:: code/docs/testkit/TestKitDocTest.java#test-expectmsg - Keep receiving messages as long as the time is not used up and the partial - function matches and returns ``false``. Returns the message received for - which it returned ``true`` or throws an exception, which will include the - provided hint for easier debugging. + The :meth:`match(Object in)` method will be evaluated once a message has + been received within the allotted time (which may be given as constructor + argument). If it throws ``noMatch()`` (where it is sufficient to call that + method; the ``throw`` keyword is only needed in cases where the compiler + would otherwise complain about wrong return types—Java is lacking Scala’s + notion of a type which signifies “will not ever return normally”), then the + expectation fails with an :class:`AssertionError`, otherwise the matched + and possibly transformed object is stored for retrieval using the + :meth:`get()` method. -In addition to message reception assertions there are also methods which help -with message flows: + * **ReceiveWhile** - * :meth:`receiveOne(d: Duration): AnyRef` + .. 
includecode:: code/docs/testkit/TestKitDocTest.java#test-receivewhile
 
-   Tries to receive one message for at most the given time interval and
-   returns ``null`` in case of failure. If the given Duration is zero, the
-   call is non-blocking (polling mode).
+   This construct works like ExpectMsg, but it continually collects messages
+   as long as they match the criteria, and it does not fail when a
+   non-matching one is encountered. Collecting messages also ends when the
+   time is up, when too much time passes between messages or when enough
+   messages have been received.
 
- * :meth:`receiveWhile[T](max: Duration, idle: Duration, messages: Int)(pf: PartialFunction[Any, T]): Seq[T]`
+   .. includecode:: code/docs/testkit/TestKitDocTest.java#test-receivewhile-full
+      :exclude: match-elided
 
-   Collect messages as long as
+   The need to specify the ``String`` result type twice results from the need
+   to create a correctly typed array and Java’s inability to infer the class’s
+   type argument.
 
-   * they are matching the given partial function
-   * the given time interval is not used up
-   * the next message is received within the idle timeout
-   * the number of messages has not yet reached the maximum
+ * **AwaitCond**
 
-   All collected messages are returned. The maximum duration defaults to the
-   time remaining in the innermost enclosing :ref:`within ` block
-   and the idle duration defaults to infinity (thereby disabling the
-   idle timeout feature). The number of expected messages defaults to
-   ``Int.MaxValue``, which effectively disables this limit.
+   .. includecode:: code/docs/testkit/TestKitDocTest.java#test-awaitCond
 
- * :meth:`awaitCond(p: => Boolean, max: Duration, interval: Duration)`
+   This general construct is not connected with the test kit’s message
+   reception; the embedded condition can compute the boolean result from
+   anything in scope.
 
-   Poll the given condition every :obj:`interval` until it returns ``true`` or
-   the :obj:`max` duration is used up. The interval defaults to 100 ms and the
-   maximum defaults to the time remaining in the innermost enclosing
-   :ref:`within ` block.
+There are also cases where not all messages sent to the test kit are actually
+relevant to the test, but removing them would mean altering the actors under
+test. For this purpose it is possible to ignore certain messages:
 
- * :meth:`ignoreMsg(pf: PartialFunction[AnyRef, Boolean])`
+ * **IgnoreMsg**
 
-   :meth:`ignoreNoMsg`
+   .. includecode:: code/docs/testkit/TestKitDocTest.java#test-ignoreMsg
 
-Expecting Exceptions
---------------------
+Expecting Log Messages
+----------------------
 
 Since an integration test does not allow access to the internal processing of
 the participating actors, verifying expected exceptions cannot be done
 directly.
@@ -313,9 +295,23 @@ handler with the :class:`TestEventListener` and using an :class:`EventFilter`
 allows assertions on log messages, including those which are generated by
 exceptions:
 
-.. includecode:: code/docs/testkit/TestKitDocTest.java#event-filter
+.. includecode:: code/docs/testkit/TestKitDocTest.java#test-event-filter
 
+If a number of occurrences is specified—as demonstrated above—then ``exec()``
+will block until that number of matching messages have been received or the
+timeout configured in ``akka.test.filter-leeway`` is used up (time starts
+counting after the ``run()`` method returns). In case of a timeout the test
+fails.
+
+.. note::
+
+   Be sure to exchange the default event handler with the
+   :class:`TestEventListener` in your ``application.conf`` to enable this
+   function::
+
+     akka.event-handlers = [akka.testkit.TestEventListener]
+
+.. _JavaTestKit.within:
 
 Timing Assertions
 -----------------
@@ -327,17 +323,13 @@ the positive or negative result must be obtained. Lower time limits need to
 be checked external to the examination, which is facilitated by a new construct
 for managing time constraints:
 
-.. code-block:: scala
+.. includecode:: code/docs/testkit/TestKitDocTest.java#test-within
 
-   within([min, ]max) {
-     ...
-   }
-
-The block given to :meth:`within` must complete after a :ref:`Duration` which
+The block in :meth:`Within.run()` must complete after a :ref:`Duration` which
 is between :obj:`min` and :obj:`max`, where the former defaults to zero. The
 deadline calculated by adding the :obj:`max` parameter to the block's start
 time is implicitly available within the block to all examination methods; if
-you do not specify it, is is inherited from the innermost enclosing
+you do not specify it, it is inherited from the innermost enclosing
 :meth:`within` block.
 
 It should be noted that if the last message-receiving assertion of the block is
 :meth:`expectNoMsg` or :meth:`receiveWhile`, the final check of the
 :meth:`within` is skipped in order to avoid false positives due to wake-up
 latencies. This means that while individual contained assertions still use the
 maximum time bound, the overall block may take arbitrarily longer in this case.
 
-.. includecode:: code/docs/testkit/TestKitDocTest.java#test-within
-
 .. note::
 
   All times are measured using ``System.nanoTime``, meaning that they describe
-  wall time, not CPU time.
-
-Ray Roestenburg has written a great article on using the TestKit:
-``_.
-His full example is also available :ref:`here `.
+  wall time, not CPU time or system time.
 
 Accounting for Slow Test Systems
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -371,33 +357,22 @@ in ``akka.testkit`` package object to add dilated function to
 :class:`Duration`.
 
 .. includecode:: code/docs/testkit/TestKitDocTest.java#duration-dilation
 
-Resolving Conflicts with Implicit ActorRef
-------------------------------------------
-
-If you want the sender of messages inside your TestKit-based tests to be the ``testActor``
-simply mix in ``ÌmplicitSender`` into your test.
-
-.. includecode:: code/docs/testkit/PlainWordSpec.scala#implicit-sender
-
 Using Multiple Probe Actors
 ---------------------------
 
 When the actors under test are supposed to send various messages to different
 destinations, it may be difficult to distinguish the message streams arriving
-at the :obj:`testActor` when using the :class:`TestKit` as a mixin. Another
-approach is to use it for creation of simple probe actors to be inserted in the
-message flows. To make this more powerful and convenient, there is a concrete
-implementation called :class:`TestProbe`. The functionality is best explained
-using a small example:
+at the :obj:`testActor` when using the :class:`JavaTestKit` as shown until now.
+Another approach is to use it for creation of simple probe actors to be
+inserted in the message flows.
The functionality is best explained using a +small example: -.. includecode:: code/docs/testkit/TestKitDocTest.java - :include: imports-test-probe,my-double-echo,test-probe +.. includecode:: code/docs/testkit/TestKitDocTest.java#test-probe -Here a the system under test is simulated by :class:`MyDoubleEcho`, which is -supposed to mirror its input to two outputs. Attaching two test probes enables -verification of the (simplistic) behavior. Another example would be two actors -A and B which collaborate by A sending messages to B. In order to verify this -message flow, a :class:`TestProbe` could be inserted as target of A, using the +This simple test verifies an equally simple Forwarder actor by injecting a +probe as the forwarder’s target. Another example would be two actors A and B +which collaborate by A sending messages to B. In order to verify this message +flow, a :class:`TestProbe` could be inserted as target of A, using the forwarding capabilities or auto-pilot described below to include a real B in the test setup. @@ -407,33 +382,28 @@ more concise and clear: .. includecode:: code/docs/testkit/TestKitDocTest.java :include: test-special-probe -You have complete flexibility here in mixing and matching the :class:`TestKit` -facilities with your own checks and choosing an intuitive name for it. In real -life your code will probably be a bit more complicated than the example given -above; just use the power! +You have complete flexibility here in mixing and matching the +:class:`JavaTestKit` facilities with your own checks and choosing an intuitive +name for it. In real life your code will probably be a bit more complicated +than the example given above; just use the power! Replying to Messages Received by Probes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The probes keep track of the communications channel for replies, if possible, -so they can also reply: +The probe stores the sender of the last dequeued message (i.e. after its +``expectMsg*`` reception), which may be retrieved using the +:meth:`getLastSender()` method. This information can also implicitly be used +for having the probe reply to the last received message: .. includecode:: code/docs/testkit/TestKitDocTest.java#test-probe-reply Forwarding Messages Received by Probes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Given a destination actor ``dest`` which in the nominal actor network would -receive a message from actor ``source``. If you arrange for the message to be -sent to a :class:`TestProbe` ``probe`` instead, you can make assertions -concerning volume and timing of the message flow while still keeping the -network functioning: +The probe can also forward a received message (i.e. after its ``expectMsg*`` +reception), retaining the original sender: -.. includecode:: code/docs/testkit/TestKitDocTest.java - :include: test-probe-forward-actors,test-probe-forward - -The ``dest`` actor will receive the same message invocation as if no test probe -had intervened. +.. includecode:: code/docs/testkit/TestKitDocTest.java#test-probe-forward Auto-Pilot ^^^^^^^^^^ @@ -445,7 +415,7 @@ keep a test running and verify traces later you can also install an This code can be used to forward messages, e.g. in a chain ``A --> Probe --> B``, as long as a certain protocol is obeyed. -.. includecode:: ../../akka-testkit/src/test/scala/akka/testkit/TestProbeSpec.scala#autopilot +.. 
includecode:: code/docs/testkit/TestKitDocTest.java#test-auto-pilot The :meth:`run` method must return the auto-pilot for the next message, wrapped in an :class:`Option`; setting it to :obj:`None` terminates the auto-pilot. @@ -455,25 +425,15 @@ Caution about Timing Assertions The behavior of :meth:`within` blocks when using test probes might be perceived as counter-intuitive: you need to remember that the nicely scoped deadline as -described :ref:`above ` is local to each probe. Hence, probes +described :ref:`above ` is local to each probe. Hence, probes do not react to each other's deadlines or to the deadline set in an enclosing -:class:`TestKit` instance:: +:class:`JavaTestKit` instance: - class SomeTest extends TestKit(_system: ActorSystem) with ImplicitSender { +.. includecode:: code/docs/testkit/TestKitDocTest.java#test-within-probe - val probe = TestProbe() +Here, the ``expectMsgEquals`` call will use the default timeout. - within(100 millis) { - probe.expectMsg("hallo") // Will hang forever! - } - } - -This test will hang indefinitely, because the :meth:`expectMsg` call does not -see any deadline. Currently, the only option is to use ``probe.within`` in the -above code to make it work; later versions may include lexically scoped -deadlines using implicit arguments. - -.. _TestCallingThreadDispatcherRef: +.. _Java-CallingThreadDispatcher: CallingThreadDispatcher ======================= @@ -572,7 +532,7 @@ has to offer: exception stack traces - Exclusion of certain classes of dead-lock scenarios -.. _actor.logging: +.. _actor.logging-java: Tracing Actor Invocations ========================= @@ -588,24 +548,6 @@ options: This is always on; in contrast to the other logging mechanisms, this logs at ``ERROR`` level. -* *Logging of message invocations on certain actors* - - This is enabled by a setting in the :ref:`configuration` — namely - ``akka.actor.debug.receive`` — which enables the :meth:`loggable` - statement to be applied to an actor’s :meth:`receive` function: - -.. includecode:: code/docs/testkit/TestKitDocTest.java#logging-receive - -. - If the abovementioned setting is not given in the :ref:`configuration`, this method will - pass through the given :class:`Receive` function unmodified, meaning that - there is no runtime cost unless actually enabled. - - The logging feature is coupled to this specific local mark-up because - enabling it uniformly on all actors is not usually what you need, and it - would lead to endless loops if it were applied to :class:`EventHandler` - listeners. - * *Logging of special messages* Actors handle certain special messages automatically, e.g. :obj:`Kill`, @@ -626,74 +568,10 @@ full logging of actor activities using this configuration fragment:: loglevel = DEBUG actor { debug { - receive = on autoreceive = on lifecycle = on } } } -Different Testing Frameworks -============================ - -Akka’s own test suite is written using `ScalaTest`_, -which also shines through in documentation examples. However, the TestKit and -its facilities do not depend on that framework, you can essentially use -whichever suits your development style best. - -This section contains a collection of known gotchas with some other frameworks, -which is by no means exhaustive and does not imply endorsement or special -support. - -When you need it to be a trait ------------------------------- - -If for some reason it is a problem to inherit from :class:`TestKit` due to it -being a concrete class instead of a trait, there’s :class:`TestKitBase`: - -.. 
includecode:: code/docs/testkit/TestKitDocTest.java - :include: test-kit-base - :exclude: put-your-test-code-here - -The ``implicit lazy val system`` must be declared exactly like that (you can of -course pass arguments to the actor system factory as needed) because trait -:class:`TestKitBase` needs the system during its construction. - -.. warning:: - - Use of the trait is discouraged because of potential issues with binary - backwards compatibility in the future, use at own risk. - -Specs2 ------- - -Some `Specs2`_ users have contributed examples of how to work around some clashes which may arise: - -* Mixing TestKit into :class:`org.specs2.mutable.Specification` results in a - name clash involving the ``end`` method (which is a private variable in - TestKit and an abstract method in Specification); if mixing in TestKit first, - the code may compile but might then fail at runtime. The work-around—which is - actually beneficial also for the third point—is to apply the TestKit together - with :class:`org.specs2.specification.Scope`. -* The Specification traits provide a :class:`Duration` DSL which uses partly - the same method names as :class:`akka.util.Duration`, resulting in ambiguous - implicits if ``akka.util.duration._`` is imported. There are two work-arounds: - - * either use the Specification variant of Duration and supply an implicit - conversion to the Akka Duration. This conversion is not supplied with the - Akka distribution because that would mean that our JAR files would dependon - Specs2, which is not justified by this little feature. - - * or mix :class:`org.specs2.time.NoTimeConversions` into the Specification. - -* Specifications are by default executed concurrently, which requires some care - when writing the tests or alternatively the ``sequential`` keyword. - -You can use the following two examples as guidelines: - -.. includecode:: code/docs/testkit/Specs2DemoSpec.scala - -.. includecode:: code/docs/testkit/Specs2DemoAcceptance.scala - - diff --git a/akka-docs/scala/code/docs/testkit/TestkitDocSpec.scala b/akka-docs/scala/code/docs/testkit/TestkitDocSpec.scala index fca85cf85f..1e42b2e8ac 100644 --- a/akka-docs/scala/code/docs/testkit/TestkitDocSpec.scala +++ b/akka-docs/scala/code/docs/testkit/TestkitDocSpec.scala @@ -275,4 +275,15 @@ class TestkitDocSpec extends AkkaSpec with DefaultTimeout with ImplicitSender { //#test-kit-base } + "demonstrate within() nesting" in { + intercept[AssertionError] { + //#test-within-probe + val probe = TestProbe() + within(1 second) { + probe.expectMsg("hello") + } + //#test-within-probe + } + } + } diff --git a/akka-docs/scala/dispatchers.rst b/akka-docs/scala/dispatchers.rst index cea9ee6e0a..f4ff26b573 100644 --- a/akka-docs/scala/dispatchers.rst +++ b/akka-docs/scala/dispatchers.rst @@ -93,7 +93,7 @@ There are 4 different types of message dispatchers: * CallingThreadDispatcher - This dispatcher runs invocations on the current thread only. This dispatcher does not create any new threads, - but it can be used from different threads concurrently for the same actor. See :ref:`TestCallingThreadDispatcherRef` + but it can be used from different threads concurrently for the same actor. See :ref:`Scala-CallingThreadDispatcher` for details and restrictions. 
- Sharability: Unlimited diff --git a/akka-docs/scala/fsm.rst b/akka-docs/scala/fsm.rst index e47fdaa055..b8fac5a6e3 100644 --- a/akka-docs/scala/fsm.rst +++ b/akka-docs/scala/fsm.rst @@ -424,7 +424,7 @@ This FSM will log at DEBUG level: * all state transitions Life cycle changes and special messages can be logged as described for -:ref:`Actors `. +:ref:`Actors `. Rolling Event Log ----------------- diff --git a/akka-docs/scala/testing.rst b/akka-docs/scala/testing.rst index 7a6415492d..ba05207975 100644 --- a/akka-docs/scala/testing.rst +++ b/akka-docs/scala/testing.rst @@ -317,8 +317,8 @@ with message flows: This feature is useful e.g. when testing a logging system, where you want to ignore regular messages and are only interested in your specific ones. -Expecting Exceptions --------------------- +Expecting Log Messages +---------------------- Since an integration test does not allow to the internal processing of the participating actors, verifying expected exceptions cannot be done directly. @@ -329,6 +329,20 @@ exceptions: .. includecode:: code/docs/testkit/TestkitDocSpec.scala#event-filter +If a number of occurrences is specific—as demonstrated above—then ``intercept`` +will block until that number of matching messages have been received or the +timeout configured in ``akka.test.filter-leeway`` is used up (time starts +counting after the passed-in block of code returns). In case of a timeout the +test fails. + +.. note:: + + Be sure to exchange the default event handler with the + :class:`TestEventListener` in your ``application.conf`` to enable this + function:: + + akka.event-handlers = [akka.testkit.TestEventListener] + .. _TestKit.within: Timing Assertions @@ -351,7 +365,7 @@ The block given to :meth:`within` must complete after a :ref:`Duration` which is between :obj:`min` and :obj:`max`, where the former defaults to zero. The deadline calculated by adding the :obj:`max` parameter to the block's start time is implicitly available within the block to all examination methods, if -you do not specify it, is is inherited from the innermost enclosing +you do not specify it, it is inherited from the innermost enclosing :meth:`within` block. It should be noted that if the last message-receiving assertion of the block is @@ -461,8 +475,9 @@ B``, as long as a certain protocol is obeyed. .. includecode:: ../../akka-testkit/src/test/scala/akka/testkit/TestProbeSpec.scala#autopilot -The :meth:`run` method must return the auto-pilot for the next message, wrapped -in an :class:`Option`; setting it to :obj:`None` terminates the auto-pilot. +The :meth:`run` method must return the auto-pilot for the next message, which +may be :class:`KeepRunning` to retain the current one or :class:`NoAutoPilot` +to switch it off. Caution about Timing Assertions ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -471,23 +486,13 @@ The behavior of :meth:`within` blocks when using test probes might be perceived as counter-intuitive: you need to remember that the nicely scoped deadline as described :ref:`above ` is local to each probe. Hence, probes do not react to each other's deadlines or to the deadline set in an enclosing -:class:`TestKit` instance:: +:class:`TestKit` instance: - class SomeTest extends TestKit(_system: ActorSystem) with ImplicitSender { +.. includecode:: code/docs/testkit/TestkitDocSpec.scala#test-within-probe - val probe = TestProbe() +Here, the ``expectMsg`` call will use the default timeout. - within(100 millis) { - probe.expectMsg("hallo") // Will hang forever! 
- } - } - -This test will hang indefinitely, because the :meth:`expectMsg` call does not -see any deadline. Currently, the only option is to use ``probe.within`` in the -above code to make it work; later versions may include lexically scoped -deadlines using implicit arguments. - -.. _TestCallingThreadDispatcherRef: +.. _Scala-CallingThreadDispatcher: CallingThreadDispatcher ======================= @@ -586,7 +591,7 @@ has to offer: exception stack traces - Exclusion of certain classes of dead-lock scenarios -.. _actor.logging: +.. _actor.logging-scala: Tracing Actor Invocations ========================= diff --git a/akka-testkit/src/main/java/akka/testkit/JavaTestKit.java b/akka-testkit/src/main/java/akka/testkit/JavaTestKit.java new file mode 100644 index 0000000000..08846a4ad4 --- /dev/null +++ b/akka-testkit/src/main/java/akka/testkit/JavaTestKit.java @@ -0,0 +1,329 @@ +/** + * Copyright (C) 2009-2012 Typesafe Inc. + */ +package akka.testkit; + +import scala.runtime.AbstractFunction0; +import akka.actor.ActorRef; +import akka.actor.ActorSystem; +import akka.event.Logging; +import akka.event.Logging.LogEvent; +import akka.japi.PurePartialFunction; +import akka.japi.CachingPartialFunction; +import akka.japi.Util; +import akka.util.Duration; + +/** + * Java API for the TestProbe. Proper JavaDocs to come once JavaDoccing is implemented. + */ +public class JavaTestKit { + private final TestProbe p; + + public JavaTestKit(ActorSystem system) { + p = new TestProbe(system); + } + + public ActorRef getRef() { + return p.ref(); + } + + public ActorSystem getSystem() { + return p.system(); + } + + static public Duration duration(String s) { + return Duration.parse(s); + } + + public Duration dilated(Duration d) { + return d.mul(TestKitExtension.get(p.system()).TestTimeFactor()); + } + + public boolean msgAvailable() { + return p.msgAvailable(); + } + + public ActorRef getLastSender() { + return p.lastMessage().sender(); + } + + public void send(ActorRef actor, Object msg) { + actor.tell(msg, p.ref()); + } + + public void forward(ActorRef actor) { + actor.tell(p.lastMessage().msg(), p.lastMessage().sender()); + } + + public void reply(Object msg) { + p.lastMessage().sender().tell(msg, p.ref()); + } + + public Duration getRemainingTime() { + return p.remaining(); + } + + public Duration getRemainingTimeOr(Duration def) { + return p.remainingOr(def); + } + + public ActorRef watch(ActorRef ref) { + return p.watch(ref); + } + + public ActorRef unwatch(ActorRef ref) { + return p.unwatch(ref); + } + + public abstract class IgnoreMsg { + abstract protected boolean ignore(Object msg); + + public IgnoreMsg() { + p.ignoreMsg(new PurePartialFunction() { + public Boolean apply(Object in, boolean isCheck) { + return ignore(in); + } + }); + } + } + + public void ignoreNoMsg() { + p.ignoreNoMsg(); + } + + public void setAutoPilot(TestActor.AutoPilot pilot) { + p.setAutoPilot(pilot); + } + + public abstract class Within { + protected abstract void run(); + + public Within(Duration max) { + p.within(max, new AbstractFunction0() { + public Object apply() { + run(); + return null; + } + }); + } + + public Within(Duration min, Duration max) { + p.within(min, max, new AbstractFunction0() { + public Object apply() { + run(); + return null; + } + }); + } + } + + public abstract class AwaitCond { + protected abstract boolean cond(); + + public AwaitCond() { + this(Duration.Undefined(), p.awaitCond$default$3()); + } + + public AwaitCond(Duration max) { + this(max, p.awaitCond$default$3()); + } + + public AwaitCond(Duration 
max, Duration interval) { + p.awaitCond(new AbstractFunction0() { + public Object apply() { + return cond(); + } + }, max, interval); + } + } + + public abstract class ExpectMsg { + private final T result; + + public ExpectMsg(String hint) { + this(Duration.Undefined(), hint); + } + + public ExpectMsg(Duration max, String hint) { + final Object received = p.receiveOne(max); + try { + result = match(received); + } catch (PurePartialFunction.NoMatchException ex) { + throw new AssertionError("while expecting '" + hint + + "' received unexpected: " + received); + } + } + + abstract protected T match(Object msg); + + protected RuntimeException noMatch() { + throw PurePartialFunction.noMatch(); + } + + public T get() { + return result; + } + } + + public T expectMsgEquals(T msg) { + return p.expectMsg(msg); + } + + public T expectMsgEquals(Duration max, T msg) { + return p.expectMsg(max, msg); + } + + public T expectMsgClass(Class clazz) { + return p.expectMsgClass(clazz); + } + + public T expectMsgClass(Duration max, Class clazz) { + return p.expectMsgClass(max, clazz); + } + + public Object expectMsgAnyOf(Object... msgs) { + return p.expectMsgAnyOf(Util.arrayToSeq(msgs)); + } + + public Object expectMsgAnyOf(Duration max, Object... msgs) { + return p.expectMsgAnyOf(max, Util.arrayToSeq(msgs)); + } + + public Object[] expectMsgAllOf(Object... msgs) { + return (Object[]) p.expectMsgAllOf(Util.arrayToSeq(msgs)).toArray( + Util.manifest(Object.class)); + } + + public Object[] expectMsgAllOf(Duration max, Object... msgs) { + return (Object[]) p.expectMsgAllOf(max, Util.arrayToSeq(msgs)).toArray( + Util.manifest(Object.class)); + } + + @SuppressWarnings("unchecked") + public T expectMsgAnyClassOf(Class... classes) { + final Object result = p.expectMsgAnyClassOf(Util.arrayToSeq(classes)); + return (T) result; + } + + public Object expectMsgAnyClassOf(Duration max, Class... 
classes) { + return p.expectMsgAnyClassOf(max, Util.arrayToSeq(classes)); + } + + public void expectNoMsg() { + p.expectNoMsg(); + } + + public void expectNoMsg(Duration max) { + p.expectNoMsg(max); + } + + public abstract class ReceiveWhile { + abstract protected T match(Object msg); + + private Object results; + + public ReceiveWhile(Class clazz) { + this(clazz, Duration.Undefined()); + } + + public ReceiveWhile(Class clazz, Duration max) { + this(clazz, max, Duration.Inf(), Integer.MAX_VALUE); + } + + public ReceiveWhile(Class clazz, Duration max, int messages) { + this(clazz, max, Duration.Inf(), messages); + } + + @SuppressWarnings("unchecked") + public ReceiveWhile(Class clazz, Duration max, Duration idle, int messages) { + results = p.receiveWhile(max, idle, messages, + new CachingPartialFunction() { + public T match(Object msg) { + return ReceiveWhile.this.match(msg); + } + }).toArray(Util.manifest(clazz)); + } + + protected RuntimeException noMatch() { + throw PurePartialFunction.noMatch(); + } + + @SuppressWarnings("unchecked") + public T[] get() { + return (T[]) results; + } + } + + public abstract class EventFilter { + abstract protected T run(); + + private final Class clazz; + + private String source = null; + private String message = null; + private boolean pattern = false; + private boolean complete = false; + private int occurrences = Integer.MAX_VALUE; + private Class exceptionType = null; + + @SuppressWarnings("unchecked") + public EventFilter(Class clazz) { + if (Throwable.class.isAssignableFrom(clazz)) { + this.clazz = Logging.Error.class; + exceptionType = (Class) clazz; + } else if (Logging.LogEvent.class.isAssignableFrom(clazz)) { + this.clazz = (Class) clazz; + } else throw new IllegalArgumentException("supplied class must either be LogEvent or Throwable"); + } + + public T exec() { + akka.testkit.EventFilter filter; + if (clazz == Logging.Error.class) { + if (exceptionType == null) exceptionType = Logging.noCause().getClass(); + filter = new ErrorFilter(exceptionType, source, message, pattern, complete, occurrences); + } else if (clazz == Logging.Warning.class) { + filter = new WarningFilter(source, message, pattern, complete, occurrences); + } else if (clazz == Logging.Info.class) { + filter = new InfoFilter(source, message, pattern, complete, occurrences); + } else if (clazz == Logging.Debug.class) { + filter = new DebugFilter(source, message, pattern, complete, occurrences); + } else throw new IllegalArgumentException("unknown LogLevel " + clazz); + return filter.intercept(new AbstractFunction0() { + public T apply() { + return run(); + } + }, p.system()); + } + + public EventFilter message(String msg) { + message = msg; + pattern = false; + complete = true; + return this; + } + + public EventFilter startsWith(String msg) { + message = msg; + pattern = false; + complete = false; + return this; + } + + public EventFilter matches(String regex) { + message = regex; + pattern = true; + return this; + } + + public EventFilter from(String source) { + this.source = source; + return this; + } + + public EventFilter occurrences(int number) { + occurrences = number; + return this; + } + } + +} diff --git a/akka-testkit/src/main/scala/akka/testkit/TestActorRef.scala b/akka-testkit/src/main/scala/akka/testkit/TestActorRef.scala index b65d836f22..6c33f5f60b 100644 --- a/akka-testkit/src/main/scala/akka/testkit/TestActorRef.scala +++ b/akka-testkit/src/main/scala/akka/testkit/TestActorRef.scala @@ -136,5 +136,5 @@ object TestActorRef { /** * Java API */ - def create(props: 
Props, name: String, system: ActorSystem) = apply(props, name)(system) + def create[T <: Actor](system: ActorSystem, props: Props, name: String): TestActorRef[T] = apply(props, name)(system) } diff --git a/akka-testkit/src/main/scala/akka/testkit/TestKit.scala b/akka-testkit/src/main/scala/akka/testkit/TestKit.scala index c292cc238a..ed9d6c5415 100644 --- a/akka-testkit/src/main/scala/akka/testkit/TestKit.scala +++ b/akka-testkit/src/main/scala/akka/testkit/TestKit.scala @@ -14,12 +14,23 @@ import akka.actor.ActorSystem import akka.util.Timeout import akka.util.BoxedType import scala.annotation.varargs +import akka.japi.PurePartialFunction object TestActor { type Ignore = Option[PartialFunction[AnyRef, Boolean]] - trait AutoPilot { - def run(sender: ActorRef, msg: Any): Option[AutoPilot] + abstract class AutoPilot { + def run(sender: ActorRef, msg: Any): AutoPilot + def noAutoPilot: AutoPilot = NoAutoPilot + def keepRunning: AutoPilot = KeepRunning + } + + case object NoAutoPilot extends AutoPilot { + def run(sender: ActorRef, msg: Any): AutoPilot = this + } + + case object KeepRunning extends AutoPilot { + def run(sender: ActorRef, msg: Any): AutoPilot = sys.error("must not call") } case class SetIgnore(i: Ignore) @@ -43,15 +54,18 @@ class TestActor(queue: BlockingDeque[TestActor.Message]) extends Actor { var ignore: Ignore = None - var autopilot: Option[AutoPilot] = None + var autopilot: AutoPilot = NoAutoPilot def receive = { case SetIgnore(ign) ⇒ ignore = ign case x @ Watch(ref) ⇒ context.watch(ref); queue.offerLast(RealMessage(x, self)) case x @ UnWatch(ref) ⇒ context.unwatch(ref); queue.offerLast(RealMessage(x, self)) - case SetAutoPilot(pilot) ⇒ autopilot = Some(pilot) + case SetAutoPilot(pilot) ⇒ autopilot = pilot case x: AnyRef ⇒ - autopilot = autopilot.flatMap(_.run(sender, x)) + autopilot = autopilot.run(sender, x) match { + case KeepRunning ⇒ autopilot + case other ⇒ other + } val observe = ignore map (ignoreFunc ⇒ if (ignoreFunc isDefinedAt x) !ignoreFunc(x) else true) getOrElse true if (observe) queue.offerLast(RealMessage(x, sender)) } @@ -126,20 +140,20 @@ trait TestKitBase { * Have the testActor watch someone (i.e. `context.watch(...)`). Waits until * the Watch message is received back using expectMsg. */ - def watch(ref: ActorRef) { + def watch(ref: ActorRef): ActorRef = { val msg = TestActor.Watch(ref) testActor ! msg - expectMsg(msg) + expectMsg(msg).ref } /** * Have the testActor stop watching someone (i.e. `context.unwatch(...)`). Waits until * the Watch message is received back using expectMsg. */ - def unwatch(ref: ActorRef) { + def unwatch(ref: ActorRef): ActorRef = { val msg = TestActor.UnWatch(ref) testActor ! msg - expectMsg(msg) + expectMsg(msg).ref } /** @@ -242,22 +256,6 @@ trait TestKitBase { */ def within[T](max: Duration)(f: ⇒ T): T = within(0 seconds, max)(f) - /** - * Java API for within(): - * - * {{{ - * new Within(Duration.parse("3 seconds")) { - * public void run() { - * // your test code here - * } - * } - * }}} - */ - abstract class Within(max: Duration) { - def run(): Unit - within(max)(run()) - } - /** * Same as `expectMsg(remaining, obj)`, but correctly treating the timeFactor. 
*/ diff --git a/akka-testkit/src/test/java/akka/testkit/TestActorRefJavaCompile.java b/akka-testkit/src/test/java/akka/testkit/TestActorRefJavaCompile.java index 5c13557854..ecf2cd6e51 100644 --- a/akka-testkit/src/test/java/akka/testkit/TestActorRefJavaCompile.java +++ b/akka-testkit/src/test/java/akka/testkit/TestActorRefJavaCompile.java @@ -4,13 +4,14 @@ package akka.testkit; -import org.junit.Test; +import akka.actor.Actor; import akka.actor.Props; public class TestActorRefJavaCompile { public void shouldBeAbleToCompileWhenUsingApply() { //Just a dummy call to make sure it compiles - TestActorRef ref = TestActorRef.apply(new Props(), null); + TestActorRef ref = TestActorRef.apply(new Props(), null); + ref.toString(); } } \ No newline at end of file diff --git a/akka-testkit/src/test/scala/akka/testkit/TestProbeSpec.scala b/akka-testkit/src/test/scala/akka/testkit/TestProbeSpec.scala index 14fbee8bc1..6e764c96dc 100644 --- a/akka-testkit/src/test/scala/akka/testkit/TestProbeSpec.scala +++ b/akka-testkit/src/test/scala/akka/testkit/TestProbeSpec.scala @@ -44,10 +44,10 @@ class TestProbeSpec extends AkkaSpec with DefaultTimeout { //#autopilot val probe = TestProbe() probe.setAutoPilot(new TestActor.AutoPilot { - def run(sender: ActorRef, msg: Any): Option[TestActor.AutoPilot] = + def run(sender: ActorRef, msg: Any): TestActor.AutoPilot = msg match { - case "stop" ⇒ None - case x ⇒ testActor.tell(x, sender); Some(this) + case "stop" ⇒ TestActor.NoAutoPilot + case x ⇒ testActor.tell(x, sender); TestActor.KeepRunning } }) //#autopilot From 3c8a15e2dfa06c9f1eca8e67c9514f69fe54c5b8 Mon Sep 17 00:00:00 2001 From: Roland Kuhn Date: Mon, 2 Jul 2012 10:35:13 +0300 Subject: [PATCH 12/39] Add license information for junit-interface --- project/AkkaBuild.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/project/AkkaBuild.scala b/project/AkkaBuild.scala index 87725d1d5b..5cb311eab1 100644 --- a/project/AkkaBuild.scala +++ b/project/AkkaBuild.scala @@ -515,7 +515,7 @@ object Dependency { val scalatest = "org.scalatest" % "scalatest_2.9.1" % V.Scalatest % "test" // ApacheV2 val scalacheck = "org.scala-tools.testing" % "scalacheck_2.9.1" % "1.9" % "test" // New BSD val specs2 = "org.specs2" % "specs2_2.9.1" % "1.9" % "test" // Modified BSD / ApacheV2 - val junitIntf = "com.novocode" % "junit-interface" % "0.8" % "test" + val junitIntf = "com.novocode" % "junit-interface" % "0.8" % "test" // MIT } } From c09caebe8a497f38309deb29b475626ec9988673 Mon Sep 17 00:00:00 2001 From: Patrik Nordwall Date: Fri, 29 Jun 2012 16:24:24 +0200 Subject: [PATCH 13/39] Small refactoring of cluster actors * Separate actor for heartbeats, so they are more isolated from gossip messages * Configuration property for dispatcher to use for the cluster actors --- .../src/main/resources/reference.conf | 4 ++ .../src/main/scala/akka/cluster/Cluster.scala | 67 +++++++++++++------ .../scala/akka/cluster/ClusterSettings.scala | 5 ++ .../akka/cluster/ClusterConfigSpec.scala | 2 + 4 files changed, 59 insertions(+), 19 deletions(-) diff --git a/akka-cluster/src/main/resources/reference.conf b/akka-cluster/src/main/resources/reference.conf index b60b91ec43..613f6320d8 100644 --- a/akka-cluster/src/main/resources/reference.conf +++ b/akka-cluster/src/main/resources/reference.conf @@ -52,6 +52,10 @@ akka { # of the cluster within this deadline. join-timeout = 60s + # The id of the dispatcher to use for cluster actors. If not specified default dispatcher is used. 
+ # If specified you need to define the settings of the actual dispatcher. + use-dispatcher = "" + # Gossip to random node with newer or older state information, if any with some # this probability. Otherwise Gossip to any random live node. # Probability value is between 0.0 and 1.0. 0.0 means never, 1.0 means always. diff --git a/akka-cluster/src/main/scala/akka/cluster/Cluster.scala b/akka-cluster/src/main/scala/akka/cluster/Cluster.scala index 44c646ebe8..b1bf73eddb 100644 --- a/akka-cluster/src/main/scala/akka/cluster/Cluster.scala +++ b/akka-cluster/src/main/scala/akka/cluster/Cluster.scala @@ -372,7 +372,8 @@ case class Heartbeat(from: Address) extends ClusterMessage * INTERNAL API. * * Manages routing of the different cluster commands. - * Instantiated as a single instance for each Cluster - e.g. commands are serialized to Cluster message after message. + * Instantiated as a single instance for each Cluster - e.g. commands are serialized + * to Cluster message after message, but concurrent with other types of messages. */ private[cluster] final class ClusterCommandDaemon(cluster: Cluster) extends Actor { import ClusterUserAction._ @@ -398,7 +399,7 @@ private[cluster] final class ClusterCommandDaemon(cluster: Cluster) extends Acto if (seedRoutees.isEmpty) { cluster join cluster.selfAddress } else { - implicit val within = Timeout(cluster.clusterSettings.SeedNodeTimeout) + implicit val within = Timeout(cluster.settings.SeedNodeTimeout) val seedRouter = context.actorOf( Props.empty.withRouter(ScatterGatherFirstCompletedRouter( routees = seedRoutees, within = within.duration))) @@ -415,18 +416,35 @@ private[cluster] final class ClusterCommandDaemon(cluster: Cluster) extends Acto /** * INTERNAL API. * - * Pooled and routed with N number of configurable instances. - * Concurrent access to Cluster. + * Receives Gossip messages and delegates to Cluster. + * Instantiated as a single instance for each Cluster - e.g. gossips are serialized + * to Cluster message after message, but concurrent with other types of messages. */ -private[cluster] final class ClusterGossipDaemon(cluster: Cluster) extends Actor { - val log = Logging(context.system, this) +private[cluster] final class ClusterGossipDaemon(cluster: Cluster) extends Actor with ActorLogging { def receive = { - case Heartbeat(from) ⇒ cluster.receiveHeartbeat(from) case GossipEnvelope(from, gossip) ⇒ cluster.receiveGossip(from, gossip) } - override def unhandled(unknown: Any) = log.error("[/system/cluster/gossip] can not respond to messages - received [{}]", unknown) + override def unhandled(unknown: Any) = log.error("[{}] can not respond to messages - received [{}]", + self.path, unknown) +} + +/** + * INTERNAL API. + * + * Receives Heartbeat messages and delegates to Cluster. + * Instantiated as a single instance for each Cluster - e.g. heartbeats are serialized + * to Cluster message after message, but concurrent with other types of messages. + */ +private[cluster] final class ClusterHeartbeatDaemon(cluster: Cluster) extends Actor with ActorLogging { + + def receive = { + case Heartbeat(from) ⇒ cluster.receiveHeartbeat(from) + } + + override def unhandled(unknown: Any) = log.error("[{}] can not respond to messages - received [{}]", + self.path, unknown) } /** @@ -434,17 +452,22 @@ private[cluster] final class ClusterGossipDaemon(cluster: Cluster) extends Actor * * Supervisor managing the different Cluster daemons. 
*/ -private[cluster] final class ClusterDaemonSupervisor(cluster: Cluster) extends Actor { - val log = Logging(context.system, this) +private[cluster] final class ClusterDaemonSupervisor(cluster: Cluster) extends Actor with ActorLogging { - private val commands = context.actorOf(Props(new ClusterCommandDaemon(cluster)), "commands") - private val gossip = context.actorOf( - Props(new ClusterGossipDaemon(cluster)).withRouter( - RoundRobinRouter(cluster.clusterSettings.NrOfGossipDaemons)), "gossip") + val configuredDispatcher = cluster.settings.UseDispatcher + private val commands = context.actorOf(Props(new ClusterCommandDaemon(cluster)). + withDispatcher(configuredDispatcher), name = "commands") + private val gossip = context.actorOf(Props(new ClusterGossipDaemon(cluster)). + withDispatcher(configuredDispatcher). + withRouter(RoundRobinRouter(cluster.settings.NrOfGossipDaemons)), + name = "gossip") + private val heartbeat = context.actorOf(Props(new ClusterHeartbeatDaemon(cluster)). + withDispatcher(configuredDispatcher), name = "heartbeat") def receive = Actor.emptyBehavior - override def unhandled(unknown: Any): Unit = log.error("[/system/cluster] can not respond to messages - received [{}]", unknown) + override def unhandled(unknown: Any): Unit = log.error("[{}] can not respond to messages - received [{}]", + self.path, unknown) } /** @@ -526,8 +549,8 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) private val remote: RemoteActorRefProvider = system.provider.asInstanceOf[RemoteActorRefProvider] val remoteSettings = new RemoteSettings(system.settings.config, system.name) - val clusterSettings = new ClusterSettings(system.settings.config, system.name) - import clusterSettings._ + val settings = new ClusterSettings(system.settings.config, system.name) + import settings._ val selfAddress = remote.transport.address private val selfHeartbeat = Heartbeat(selfAddress) @@ -548,7 +571,8 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) // create supervisor for daemons under path "/system/cluster" private val clusterDaemons = { - val createChild = CreateChild(Props(new ClusterDaemonSupervisor(this)), "cluster") + val createChild = CreateChild(Props(new ClusterDaemonSupervisor(this)). + withDispatcher(UseDispatcher), name = "cluster") Await.result(system.systemGuardian ? createChild, defaultTimeout.duration) match { case a: ActorRef ⇒ a case e: Exception ⇒ throw e @@ -1138,7 +1162,7 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) val beatTo = localState.latestGossip.members.toSeq.map(_.address) ++ localState.joinInProgress.keys for (address ← beatTo; if address != selfAddress) { - val connection = clusterGossipConnectionFor(address) + val connection = clusterHeartbeatConnectionFor(address) log.debug("Cluster Node [{}] - Heartbeat to [{}]", selfAddress, connection) connection ! selfHeartbeat } @@ -1460,6 +1484,11 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) */ private def clusterGossipConnectionFor(address: Address): ActorRef = system.actorFor(RootActorPath(address) / "system" / "cluster" / "gossip") + /** + * Looks up and returns the remote cluster heartbeat connection for the specific address. + */ + private def clusterHeartbeatConnectionFor(address: Address): ActorRef = system.actorFor(RootActorPath(address) / "system" / "cluster" / "heartbeat") + /** * Gets the addresses of a all the 'deputy' nodes - excluding this node if part of the group. 
 */
diff --git a/akka-cluster/src/main/scala/akka/cluster/ClusterSettings.scala b/akka-cluster/src/main/scala/akka/cluster/ClusterSettings.scala
index 6e4cbc4e60..2a63f32e83 100644
--- a/akka-cluster/src/main/scala/akka/cluster/ClusterSettings.scala
+++ b/akka-cluster/src/main/scala/akka/cluster/ClusterSettings.scala
@@ -10,6 +10,7 @@ import akka.ConfigurationException
 import scala.collection.JavaConverters._
 import akka.actor.Address
 import akka.actor.AddressFromURIString
+import akka.dispatch.Dispatchers

 class ClusterSettings(val config: Config, val systemName: String) {
   import config._
@@ -36,6 +37,10 @@ class ClusterSettings(val config: Config, val systemName: String) {
   final val AutoJoin: Boolean = getBoolean("akka.cluster.auto-join")
   final val AutoDown: Boolean = getBoolean("akka.cluster.auto-down")
   final val JoinTimeout: Duration = Duration(getMilliseconds("akka.cluster.join-timeout"), MILLISECONDS)
+  final val UseDispatcher: String = getString("akka.cluster.use-dispatcher") match {
+    case "" ⇒ Dispatchers.DefaultDispatcherId
+    case id ⇒ id
+  }
   final val GossipDifferentViewProbability: Double = getDouble("akka.cluster.gossip-different-view-probability")
   final val SchedulerTickDuration: Duration = Duration(getMilliseconds("akka.cluster.scheduler.tick-duration"), MILLISECONDS)
   final val SchedulerTicksPerWheel: Int = getInt("akka.cluster.scheduler.ticks-per-wheel")
diff --git a/akka-cluster/src/test/scala/akka/cluster/ClusterConfigSpec.scala b/akka-cluster/src/test/scala/akka/cluster/ClusterConfigSpec.scala
index 07671c6164..d146e22982 100644
--- a/akka-cluster/src/test/scala/akka/cluster/ClusterConfigSpec.scala
+++ b/akka-cluster/src/test/scala/akka/cluster/ClusterConfigSpec.scala
@@ -7,6 +7,7 @@ package akka.cluster
 import akka.testkit.AkkaSpec
 import akka.util.duration._
 import akka.util.Duration
+import akka.dispatch.Dispatchers

 @org.junit.runner.RunWith(classOf[org.scalatest.junit.JUnitRunner])
 class ClusterConfigSpec extends AkkaSpec {
@@ -32,6 +33,7 @@ class ClusterConfigSpec extends AkkaSpec {
       NrOfGossipDaemons must be(4)
       AutoJoin must be(true)
       AutoDown must be(true)
+      UseDispatcher must be(Dispatchers.DefaultDispatcherId)
       GossipDifferentViewProbability must be(0.8 plusOrMinus 0.0001)
       SchedulerTickDuration must be(33 millis)
       SchedulerTicksPerWheel must be(512)

From e5979bc31c6f4b998a5f7c60d9b1686353ff653f Mon Sep 17 00:00:00 2001
From: Patrik Nordwall
Date: Mon, 2 Jul 2012 12:30:49 +0200
Subject: [PATCH 14/39] Gossip merge in large cluster, #2290

* Trying to resolve conflicts simultaneously at several nodes creates new
  conflicts. Therefore the leader resolves conflicts to limit divergence.
  To avoid overload there is also a configurable rate limit of how many
  conflicts are handled per second.
* Netty blocks when sending to broken connections. The ClusterHeartbeatSender
  actor isolates sending to different nodes by using child workers for each
  target address and thereby reduces the risk of irregular heartbeats to
  healthy nodes due to broken connections to other nodes.
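A rough sketch of that worker-per-address scheme, using the same
``withSyncCircuitBreaker`` call that appears in the patch below. The actor and
message names here (``HeartbeatFanout``, ``HeartbeatWorker``, ``Beat``) are
illustrative placeholders, and the ``akka.pattern`` import path for
``CircuitBreaker`` is an assumption, since the patch's import section is not
shown in this excerpt::

    import akka.actor.{ Actor, ActorLogging, ActorRef, Props }
    import akka.pattern.{ CircuitBreaker, CircuitBreakerOpenException }
    import akka.util.duration._

    // placeholder command standing in for the real SendHeartbeat message
    case class Beat(to: ActorRef)

    class HeartbeatFanout extends Actor with ActorLogging {
      // a valid child name per target; the patch derives it from an
      // MD5 hash of the address instead
      def childNameFor(to: ActorRef): String =
        to.path.address.toString.map { c ⇒ if (c.isLetterOrDigit) c else '-' }

      def receive = {
        // one child per target address, so a single slow or broken
        // connection cannot delay heartbeats to the healthy nodes
        case beat @ Beat(to) ⇒
          val name = childNameFor(to)
          val worker = context.actorFor(name) match {
            case ref if ref.isTerminated ⇒ context.actorOf(Props(new HeartbeatWorker(to)), name)
            case ref                     ⇒ ref
          }
          worker ! beat
      }
    }

    class HeartbeatWorker(target: ActorRef) extends Actor with ActorLogging {
      // opens after repeated slow or failing sends, so we stop paying the
      // blocking cost for a connection that is known to be broken
      val breaker = CircuitBreaker(context.system.scheduler, 3, 2 seconds, 30 seconds)

      def receive = {
        case Beat(_) ⇒
          try breaker.withSyncCircuitBreaker { target ! "heartbeat" }
          catch { case e: CircuitBreakerOpenException ⇒ /* skip broken connection */ }
      }
    }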
--- .../src/main/resources/reference.conf | 12 + .../src/main/scala/akka/cluster/Cluster.scala | 328 ++++++++++++++---- .../scala/akka/cluster/ClusterSettings.scala | 7 + .../scala/akka/cluster/LargeClusterSpec.scala | 25 +- .../akka/cluster/ClusterConfigSpec.scala | 5 + 5 files changed, 304 insertions(+), 73 deletions(-) diff --git a/akka-cluster/src/main/resources/reference.conf b/akka-cluster/src/main/resources/reference.conf index 613f6320d8..14536ae8b4 100644 --- a/akka-cluster/src/main/resources/reference.conf +++ b/akka-cluster/src/main/resources/reference.conf @@ -61,6 +61,10 @@ akka { # Probability value is between 0.0 and 1.0. 0.0 means never, 1.0 means always. gossip-different-view-probability = 0.8 + # Limit number of merge conflicts per second that are handled. If the limit is + # exceeded the conflicting gossip messages are dropped and will reappear later. + max-gossip-merge-rate = 5.0 + failure-detector { # defines the failure detector threshold @@ -97,5 +101,13 @@ akka { tick-duration = 33ms ticks-per-wheel = 512 } + + # Netty blocks when sending to broken connections, and this circuit breaker + # is used to reduce connect attempts to broken connections. + send-circuit-breaker { + max-failures = 3 + call-timeout = 2 s + reset-timeout = 30 s + } } } diff --git a/akka-cluster/src/main/scala/akka/cluster/Cluster.scala b/akka-cluster/src/main/scala/akka/cluster/Cluster.scala index b1bf73eddb..0b856edd17 100644 --- a/akka-cluster/src/main/scala/akka/cluster/Cluster.scala +++ b/akka-cluster/src/main/scala/akka/cluster/Cluster.scala @@ -29,6 +29,7 @@ import scala.annotation.tailrec import scala.collection.immutable.{ Map, SortedSet } import scala.collection.GenTraversableOnce import java.util.concurrent.atomic.AtomicLong +import java.security.MessageDigest /** * Interface for membership change listener. @@ -200,7 +201,13 @@ object Member { /** * Envelope adding a sender address to the gossip. */ -case class GossipEnvelope(from: Address, gossip: Gossip) extends ClusterMessage +case class GossipEnvelope(from: Address, gossip: Gossip, conversation: Boolean = true) extends ClusterMessage + +/** + * When conflicting versions of received and local [[akka.cluster.Gossip]] is detected + * it's forwarded to the leader for conflict resolution. + */ +case class GossipMergeConflict(a: GossipEnvelope, b: GossipEnvelope) extends ClusterMessage /** * Defines the current status of a cluster member node @@ -354,6 +361,11 @@ case class Gossip( Gossip(GossipOverview(mergedSeen, mergedUnreachable), mergedMembers, mergedMeta, mergedVClock) } + def isLeader(address: Address): Boolean = + members.nonEmpty && (address == members.head.address) + + def leader: Option[Address] = members.headOption.map(_.address) + override def toString = "Gossip(" + "overview = " + overview + @@ -368,6 +380,15 @@ case class Gossip( */ case class Heartbeat(from: Address) extends ClusterMessage +/** + * INTERNAL API. + * + * Command to [akka.cluster.ClusterHeartbeatSender]], which will send [[akka.cluster.Heartbeat]] + * to the other node. + * Local only, no need to serialize. + */ +private[cluster] case class SendHeartbeat(heartbeatMsg: Heartbeat, to: Address, deadline: Deadline) + /** * INTERNAL API. 
* @@ -423,7 +444,8 @@ private[cluster] final class ClusterCommandDaemon(cluster: Cluster) extends Acto private[cluster] final class ClusterGossipDaemon(cluster: Cluster) extends Actor with ActorLogging { def receive = { - case GossipEnvelope(from, gossip) ⇒ cluster.receiveGossip(from, gossip) + case msg: GossipEnvelope ⇒ cluster.receiveGossip(msg) + case msg: GossipMergeConflict ⇒ cluster.receiveGossipMerge(msg) } override def unhandled(unknown: Any) = log.error("[{}] can not respond to messages - received [{}]", @@ -447,6 +469,85 @@ private[cluster] final class ClusterHeartbeatDaemon(cluster: Cluster) extends Ac self.path, unknown) } +/* + * This actor is responsible for sending the heartbeat messages to + * other nodes. Netty blocks when sending to broken connections. This actor + * isolates sending to different nodes by using child workers for each target + * address and thereby reduce the risk of irregular heartbeats to healty + * nodes due to broken connections to other nodes. + */ +private[cluster] final class ClusterHeartbeatSender(cluster: Cluster) extends Actor with ActorLogging { + + /** + * Looks up and returns the remote cluster heartbeat connection for the specific address. + */ + def clusterHeartbeatConnectionFor(address: Address): ActorRef = + context.system.actorFor(RootActorPath(address) / "system" / "cluster" / "heartbeat") + + val digester = MessageDigest.getInstance("MD5") + + /** + * Child name is MD5 hash of the address + */ + def hash(name: String): String = { + digester update name.getBytes("UTF-8") + digester.digest.map { h ⇒ "%02x".format(0xFF & h) }.mkString + } + + def receive = { + case msg @ SendHeartbeat(from, to, deadline) ⇒ + val workerName = hash(to.toString) + val worker = context.actorFor(workerName) match { + case notFound if notFound.isTerminated ⇒ + context.actorOf(Props(new ClusterHeartbeatSenderWorker( + cluster.settings.SendCircuitBreakerSettings, clusterHeartbeatConnectionFor(to))), workerName) + case child ⇒ child + } + worker ! msg + } + +} + +/** + * Responsible for sending [[akka.cluster.Heartbeat]] to one specific address. + * + * Netty blocks when sending to broken connections, and this actor uses + * a configurable circuit breaker to reduce connect attempts to broken + * connections. + * + * @see ClusterHeartbeatSender + */ +private[cluster] final class ClusterHeartbeatSenderWorker( + cbSettings: CircuitBreakerSettings, toRef: ActorRef) + extends Actor with ActorLogging { + + val breaker = CircuitBreaker(context.system.scheduler, + cbSettings.maxFailures, cbSettings.callTimeout, cbSettings.resetTimeout). + onHalfOpen(log.debug("CircuitBreaker Half-Open for: [{}]", toRef)). + onOpen(log.debug("CircuitBreaker Open for [{}]", toRef)). + onClose(log.debug("CircuitBreaker Closed for [{}]", toRef)) + + context.setReceiveTimeout(30 seconds) + + def receive = { + case SendHeartbeat(heartbeatMsg, _, deadline) ⇒ + if (!deadline.isOverdue) { + // the CircuitBreaker will measure elapsed time and open if too many long calls + try breaker.withSyncCircuitBreaker { + log.debug("Cluster Node [{}] - Heartbeat to [{}]", heartbeatMsg.from, toRef) + toRef ! 
heartbeatMsg + if (deadline.isOverdue) log.debug("Sending heartbeat to [{}] took longer than expected", toRef) + } catch { case e: CircuitBreakerOpenException ⇒ /* skip sending heartbeat to broken connection */ } + + // make sure it will cleanup when not used any more + context.setReceiveTimeout(30 seconds) + } + + case ReceiveTimeout ⇒ context.stop(self) // cleanup when not used + + } +} + /** * INTERNAL API. * @@ -455,14 +556,16 @@ private[cluster] final class ClusterHeartbeatDaemon(cluster: Cluster) extends Ac private[cluster] final class ClusterDaemonSupervisor(cluster: Cluster) extends Actor with ActorLogging { val configuredDispatcher = cluster.settings.UseDispatcher - private val commands = context.actorOf(Props(new ClusterCommandDaemon(cluster)). + val commands = context.actorOf(Props(new ClusterCommandDaemon(cluster)). withDispatcher(configuredDispatcher), name = "commands") - private val gossip = context.actorOf(Props(new ClusterGossipDaemon(cluster)). + val gossip = context.actorOf(Props(new ClusterGossipDaemon(cluster)). withDispatcher(configuredDispatcher). withRouter(RoundRobinRouter(cluster.settings.NrOfGossipDaemons)), name = "gossip") - private val heartbeat = context.actorOf(Props(new ClusterHeartbeatDaemon(cluster)). + val heartbeat = context.actorOf(Props(new ClusterHeartbeatDaemon(cluster)). withDispatcher(configuredDispatcher), name = "heartbeat") + val heartbeatSender = context.actorOf(Props(new ClusterHeartbeatSender(cluster)). + withDispatcher(configuredDispatcher), name = "heartbeatSender") def receive = Actor.emptyBehavior @@ -699,15 +802,15 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) /** * Is this node the leader? */ - def isLeader: Boolean = { - val members = latestGossip.members - members.nonEmpty && (selfAddress == members.head.address) - } + def isLeader: Boolean = latestGossip.isLeader(selfAddress) /** * Get the address of the current leader. */ - def leader: Address = latestGossip.members.head.address + def leader: Address = latestGossip.leader match { + case Some(x) ⇒ x + case None ⇒ throw new IllegalStateException("There is no leader in this cluster") + } /** * Is this node a singleton cluster? @@ -862,7 +965,7 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) if (!state.compareAndSet(localState, newState)) joining(node) // recur if we failed update else { - log.info("Cluster Node [{}] - Node [{}] is JOINING", selfAddress, node) + log.debug("Cluster Node [{}] - Node [{}] is JOINING", selfAddress, node) // treat join as initial heartbeat, so that it becomes unavailable if nothing more happens if (node != selfAddress) { failureDetector heartbeat node @@ -986,74 +1089,164 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) // Can be removed when gossip has been optimized private val _receivedGossipCount = new AtomicLong + /** * INTERNAL API. */ private[cluster] def receivedGossipCount: Long = _receivedGossipCount.get + /** + * INTERNAL API. + */ + private[cluster] def mergeCount: Long = _mergeCount.get + + // Can be removed when gossip has been optimized + private val _mergeCount = new AtomicLong + + /** + * INTERNAL API. 
+ */ + private[cluster] def mergeDetectedCount: Long = _mergeDetectedCount.get + + // Can be removed when gossip has been optimized + private val _mergeDetectedCount = new AtomicLong + + private val _mergeConflictCount = new AtomicLong + private def mergeRate(count: Long): Double = (count * 1000.0) / GossipInterval.toMillis + + /** + * INTERNAL API. + * + * When conflicting versions of received and local [[akka.cluster.Gossip]] is detected + * it's forwarded to the leader for conflict resolution. Trying to simultaneously + * resolving conflicts at several nodes creates new conflicts. Therefore the leader resolves + * conflicts to limit divergence. To avoid overload there is also a configurable rate + * limit of how many conflicts that are handled by second. If the limit is + * exceeded the conflicting gossip messages are dropped and will reappear later. + */ + private[cluster] def receiveGossipMerge(merge: GossipMergeConflict): Unit = { + val count = _mergeConflictCount.incrementAndGet + val rate = mergeRate(count) + if (rate <= MaxGossipMergeRate) { + receiveGossip(merge.a.copy(conversation = false)) + receiveGossip(merge.b.copy(conversation = false)) + + // use one-way gossip from leader to reduce load of leader + def sendBack(to: Address): Unit = { + if (to != selfAddress && !latestGossip.overview.unreachable.exists(_.address == to)) + oneWayGossipTo(to) + } + + sendBack(merge.a.from) + sendBack(merge.b.from) + + } else { + log.debug("Dropping gossip merge conflict due to rate [{}] / s ", rate) + } + } + /** * INTERNAL API. * * Receive new gossip. */ @tailrec - final private[cluster] def receiveGossip(from: Address, remoteGossip: Gossip): Unit = { + final private[cluster] def receiveGossip(envelope: GossipEnvelope): Unit = { + val from = envelope.from + val remoteGossip = envelope.gossip val localState = state.get val localGossip = localState.latestGossip - if (!localGossip.overview.isNonDownUnreachable(from)) { + if (remoteGossip.overview.unreachable.exists(_.address == selfAddress)) { + // FIXME how should we handle this situation? + log.debug("Received gossip with self as unreachable, from [{}]", from) - val winningGossip = - if (remoteGossip.version <> localGossip.version) { - // concurrent - val mergedGossip = remoteGossip merge localGossip - val versionedMergedGossip = mergedGossip :+ vclockNode + } else if (!localGossip.overview.isNonDownUnreachable(from)) { - versionedMergedGossip + // leader handles merge conflicts, or when they have different views of how is leader + val handleMerge = localGossip.leader == Some(selfAddress) || localGossip.leader != remoteGossip.leader + val conflict = remoteGossip.version <> localGossip.version - } else if (remoteGossip.version < localGossip.version) { - // local gossip is newer - localGossip + if (conflict && !handleMerge) { + // delegate merge resolution to leader to reduce number of simultaneous resolves, + // which will result in new conflicts + log.debug("Merge conflict [{}] detected [{}] <> [{}]", _mergeDetectedCount.incrementAndGet, selfAddress, from) + + val count = _mergeConflictCount.incrementAndGet + val rate = mergeRate(count) + if (rate <= MaxGossipMergeRate) { + val leaderConnection = clusterGossipConnectionFor(localGossip.leader.get) + leaderConnection ! 
GossipMergeConflict(GossipEnvelope(selfAddress, localGossip), envelope) } else { - // remote gossip is newer - remoteGossip + log.debug("Skipping gossip merge conflict due to rate [{}] / s ", rate) } - val newJoinInProgress = - if (localState.joinInProgress.isEmpty) localState.joinInProgress - else localState.joinInProgress -- - winningGossip.members.map(_.address) -- - winningGossip.overview.unreachable.map(_.address) + } else { - val newState = localState copy ( - latestGossip = winningGossip seen selfAddress, - joinInProgress = newJoinInProgress) + val winningGossip = - // for all new joining nodes we optimistically remove them from the failure detector, since if we wait until - // we have won the CAS, then the node might be picked up by the reapUnreachableMembers task and moved to - // unreachable before we can remove the node from the failure detector - (newState.latestGossip.members -- localState.latestGossip.members).filter(_.status == Joining).foreach { - case node ⇒ failureDetector.remove(node.address) - } + if (conflict) { + // conflicting versions, merge, and new version + val mergedGossip = remoteGossip merge localGossip + mergedGossip :+ vclockNode - // if we won the race then update else try again - if (!state.compareAndSet(localState, newState)) receiveGossip(from, remoteGossip) // recur if we fail the update - else { - log.debug("Cluster Node [{}] - Receiving gossip from [{}]", selfAddress, from) + } else if (remoteGossip.version < localGossip.version) { + // local gossip is newer + localGossip - if ((winningGossip ne localGossip) && (winningGossip ne remoteGossip)) - log.debug( - """Couldn't establish a causal relationship between "remote" gossip and "local" gossip - Remote[{}] - Local[{}] - merged them into [{}]""", - remoteGossip, localGossip, winningGossip) + } else if (!remoteGossip.members.exists(_.address == selfAddress)) { + // FIXME This is a very strange. It can happen when many nodes join at the same time. + // It's not detected as an ordinary version conflict <> + // If we don't handle this situation there will be IllegalArgumentException when marking this as seen + // merge, and new version + val mergedGossip = remoteGossip merge (localGossip :+ Member(selfAddress, Joining)) + mergedGossip :+ vclockNode - _receivedGossipCount.incrementAndGet() - notifyMembershipChangeListeners(localState, newState) + } else { + // remote gossip is newer + remoteGossip - if ((winningGossip ne remoteGossip) || (newState.latestGossip ne remoteGossip)) { - // send back gossip to sender when sender had different view, i.e. 
merge, or sender had - // older or sender had newer - gossipTo(from) + } + + val newJoinInProgress = + if (localState.joinInProgress.isEmpty) localState.joinInProgress + else localState.joinInProgress -- + winningGossip.members.map(_.address) -- + winningGossip.overview.unreachable.map(_.address) + + val newState = localState copy ( + latestGossip = winningGossip seen selfAddress, + joinInProgress = newJoinInProgress) + + // for all new joining nodes we optimistically remove them from the failure detector, since if we wait until + // we have won the CAS, then the node might be picked up by the reapUnreachableMembers task and moved to + // unreachable before we can remove the node from the failure detector + (newState.latestGossip.members -- localState.latestGossip.members).filter(_.status == Joining).foreach { + case node ⇒ failureDetector.remove(node.address) + } + + // if we won the race then update else try again + if (!state.compareAndSet(localState, newState)) receiveGossip(envelope) // recur if we fail the update + else { + log.debug("Cluster Node [{}] - Receiving gossip from [{}]", selfAddress, from) + + if (conflict) { + _mergeCount.incrementAndGet + log.debug( + """Couldn't establish a causal relationship between "remote" gossip and "local" gossip - Remote[{}] - Local[{}] - merged them into [{}]""", + remoteGossip, localGossip, winningGossip) + } + + _receivedGossipCount.incrementAndGet() + notifyMembershipChangeListeners(localState, newState) + + if (envelope.conversation && + (conflict || (winningGossip ne remoteGossip) || (newState.latestGossip ne remoteGossip))) { + // send back gossip to sender when sender had different view, i.e. merge, or sender had + // older or sender had newer + gossipTo(from) + } } } } @@ -1074,10 +1267,19 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) * * Gossips latest gossip to an address. */ - private[cluster] def gossipTo(address: Address): Unit = { + private[cluster] def gossipTo(address: Address): Unit = + gossipTo(address, GossipEnvelope(selfAddress, latestGossip, conversation = true)) + + /** + * INTERNAL API. + */ + private[cluster] def oneWayGossipTo(address: Address): Unit = + gossipTo(address, GossipEnvelope(selfAddress, latestGossip, conversation = false)) + + private def gossipTo(address: Address, gossipMsg: GossipEnvelope): Unit = if (address != selfAddress) { val connection = clusterGossipConnectionFor(address) log.debug("Cluster Node [{}] - Gossiping to [{}]", selfAddress, connection) - connection ! GossipEnvelope(selfAddress, latestGossip) + connection ! gossipMsg } /** @@ -1112,6 +1314,7 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) */ private[cluster] def gossip(): Unit = { val localState = state.get + _mergeConflictCount.set(0) log.debug("Cluster Node [{}] - Initiating new round of gossip", selfAddress) @@ -1161,11 +1364,9 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) val beatTo = localState.latestGossip.members.toSeq.map(_.address) ++ localState.joinInProgress.keys - for (address ← beatTo; if address != selfAddress) { - val connection = clusterHeartbeatConnectionFor(address) - log.debug("Cluster Node [{}] - Heartbeat to [{}]", selfAddress, connection) - connection ! selfHeartbeat - } + val deadline = Deadline.now + HeartbeatInterval + for (address ← beatTo; if address != selfAddress) + clusterHeartbeatSender ! 
SendHeartbeat(selfHeartbeat, address, deadline) } /** @@ -1187,7 +1388,7 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) val newlyDetectedUnreachableMembers = localMembers filterNot { member ⇒ failureDetector.isAvailable(member.address) } - if (newlyDetectedUnreachableMembers.nonEmpty) { // we have newly detected members marked as unavailable + if (newlyDetectedUnreachableMembers.nonEmpty) { val newMembers = localMembers -- newlyDetectedUnreachableMembers val newUnreachableMembers = localUnreachableMembers ++ newlyDetectedUnreachableMembers @@ -1204,7 +1405,7 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) // if we won the race then update else try again if (!state.compareAndSet(localState, newState)) reapUnreachableMembers() // recur else { - log.info("Cluster Node [{}] - Marking node(s) as UNREACHABLE [{}]", selfAddress, newlyDetectedUnreachableMembers.mkString(", ")) + log.error("Cluster Node [{}] - Marking node(s) as UNREACHABLE [{}]", selfAddress, newlyDetectedUnreachableMembers.mkString(", ")) notifyMembershipChangeListeners(localState, newState) } @@ -1484,10 +1685,7 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) */ private def clusterGossipConnectionFor(address: Address): ActorRef = system.actorFor(RootActorPath(address) / "system" / "cluster" / "gossip") - /** - * Looks up and returns the remote cluster heartbeat connection for the specific address. - */ - private def clusterHeartbeatConnectionFor(address: Address): ActorRef = system.actorFor(RootActorPath(address) / "system" / "cluster" / "heartbeat") + private def clusterHeartbeatSender: ActorRef = system.actorFor(clusterDaemons.path / "heartbeatSender") /** * Gets the addresses of a all the 'deputy' nodes - excluding this node if part of the group. 
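The throttle used in ``receiveGossipMerge`` above reduces to a counter that is
reset every gossip round and converted to a per-second rate. A stripped-down
sketch of that bookkeeping (class and method names are illustrative, not part
of the patch)::

    import java.util.concurrent.atomic.AtomicLong

    // mirrors _mergeConflictCount plus the mergeRate calculation: the counter
    // is cleared once per gossip round, so the rate is conflicts per second
    // measured over the current gossip interval
    class MergeRateLimiter(gossipIntervalMillis: Long, maxMergeRate: Double) {
      private val conflictCount = new AtomicLong

      def reset(): Unit = conflictCount.set(0) // called at the start of gossip()

      /** True if this conflict may be handled now, false if it should be dropped. */
      def tryAcquire(): Boolean = {
        val count = conflictCount.incrementAndGet
        val rate = (count * 1000.0) / gossipIntervalMillis
        rate <= maxMergeRate
      }
    }

Dropped conflicts are not lost for good: since gossip keeps flowing, the same
divergence is detected again and resolved in a later round.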
diff --git a/akka-cluster/src/main/scala/akka/cluster/ClusterSettings.scala b/akka-cluster/src/main/scala/akka/cluster/ClusterSettings.scala index 2a63f32e83..e9d95de446 100644 --- a/akka-cluster/src/main/scala/akka/cluster/ClusterSettings.scala +++ b/akka-cluster/src/main/scala/akka/cluster/ClusterSettings.scala @@ -42,6 +42,13 @@ class ClusterSettings(val config: Config, val systemName: String) { case id ⇒ id } final val GossipDifferentViewProbability: Double = getDouble("akka.cluster.gossip-different-view-probability") + final val MaxGossipMergeRate: Double = getDouble("akka.cluster.max-gossip-merge-rate") final val SchedulerTickDuration: Duration = Duration(getMilliseconds("akka.cluster.scheduler.tick-duration"), MILLISECONDS) final val SchedulerTicksPerWheel: Int = getInt("akka.cluster.scheduler.ticks-per-wheel") + final val SendCircuitBreakerSettings: CircuitBreakerSettings = CircuitBreakerSettings( + maxFailures = getInt("akka.cluster.send-circuit-breaker.max-failures"), + callTimeout = Duration(getMilliseconds("akka.cluster.send-circuit-breaker.call-timeout"), MILLISECONDS), + resetTimeout = Duration(getMilliseconds("akka.cluster.send-circuit-breaker.reset-timeout"), MILLISECONDS)) } + +case class CircuitBreakerSettings(maxFailures: Int, callTimeout: Duration, resetTimeout: Duration) \ No newline at end of file diff --git a/akka-cluster/src/multi-jvm/scala/akka/cluster/LargeClusterSpec.scala b/akka-cluster/src/multi-jvm/scala/akka/cluster/LargeClusterSpec.scala index e3dc7719c1..014983426f 100644 --- a/akka-cluster/src/multi-jvm/scala/akka/cluster/LargeClusterSpec.scala +++ b/akka-cluster/src/multi-jvm/scala/akka/cluster/LargeClusterSpec.scala @@ -36,12 +36,21 @@ object LargeClusterMultiJvmSpec extends MultiNodeConfig { akka.cluster { gossip-interval = 500 ms auto-join = off - failure-detector.threshold = 4 + nr-of-gossip-daemons = 2 + failure-detector.acceptable-heartbeat-pause = 10s } akka.loglevel = INFO - akka.actor.default-dispatcher.fork-join-executor.parallelism-max = 2 + akka.actor.default-dispatcher.fork-join-executor { + # when using nodes-per-datacenter=10 we need some extra + # threads to keep up with netty connect blocking + parallelism-min = 13 + parallelism-max = 13 + } akka.scheduler.tick-duration = 33 ms - akka.remote.netty.execution-pool-size = 0 + akka.remote.netty.execution-pool-size = 4 + #akka.remote.netty.reconnection-time-window = 1s + akka.remote.netty.backoff-timeout = 500ms + akka.remote.netty.connection-timeout = 500ms # don't use testconductor transport in this test, especially not # when using use-dispatcher-for-io @@ -244,12 +253,11 @@ abstract class LargeClusterSpec } } - // FIXME sometimes this fails, FD marks nodes from other than second-datacenter as unavailable - "detect failure and auto-down crashed nodes in second-datacenter" taggedAs LongRunningTest ignore { + "detect failure and auto-down crashed nodes in second-datacenter" taggedAs LongRunningTest in { val unreachableNodes = nodesPerDatacenter val liveNodes = nodesPerDatacenter * 4 - within(20.seconds + expectedMaxDuration(liveNodes)) { + within(30.seconds + (3.seconds * liveNodes)) { val startGossipCounts = Map.empty[Cluster, Long] ++ systems.map(sys ⇒ (Cluster(sys) -> Cluster(sys).receivedGossipCount)) def gossipCount(c: Cluster): Long = c.receivedGossipCount - startGossipCounts(c) @@ -278,10 +286,11 @@ abstract class LargeClusterSpec runOn(firstDatacenter, thirdDatacenter, fourthDatacenter, fifthDatacenter) { Await.ready(latch, remaining) 
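        // convergence is defined (Some) only once every live node has seen
        // the same gossip version, which is what the next assertion polls for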
awaitCond(systems.forall(Cluster(_).convergence.isDefined)) + val mergeCount = systems.map(sys ⇒ Cluster(sys).mergeCount).sum val counts = systems.map(sys ⇒ gossipCount(Cluster(sys))) val formattedStats = "mean=%s min=%s max=%s".format(counts.sum / nodesPerDatacenter, counts.min, counts.max) - log.info("Convergence of [{}] nodes reached after failure, it took [{}], received [{}] gossip messages per node", - liveNodes, tookMillis, formattedStats) + log.info("Convergence of [{}] nodes reached after failure, it took [{}], received [{}] gossip messages per node, merged [{}] times", + liveNodes, tookMillis, formattedStats, mergeCount) } enterBarrier("after-6") diff --git a/akka-cluster/src/test/scala/akka/cluster/ClusterConfigSpec.scala b/akka-cluster/src/test/scala/akka/cluster/ClusterConfigSpec.scala index d146e22982..88f04b9d7d 100644 --- a/akka-cluster/src/test/scala/akka/cluster/ClusterConfigSpec.scala +++ b/akka-cluster/src/test/scala/akka/cluster/ClusterConfigSpec.scala @@ -35,8 +35,13 @@ class ClusterConfigSpec extends AkkaSpec { AutoDown must be(true) UseDispatcher must be(Dispatchers.DefaultDispatcherId) GossipDifferentViewProbability must be(0.8 plusOrMinus 0.0001) + MaxGossipMergeRate must be(5.0 plusOrMinus 0.0001) SchedulerTickDuration must be(33 millis) SchedulerTicksPerWheel must be(512) + SendCircuitBreakerSettings must be(CircuitBreakerSettings( + maxFailures = 3, + callTimeout = 2 seconds, + resetTimeout = 30 seconds)) } } } From 804c182cc0060ddf1a19704abda814f66e36215c Mon Sep 17 00:00:00 2001 From: Roland Date: Wed, 4 Jul 2012 10:29:44 +0200 Subject: [PATCH 15/39] incorporate review: add docs, see #1952 --- .../src/main/scala/akka/japi/JavaAPI.scala | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/akka-actor/src/main/scala/akka/japi/JavaAPI.scala b/akka-actor/src/main/scala/akka/japi/JavaAPI.scala index d3123153da..b5a53d1fe5 100644 --- a/akka-actor/src/main/scala/akka/japi/JavaAPI.scala +++ b/akka-actor/src/main/scala/akka/japi/JavaAPI.scala @@ -58,6 +58,10 @@ object PurePartialFunction { * `isCheck == true` and the latter to `isCheck == false` for those cases where * this is important to know. * + * Failure to match is signaled by throwing `noMatch()`, i.e. not returning + * normally (the exception used in this case is pre-allocated, hence not + * that expensive). + * * {{{ * new PurePartialFunction() { * public String apply(Object in, boolean isCheck) { @@ -74,7 +78,9 @@ object PurePartialFunction { * The typical use of partial functions from Akka looks like the following: * * {{{ - * if (pf.isDefinedAt(x)) pf.apply(x) + * if (pf.isDefinedAt(x)) { + * pf.apply(x); + * } * }}} * * i.e. it will first call `PurePartialFunction.apply(x, true)` and if that @@ -90,6 +96,15 @@ abstract class PurePartialFunction[A, B] extends scala.runtime.AbstractFunction1 final def apply(x: A): B = try apply(x, false) catch { case NoMatch ⇒ throw new MatchError } } +/** + * This is a specialized variant of PartialFunction which is only + * applicable if you know that `isDefinedAt(x)` is always called before + * `apply(x)`—with the same `x` of course. + * + * `match(x)` will be called for `isDefinedAt(x)` only, and its semantics + * are the same as for [[akka.japi.PurePartialFunction]] (apart from the + * missing because unneeded boolean argument). 
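+ *
+ * A hedged sketch of a Java subclass, mirroring the PurePartialFunction
+ * example above (`noMatch()` is the pre-allocated failure signal described
+ * there; the String matching is purely illustrative):
+ * {{{
+ * new CachingPartialFunction<Object, String>() {
+ *   public String match(Object in) {
+ *     if (in instanceof String) return (String) in;
+ *     else throw noMatch();
+ *   }
+ * }
+ * }}}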
+ */ abstract class CachingPartialFunction[A, B <: AnyRef] extends scala.runtime.AbstractFunction1[A, B] with PartialFunction[A, B] { import PurePartialFunction._ From 1ccb9fe7ecc8bc33d3daff06249ea348b59c8c43 Mon Sep 17 00:00:00 2001 From: Patrik Nordwall Date: Wed, 4 Jul 2012 11:58:51 +0200 Subject: [PATCH 16/39] Note about URLEncode instead of MD5, see #2290 --- akka-cluster/src/main/scala/akka/cluster/Cluster.scala | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/akka-cluster/src/main/scala/akka/cluster/Cluster.scala b/akka-cluster/src/main/scala/akka/cluster/Cluster.scala index 0b856edd17..227b3a6fa9 100644 --- a/akka-cluster/src/main/scala/akka/cluster/Cluster.scala +++ b/akka-cluster/src/main/scala/akka/cluster/Cluster.scala @@ -487,16 +487,17 @@ private[cluster] final class ClusterHeartbeatSender(cluster: Cluster) extends Ac val digester = MessageDigest.getInstance("MD5") /** - * Child name is MD5 hash of the address + * Child name is MD5 hash of the address. + * FIXME Change to URLEncode when ticket #2123 has been fixed */ - def hash(name: String): String = { + def encodeChildName(name: String): String = { digester update name.getBytes("UTF-8") digester.digest.map { h ⇒ "%02x".format(0xFF & h) }.mkString } def receive = { case msg @ SendHeartbeat(from, to, deadline) ⇒ - val workerName = hash(to.toString) + val workerName = encodeChildName(to.toString) val worker = context.actorFor(workerName) match { case notFound if notFound.isTerminated ⇒ context.actorOf(Props(new ClusterHeartbeatSenderWorker( From c708d2ad8a9e57f3b9dbf5ef4f51bb72966659e7 Mon Sep 17 00:00:00 2001 From: Patrik Nordwall Date: Wed, 4 Jul 2012 11:37:56 +0200 Subject: [PATCH 17/39] First step in refactoring of cluster internals to actors, see #2311 * Move clustering code to ClusterCore actor * More will be done, comitting this for early review --- .../src/main/resources/reference.conf | 7 +- .../src/main/scala/akka/cluster/Cluster.scala | 1556 ++++++++--------- .../scala/akka/cluster/ClusterSettings.scala | 3 +- .../scala/akka/cluster/FixedRateTask.scala | 11 +- .../scala/akka/cluster/JoinSeedNodeSpec.scala | 4 + .../scala/akka/cluster/LargeClusterSpec.scala | 16 +- .../akka/cluster/MultiNodeClusterSpec.scala | 1 + .../scala/akka/cluster/TransitionSpec.scala | 35 +- .../UnreachableNodeRejoinsClusterSpec.scala | 28 +- .../akka/cluster/ClusterConfigSpec.scala | 2 +- .../test/scala/akka/cluster/ClusterSpec.scala | 27 +- 11 files changed, 820 insertions(+), 870 deletions(-) diff --git a/akka-cluster/src/main/resources/reference.conf b/akka-cluster/src/main/resources/reference.conf index e9c26fe811..3ce4eca363 100644 --- a/akka-cluster/src/main/resources/reference.conf +++ b/akka-cluster/src/main/resources/reference.conf @@ -27,9 +27,6 @@ akka { # network partition. auto-down = off - # the number of gossip daemon actors - nr-of-gossip-daemons = 4 - # the number of deputy nodes (the nodes responsible for breaking network partitions) nr-of-deputy-nodes = 3 @@ -48,6 +45,10 @@ akka { # how often should the node move nodes, marked as unreachable by the failure detector, out of the membership ring? unreachable-nodes-reaper-interval = 1s + # How often the current state (Gossip) should be published for reading from the outside. + # A value of 0 s can be used to always publish the state, when it happens. + publish-state-interval = 1s + # A joining node stops sending heartbeats to the node to join if it hasn't become member # of the cluster within this deadline. 
join-timeout = 60s diff --git a/akka-cluster/src/main/scala/akka/cluster/Cluster.scala b/akka-cluster/src/main/scala/akka/cluster/Cluster.scala index 227b3a6fa9..54dfd585ce 100644 --- a/akka-cluster/src/main/scala/akka/cluster/Cluster.scala +++ b/akka-cluster/src/main/scala/akka/cluster/Cluster.scala @@ -57,12 +57,35 @@ sealed trait ClusterMessage extends Serializable */ object ClusterUserAction { + /** + * Command to initiate join another node (represented by 'address'). + * Join will be sent to the other node. + */ + case class JoinTo(address: Address) extends ClusterMessage + /** * Command to join the cluster. Sent when a node (represented by 'address') * wants to join another node (the receiver). */ case class Join(address: Address) extends ClusterMessage + /** + * Command to leave the cluster. + */ + case class Leave(address: Address) extends ClusterMessage + + /** + * Command to mark node as temporary down. + */ + case class Down(address: Address) extends ClusterMessage + +} + +/** + * INTERNAL API + */ +object InternalClusterAction { + /** * Start message of the process to join one of the seed nodes. * The node sends `InitJoin` to all seed nodes, which replies @@ -82,14 +105,32 @@ object ClusterUserAction { case class InitJoinAck(address: Address) extends ClusterMessage /** - * Command to leave the cluster. + * + * Command to [akka.cluster.ClusterHeartbeatSender]], which will send [[akka.cluster.Heartbeat]] + * to the other node. + * Local only, no need to serialize. */ - case class Leave(address: Address) extends ClusterMessage + case class SendHeartbeat(heartbeatMsg: Heartbeat, to: Address, deadline: Deadline) + + case object GossipTick + + case object HeartbeatTick + + case object ReapUnreachableTick + + case object LeaderActionsTick + + case object PublishStateTick + + case class SendClusterMessage(to: Address, msg: ClusterMessage) + + case class SendGossipTo(address: Address) + + case object GetClusterCoreRef + + case class Ping(timestamp: Long = System.currentTimeMillis) extends ClusterMessage + case class Pong(ping: Ping, timestamp: Long = System.currentTimeMillis) extends ClusterMessage - /** - * Command to mark node as temporary down. - */ - case class Down(address: Address) extends ClusterMessage } /** @@ -285,7 +326,7 @@ object Gossip { */ case class Gossip( overview: GossipOverview = GossipOverview(), - members: SortedSet[Member], // sorted set of members with their status, sorted by address + members: SortedSet[Member] = Gossip.emptyMembers, // sorted set of members with their status, sorted by address meta: Map[String, Array[Byte]] = Map.empty, version: VectorClock = VectorClock()) // vector clock version extends ClusterMessage // is a serializable cluster message @@ -361,11 +402,57 @@ case class Gossip( Gossip(GossipOverview(mergedSeen, mergedUnreachable), mergedMembers, mergedMeta, mergedVClock) } + /** + * Checks if we have a cluster convergence. If there are any unreachable nodes then we can't have a convergence - + * waiting for user to act (issuing DOWN) or leader to act (issuing DOWN through auto-down). + * + * @returns Some(convergedGossip) if convergence have been reached and None if not + */ + def convergence: Boolean = { + val unreachable = overview.unreachable + val seen = overview.seen + + // First check that: + // 1. we don't have any members that are unreachable, or + // 2. 
all unreachable members in the set have status DOWN + // Else we can't continue to check for convergence + // When that is done we check that all the entries in the 'seen' table have the same vector clock version + // and that all members exists in seen table + val hasUnreachable = unreachable.nonEmpty && unreachable.exists { _.status != Down } + val allMembersInSeen = members.forall(m ⇒ seen.contains(m.address)) + + if (hasUnreachable) false + else if (!allMembersInSeen) true + else seen.values.toSet.size == 1 + } + def isLeader(address: Address): Boolean = members.nonEmpty && (address == members.head.address) def leader: Option[Address] = members.headOption.map(_.address) + def isSingletonCluster: Boolean = members.size == 1 + + /** + * Returns true if the node is UP or JOINING. + */ + def isAvailable(address: Address): Boolean = !isUnavailable(address) + + def isUnavailable(address: Address): Boolean = { + val isUnreachable = overview.unreachable exists { _.address == address } + val hasUnavailableMemberStatus = members exists { m ⇒ (m.address == address) && m.status.isUnavailable } + isUnreachable || hasUnavailableMemberStatus + } + + def member(address: Address): Member = { + members.find(_.address == address) + .getOrElse { + overview.unreachable + .find(_.address == address) + .getOrElse(Member(address, Removed)) + } + } + override def toString = "Gossip(" + "overview = " + overview + @@ -381,75 +468,25 @@ case class Gossip( case class Heartbeat(from: Address) extends ClusterMessage /** - * INTERNAL API. - * - * Command to [akka.cluster.ClusterHeartbeatSender]], which will send [[akka.cluster.Heartbeat]] - * to the other node. - * Local only, no need to serialize. + * INTERNAL API */ -private[cluster] case class SendHeartbeat(heartbeatMsg: Heartbeat, to: Address, deadline: Deadline) +private[cluster] case class ClusterStats( + receivedGossipCount: Long = 0L, + mergeConflictCount: Long = 0L, + mergeCount: Long = 0L, + mergeDetectedCount: Long = 0L) { -/** - * INTERNAL API. - * - * Manages routing of the different cluster commands. - * Instantiated as a single instance for each Cluster - e.g. commands are serialized - * to Cluster message after message, but concurrent with other types of messages. - */ -private[cluster] final class ClusterCommandDaemon(cluster: Cluster) extends Actor { - import ClusterUserAction._ - import ClusterLeaderAction._ + def incrementReceivedGossipCount(): ClusterStats = + copy(receivedGossipCount = receivedGossipCount + 1) - val log = Logging(context.system, this) + def incrementMergeConflictCount(): ClusterStats = + copy(mergeConflictCount = mergeConflictCount + 1) - def receive = { - case JoinSeedNode ⇒ joinSeedNode() - case InitJoin ⇒ sender ! 
InitJoinAck(cluster.selfAddress) - case InitJoinAck(address) ⇒ cluster.join(address) - case Join(address) ⇒ cluster.joining(address) - case Down(address) ⇒ cluster.downing(address) - case Leave(address) ⇒ cluster.leaving(address) - case Exit(address) ⇒ cluster.exiting(address) - case Remove(address) ⇒ cluster.removing(address) - case Failure(e: AskTimeoutException) ⇒ joinSeedNodeTimeout() - } + def incrementMergeCount(): ClusterStats = + copy(mergeCount = mergeCount + 1) - def joinSeedNode(): Unit = { - val seedRoutees = for (address ← cluster.seedNodes; if address != cluster.selfAddress) - yield self.path.toStringWithAddress(address) - if (seedRoutees.isEmpty) { - cluster join cluster.selfAddress - } else { - implicit val within = Timeout(cluster.settings.SeedNodeTimeout) - val seedRouter = context.actorOf( - Props.empty.withRouter(ScatterGatherFirstCompletedRouter( - routees = seedRoutees, within = within.duration))) - seedRouter ? InitJoin pipeTo self - seedRouter ! PoisonPill - } - } - - def joinSeedNodeTimeout(): Unit = cluster join cluster.selfAddress - - override def unhandled(unknown: Any) = log.error("Illegal command [{}]", unknown) -} - -/** - * INTERNAL API. - * - * Receives Gossip messages and delegates to Cluster. - * Instantiated as a single instance for each Cluster - e.g. gossips are serialized - * to Cluster message after message, but concurrent with other types of messages. - */ -private[cluster] final class ClusterGossipDaemon(cluster: Cluster) extends Actor with ActorLogging { - - def receive = { - case msg: GossipEnvelope ⇒ cluster.receiveGossip(msg) - case msg: GossipMergeConflict ⇒ cluster.receiveGossipMerge(msg) - } - - override def unhandled(unknown: Any) = log.error("[{}] can not respond to messages - received [{}]", - self.path, unknown) + def incrementMergeDetectedCount(): ClusterStats = + copy(mergeDetectedCount = mergeDetectedCount + 1) } /** @@ -462,11 +499,9 @@ private[cluster] final class ClusterGossipDaemon(cluster: Cluster) extends Actor private[cluster] final class ClusterHeartbeatDaemon(cluster: Cluster) extends Actor with ActorLogging { def receive = { - case Heartbeat(from) ⇒ cluster.receiveHeartbeat(from) + case Heartbeat(from) ⇒ cluster.failureDetector heartbeat from } - override def unhandled(unknown: Any) = log.error("[{}] can not respond to messages - received [{}]", - self.path, unknown) } /* @@ -478,6 +513,8 @@ private[cluster] final class ClusterHeartbeatDaemon(cluster: Cluster) extends Ac */ private[cluster] final class ClusterHeartbeatSender(cluster: Cluster) extends Actor with ActorLogging { + import InternalClusterAction._ + /** * Looks up and returns the remote cluster heartbeat connection for the specific address. */ @@ -522,6 +559,8 @@ private[cluster] final class ClusterHeartbeatSenderWorker( cbSettings: CircuitBreakerSettings, toRef: ActorRef) extends Actor with ActorLogging { + import InternalClusterAction._ + val breaker = CircuitBreaker(context.system.scheduler, cbSettings.maxFailures, cbSettings.callTimeout, cbSettings.resetTimeout). onHalfOpen(log.debug("CircuitBreaker Half-Open for: [{}]", toRef)). 
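The worker above wraps every heartbeat send in an akka.pattern.CircuitBreaker configured from the new send-circuit-breaker settings. A hedged usage sketch, assuming the defaults asserted in ClusterConfigSpec (3 failures, 2 s call timeout, 30 s reset timeout) and the worker's context, log, toRef and heartbeatMsg in scope:

    import akka.pattern.CircuitBreaker
    import akka.util.duration._

    val breaker =
      CircuitBreaker(context.system.scheduler, maxFailures = 3,
        callTimeout = 2.seconds, resetTimeout = 30.seconds).
        onOpen(log.debug("CircuitBreaker Open for: [{}]", toRef))

    // every send is timed; too many failures or slow calls open the breaker,
    // after which calls fail fast until the reset timeout has elapsed
    try breaker.withSyncCircuitBreaker {
      toRef ! heartbeatMsg
    } catch {
      case e: Exception ⇒ log.debug("Heartbeat to [{}] failed: [{}]", toRef, e.getMessage)
    }

Failing fast on a slow or broken connection keeps one partitioned peer from tying up heartbeating to the rest of the cluster, which is why sends are funneled through per-destination worker actors.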
@@ -532,6 +571,7 @@ private[cluster] final class ClusterHeartbeatSenderWorker( def receive = { case SendHeartbeat(heartbeatMsg, _, deadline) ⇒ + log.debug("Cluster Node [{}] - Heartbeat to [{}]", heartbeatMsg.from, toRef) if (!deadline.isOverdue) { // the CircuitBreaker will measure elapsed time and open if too many long calls try breaker.withSyncCircuitBreaker { @@ -551,394 +591,159 @@ private[cluster] final class ClusterHeartbeatSenderWorker( /** * INTERNAL API. - * - * Supervisor managing the different Cluster daemons. */ -private[cluster] final class ClusterDaemonSupervisor(cluster: Cluster) extends Actor with ActorLogging { - - val configuredDispatcher = cluster.settings.UseDispatcher - val commands = context.actorOf(Props(new ClusterCommandDaemon(cluster)). - withDispatcher(configuredDispatcher), name = "commands") - val gossip = context.actorOf(Props(new ClusterGossipDaemon(cluster)). - withDispatcher(configuredDispatcher). - withRouter(RoundRobinRouter(cluster.settings.NrOfGossipDaemons)), - name = "gossip") - val heartbeat = context.actorOf(Props(new ClusterHeartbeatDaemon(cluster)). - withDispatcher(configuredDispatcher), name = "heartbeat") - val heartbeatSender = context.actorOf(Props(new ClusterHeartbeatSender(cluster)). - withDispatcher(configuredDispatcher), name = "heartbeatSender") - - def receive = Actor.emptyBehavior - - override def unhandled(unknown: Any): Unit = log.error("[{}] can not respond to messages - received [{}]", - self.path, unknown) -} - -/** - * Cluster Extension Id and factory for creating Cluster extension. - * Example: - * {{{ - * if (Cluster(system).isLeader) { ... } - * }}} - */ -object Cluster extends ExtensionId[Cluster] with ExtensionIdProvider { - override def get(system: ActorSystem): Cluster = super.get(system) - - override def lookup = Cluster - - override def createExtension(system: ExtendedActorSystem): Cluster = { - val clusterSettings = new ClusterSettings(system.settings.config, system.name) - - val failureDetector = { - import clusterSettings.{ FailureDetectorImplementationClass ⇒ fqcn } - system.dynamicAccess.createInstanceFor[FailureDetector]( - fqcn, Seq(classOf[ActorSystem] -> system, classOf[ClusterSettings] -> clusterSettings)).fold( - e ⇒ throw new ConfigurationException("Could not create custom failure detector [" + fqcn + "] due to:" + e.toString), - identity) - } - - new Cluster(system, failureDetector) - } -} - -/** - * Interface for the cluster JMX MBean. - */ -trait ClusterNodeMBean { - def getMemberStatus: String - def getClusterStatus: String - def getLeader: String - - def isSingleton: Boolean - def isConvergence: Boolean - def isAvailable: Boolean - def isRunning: Boolean - - def join(address: String) - def leave(address: String) - def down(address: String) -} - -/** - * This module is responsible for Gossiping cluster information. The abstraction maintains the list of live - * and dead members. Periodically i.e. every 1 second this module chooses a random member and initiates a round - * of Gossip with it. - *

- * During each of these runs the member initiates gossip exchange according to the following rules:

- *   1) Gossip to a random live member (if any)
- *   2) If the member gossiped to at (1) was not a deputy, or the number of live members is less than the number of deputy nodes,
- *       gossip to a random deputy with a probability depending on the number of unreachable, deputy and live members.
- * 
- * - * Example: - * {{{ - * if (Cluster(system).isLeader) { ... } - * }}} - */ -class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) extends Extension { clusterNode ⇒ +private[cluster] final class ClusterCoreSender(selfAddress: Address) extends Actor with ActorLogging { + import InternalClusterAction._ /** - * Represents the state for this Cluster. Implemented using optimistic lockless concurrency. - * All state is represented by this immutable case class and managed by an AtomicReference. + * Looks up and returns the remote cluster command connection for the specific address. */ - private case class State( - latestGossip: Gossip, - joinInProgress: Map[Address, Deadline] = Map.empty, - memberMembershipChangeListeners: Set[MembershipChangeListener] = Set.empty) + private def clusterCoreConnectionFor(address: Address): ActorRef = + context.system.actorFor(RootActorPath(address) / "system" / "cluster" / "core") - if (!system.provider.isInstanceOf[RemoteActorRefProvider]) - throw new ConfigurationException("ActorSystem[" + system + "] needs to have a 'RemoteActorRefProvider' enabled in the configuration") - - private val remote: RemoteActorRefProvider = system.provider.asInstanceOf[RemoteActorRefProvider] - - val remoteSettings = new RemoteSettings(system.settings.config, system.name) - val settings = new ClusterSettings(system.settings.config, system.name) - import settings._ - - val selfAddress = remote.transport.address - private val selfHeartbeat = Heartbeat(selfAddress) - - private val vclockNode = VectorClock.Node(selfAddress.toString) - - implicit private val defaultTimeout = Timeout(remoteSettings.RemoteSystemDaemonAckTimeout) - - private val serialization = remote.serialization - - private val _isRunning = new AtomicBoolean(true) - private val log = Logging(system, "Node") - - private val mBeanServer = ManagementFactory.getPlatformMBeanServer - private val clusterMBeanName = new ObjectName("akka:type=Cluster") - - log.info("Cluster Node [{}] - is starting up...", selfAddress) - - // create supervisor for daemons under path "/system/cluster" - private val clusterDaemons = { - val createChild = CreateChild(Props(new ClusterDaemonSupervisor(this)). - withDispatcher(UseDispatcher), name = "cluster") - Await.result(system.systemGuardian ? createChild, defaultTimeout.duration) match { - case a: ActorRef ⇒ a - case e: Exception ⇒ throw e - } + def receive = { + case SendClusterMessage(to, msg) ⇒ + log.debug("Cluster Node [{}] - Trying to send [{}] to [{}]", selfAddress, msg.getClass.getSimpleName, to) + clusterCoreConnectionFor(to) ! msg } +} - private def createCleanState: State = { - // note that self is not initially member, - // and the Gossip is not versioned for this 'Node' yet - State(Gossip(members = Gossip.emptyMembers)) - } +/** + * INTERNAL API. 
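 * Actor that owns the cluster state. Joining, leaving, downing, gossip
 * handling and failure-detector reaping all arrive as messages in the
 * receive loop below, so every state change is serialized through the
 * mailbox instead of the earlier compare-and-set retry loops.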
+ */ +private[cluster] final class ClusterCore(cluster: Cluster) extends Actor with ActorLogging { + // FIXME break up the cluster constructor parameter into something that is easier to test without Cluster + import ClusterLeaderAction._ + import InternalClusterAction._ - private val state = new AtomicReference[State](createCleanState) + import cluster.settings._ + import cluster.selfAddress + import cluster.clusterScheduler - // try to join one of the nodes defined in the 'akka.cluster.seed-nodes' - if (AutoJoin) joinSeedNode() + val vclockNode = VectorClock.Node(selfAddress.toString) + val selfHeartbeat = Heartbeat(selfAddress) - // ======================================================== - // ===================== WORK DAEMONS ===================== - // ======================================================== + // note that self is not initially member, + // and the Gossip is not versioned for this 'Node' yet + var latestGossip: Gossip = Gossip() + var joinInProgress: Map[Address, Deadline] = Map.empty - private val clusterScheduler: Scheduler with Closeable = { - if (system.settings.SchedulerTickDuration > SchedulerTickDuration) { - log.info("Using a dedicated scheduler for cluster. Default scheduler can be used if configured " + - "with 'akka.scheduler.tick-duration' [{} ms] <= 'akka.cluster.scheduler.tick-duration' [{} ms].", - system.settings.SchedulerTickDuration.toMillis, SchedulerTickDuration.toMillis) - val threadFactory = system.threadFactory match { - case tf: MonitorableThreadFactory ⇒ tf.copy(name = tf.name + "-cluster-scheduler") - case tf ⇒ tf - } - val hwt = new HashedWheelTimer(log, - threadFactory, - SchedulerTickDuration, SchedulerTicksPerWheel) - new DefaultScheduler(hwt, log, system.dispatcher) - } else { - // delegate to system.scheduler, but don't close - val systemScheduler = system.scheduler - new Scheduler with Closeable { - // we are using system.scheduler, which we are not responsible for closing - def close(): Unit = () - def schedule(initialDelay: Duration, frequency: Duration, receiver: ActorRef, message: Any): Cancellable = - systemScheduler.schedule(initialDelay, frequency, receiver, message) - def schedule(initialDelay: Duration, frequency: Duration)(f: ⇒ Unit): Cancellable = - systemScheduler.schedule(initialDelay, frequency)(f) - def schedule(initialDelay: Duration, frequency: Duration, runnable: Runnable): Cancellable = - systemScheduler.schedule(initialDelay, frequency, runnable) - def scheduleOnce(delay: Duration, runnable: Runnable): Cancellable = - systemScheduler.scheduleOnce(delay, runnable) - def scheduleOnce(delay: Duration, receiver: ActorRef, message: Any): Cancellable = - systemScheduler.scheduleOnce(delay, receiver, message) - def scheduleOnce(delay: Duration)(f: ⇒ Unit): Cancellable = - systemScheduler.scheduleOnce(delay)(f) - } - } - } + var stats = ClusterStats() + + val heartbeatSender = context.actorOf(Props(new ClusterHeartbeatSender(cluster)). + withDispatcher(UseDispatcher), name = "heartbeatSender") + val coreSender = context.actorOf(Props(new ClusterCoreSender(selfAddress)). + withDispatcher(UseDispatcher), name = "coreSender") // start periodic gossip to random nodes in cluster - private val gossipTask = + val gossipTask = FixedRateTask(clusterScheduler, PeriodicTasksInitialDelay.max(GossipInterval), GossipInterval) { - gossip() + self ! 
GossipTick } // start periodic heartbeat to all nodes in cluster - private val heartbeatTask = + val heartbeatTask = FixedRateTask(clusterScheduler, PeriodicTasksInitialDelay.max(HeartbeatInterval), HeartbeatInterval) { - heartbeat() + self ! HeartbeatTick } // start periodic cluster failure detector reaping (moving nodes condemned by the failure detector to unreachable list) - private val failureDetectorReaperTask = + val failureDetectorReaperTask = FixedRateTask(clusterScheduler, PeriodicTasksInitialDelay.max(UnreachableNodesReaperInterval), UnreachableNodesReaperInterval) { - reapUnreachableMembers() + self ! ReapUnreachableTick } // start periodic leader action management (only applies for the current leader) private val leaderActionsTask = FixedRateTask(clusterScheduler, PeriodicTasksInitialDelay.max(LeaderActionsInterval), LeaderActionsInterval) { - leaderActions() + self ! LeaderActionsTick } - createMBean() + // start periodic publish of current state + private val publishStateTask: Option[Cancellable] = + if (PublishStateInterval == Duration.Zero) None + else Some(FixedRateTask(clusterScheduler, PeriodicTasksInitialDelay.max(PublishStateInterval), PublishStateInterval) { + self ! PublishStateTick + }) - system.registerOnTermination(shutdown()) - - log.info("Cluster Node [{}] - has started up successfully", selfAddress) - - // ====================================================== - // ===================== PUBLIC API ===================== - // ====================================================== - - def self: Member = { - val gossip = latestGossip - gossip.members - .find(_.address == selfAddress) - .getOrElse { - gossip.overview.unreachable - .find(_.address == selfAddress) - .getOrElse(throw new IllegalStateException("Can't find 'this' Member [" + selfAddress + "] in the cluster membership ring or in the unreachable set")) - } + override def preStart(): Unit = { + if (AutoJoin) self ! InternalClusterAction.JoinSeedNode } - /** - * Returns true if the cluster node is up and running, false if it is shut down. - */ - def isRunning: Boolean = _isRunning.get - - /** - * Latest gossip. - */ - def latestGossip: Gossip = state.get.latestGossip - - /** - * Member status for this node (`MemberStatus`). - * - * NOTE: If the node has been removed from the cluster (and shut down) then it's status is set to the 'REMOVED' tombstone state - * and is no longer present in the node ring or any other part of the gossiping state. However in order to maintain the - * model and the semantics the user would expect, this method will in this situation return `MemberStatus.Removed`. - */ - def status: MemberStatus = { - if (isRunning) self.status - else MemberStatus.Removed + override def postStop(): Unit = { + gossipTask.cancel() + heartbeatTask.cancel() + failureDetectorReaperTask.cancel() + leaderActionsTask.cancel() + publishStateTask foreach { _.cancel() } } - /** - * Is this node the leader? 
- */ - def isLeader: Boolean = latestGossip.isLeader(selfAddress) + def receive = { + case JoinSeedNode ⇒ joinSeedNode() + case InitJoin ⇒ initJoin() + case InitJoinAck(address) ⇒ join(address) + case Failure(e: AskTimeoutException) ⇒ joinSeedNodeTimeout() + case ClusterUserAction.JoinTo(address) ⇒ join(address) + case ClusterUserAction.Join(address) ⇒ joining(address) + case ClusterUserAction.Down(address) ⇒ downing(address) + case ClusterUserAction.Leave(address) ⇒ leaving(address) + case Exit(address) ⇒ exiting(address) + case Remove(address) ⇒ removing(address) + case msg: GossipEnvelope ⇒ receiveGossip(msg) + case msg: GossipMergeConflict ⇒ receiveGossipMerge(msg) + case GossipTick ⇒ gossip() + case HeartbeatTick ⇒ heartbeat() + case ReapUnreachableTick ⇒ reapUnreachableMembers() + case LeaderActionsTick ⇒ leaderActions() + case SendGossipTo(address) ⇒ gossipTo(address) + case PublishStateTick ⇒ publishState() + case p: Ping ⇒ ping(p) - /** - * Get the address of the current leader. - */ - def leader: Address = latestGossip.leader match { - case Some(x) ⇒ x - case None ⇒ throw new IllegalStateException("There is no leader in this cluster") } - /** - * Is this node a singleton cluster? - */ - def isSingletonCluster: Boolean = isSingletonCluster(state.get) - - /** - * Checks if we have a cluster convergence. - * - * @return Some(convergedGossip) if convergence have been reached and None if not - */ - def convergence: Option[Gossip] = convergence(latestGossip) - - /** - * Returns true if the node is UP or JOINING. - */ - def isAvailable: Boolean = !isUnavailable(state.get) - - /** - * Make it possible to override/configure seedNodes from tests without - * specifying in config. Addresses are unknown before startup time. - */ - def seedNodes: IndexedSeq[Address] = SeedNodes - - /** - * Registers a listener to subscribe to cluster membership changes. - */ - @tailrec - final def registerListener(listener: MembershipChangeListener): Unit = { - val localState = state.get - val newListeners = localState.memberMembershipChangeListeners + listener - val newState = localState copy (memberMembershipChangeListeners = newListeners) - if (!state.compareAndSet(localState, newState)) registerListener(listener) // recur + def joinSeedNode(): Unit = { + val seedRoutees = for (address ← cluster.seedNodes; if address != cluster.selfAddress) + yield self.path.toStringWithAddress(address) + if (seedRoutees.isEmpty) { + cluster join cluster.selfAddress + } else { + implicit val within = Timeout(cluster.settings.SeedNodeTimeout) + val seedRouter = context.actorOf( + Props.empty.withRouter(ScatterGatherFirstCompletedRouter( + routees = seedRoutees, within = within.duration))) + seedRouter ? InitJoin pipeTo self + seedRouter ! PoisonPill + } } - /** - * Unsubscribes to cluster membership changes. - */ - @tailrec - final def unregisterListener(listener: MembershipChangeListener): Unit = { - val localState = state.get - val newListeners = localState.memberMembershipChangeListeners - listener - val newState = localState copy (memberMembershipChangeListeners = newListeners) - if (!state.compareAndSet(localState, newState)) unregisterListener(listener) // recur - } + def initJoin(): Unit = sender ! InitJoinAck(cluster.selfAddress) + + def joinSeedNodeTimeout(): Unit = cluster join cluster.selfAddress /** * Try to join this cluster node with the node specified by 'address'. * A 'Join(thisNodeAddress)' command is sent to the node to join. 
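 * A minimal usage sketch (the target `address` is assumed to be known,
 * e.g. one of the configured seed nodes):
 * {{{
 * val cluster = Cluster(system)
 * cluster join address  // results in Join(cluster.selfAddress) being sent to that node
 * }}}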
*/ - @tailrec - final def join(address: Address): Unit = { - val localState = state.get + def join(address: Address): Unit = { + val localGossip = latestGossip // wipe our state since a node that joins a cluster must be empty - val newState = createCleanState copy (joinInProgress = Map.empty + (address -> (Deadline.now + JoinTimeout)), - memberMembershipChangeListeners = localState.memberMembershipChangeListeners) + latestGossip = Gossip() + joinInProgress = Map.empty + (address -> (Deadline.now + JoinTimeout)) + // wipe the failure detector since we are starting fresh and shouldn't care about the past - failureDetector.reset() - if (!state.compareAndSet(localState, newState)) join(address) // recur - else { - val connection = clusterCommandConnectionFor(address) - val command = ClusterUserAction.Join(selfAddress) - log.info("Cluster Node [{}] - Trying to send JOIN to [{}] through connection [{}]", selfAddress, address, connection) - connection ! command - } + cluster.failureDetector.reset() + + notifyListeners(localGossip) + + val command = ClusterUserAction.Join(selfAddress) + coreSender ! SendClusterMessage(address, command) } /** - * Send command to issue state transition to LEAVING for the node specified by 'address'. - */ - def leave(address: Address): Unit = { - clusterCommandDaemon ! ClusterUserAction.Leave(address) - } - - /** - * Send command to DOWN the node specified by 'address'. - */ - def down(address: Address): Unit = { - clusterCommandDaemon ! ClusterUserAction.Down(address) - } - - // ======================================================== - // ===================== INTERNAL API ===================== - // ======================================================== - - /** - * INTERNAL API. - * - * Shuts down all connections to other members, the cluster daemon and the periodic gossip and cleanup tasks. - * - * Should not called by the user. The user can issue a LEAVE command which will tell the node - * to go through graceful handoff process `LEAVE -> EXITING -> REMOVED -> SHUTDOWN`. - */ - private[cluster] def shutdown(): Unit = { - if (_isRunning.compareAndSet(true, false)) { - log.info("Cluster Node [{}] - Shutting down cluster Node and cluster daemons...", selfAddress) - - // cancel the periodic tasks, note that otherwise they will be run when scheduler is shutdown - gossipTask.cancel() - heartbeatTask.cancel() - failureDetectorReaperTask.cancel() - leaderActionsTask.cancel() - clusterScheduler.close() - - // FIXME isTerminated check can be removed when ticket #2221 is fixed - // now it prevents logging if system is shutdown (or in progress of shutdown) - if (!clusterDaemons.isTerminated) - system.stop(clusterDaemons) - - try { - mBeanServer.unregisterMBean(clusterMBeanName) - } catch { - case e: InstanceNotFoundException ⇒ // ignore - we are running multiple cluster nodes in the same JVM (probably for testing) - } - log.info("Cluster Node [{}] - Cluster node successfully shut down", selfAddress) - } - } - - /** - * INTERNAL API. - * * State transition to JOINING - new node joining. 
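 * The happy path amounts to the following (a sketch, eliding the rejoin
 * bookkeeping handled below):
 * {{{
 * val newMembers = localMembers + Member(node, Joining) + Member(selfAddress, Joining)
 * val newGossip  = localGossip copy (members = newMembers)
 * latestGossip   = (newGossip :+ vclockNode) seen selfAddress
 * }}}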
*/ - @tailrec - private[cluster] final def joining(node: Address): Unit = { - val localState = state.get - val localGossip = localState.latestGossip + def joining(node: Address): Unit = { + val localGossip = latestGossip val localMembers = localGossip.members val localUnreachable = localGossip.overview.unreachable @@ -952,7 +757,7 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) val newOverview = localGossip.overview copy (unreachable = newUnreachableMembers) // remove the node from the failure detector if it is a DOWN node that is rejoining cluster - if (rejoiningMember.nonEmpty) failureDetector.remove(node) + if (rejoiningMember.nonEmpty) cluster.failureDetector.remove(node) // add joining node as Joining // add self in case someone else joins before self has joined (Set discards duplicates) @@ -962,30 +767,24 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) val versionedGossip = newGossip :+ vclockNode val seenVersionedGossip = versionedGossip seen selfAddress - val newState = localState copy (latestGossip = seenVersionedGossip) + latestGossip = seenVersionedGossip - if (!state.compareAndSet(localState, newState)) joining(node) // recur if we failed update - else { - log.debug("Cluster Node [{}] - Node [{}] is JOINING", selfAddress, node) - // treat join as initial heartbeat, so that it becomes unavailable if nothing more happens - if (node != selfAddress) { - failureDetector heartbeat node - gossipTo(node) - } - notifyMembershipChangeListeners(localState, newState) + log.debug("Cluster Node [{}] - Node [{}] is JOINING", selfAddress, node) + // treat join as initial heartbeat, so that it becomes unavailable if nothing more happens + if (node != selfAddress) { + cluster.failureDetector heartbeat node + gossipTo(node) } + + notifyListeners(localGossip) } } /** - * INTERNAL API. - * * State transition to LEAVING. */ - @tailrec - private[cluster] final def leaving(address: Address) { - val localState = state.get - val localGossip = localState.latestGossip + def leaving(address: Address): Unit = { + val localGossip = latestGossip if (localGossip.members.exists(_.address == address)) { // only try to update if the node is available (in the member ring) val newMembers = localGossip.members map { member ⇒ if (member.address == address) Member(address, Leaving) else member } // mark node as LEAVING val newGossip = localGossip copy (members = newMembers) @@ -993,29 +792,23 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) val versionedGossip = newGossip :+ vclockNode val seenVersionedGossip = versionedGossip seen selfAddress - val newState = localState copy (latestGossip = seenVersionedGossip) + latestGossip = seenVersionedGossip - if (!state.compareAndSet(localState, newState)) leaving(address) // recur if we failed update - else { - log.info("Cluster Node [{}] - Marked address [{}] as LEAVING", selfAddress, address) - notifyMembershipChangeListeners(localState, newState) - } + log.info("Cluster Node [{}] - Marked address [{}] as LEAVING", selfAddress, address) + publishState() + notifyListeners(localGossip) } } /** - * INTERNAL API. - * * State transition to EXITING. */ - private[cluster] final def exiting(address: Address): Unit = { + def exiting(address: Address): Unit = { log.info("Cluster Node [{}] - Marked node [{}] as EXITING", selfAddress, address) // FIXME implement when we implement hand-off } /** - * INTERNAL API. - * * State transition to REMOVED. 
* * This method is for now only called after the LEADER have sent a Removed message - telling the node @@ -1024,24 +817,21 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) * In the future we might change this to allow the USER to send a Removed(address) message telling an * arbitrary node to be moved direcly from UP -> REMOVED. */ - private[cluster] final def removing(address: Address): Unit = { + def removing(address: Address): Unit = { log.info("Cluster Node [{}] - Node has been REMOVED by the leader - shutting down...", selfAddress) - shutdown() + publishState() + cluster.shutdown() } /** - * INTERNAL API. - * * The node to DOWN is removed from the 'members' set and put in the 'unreachable' set (if not already there) * and its status is set to DOWN. The node is also removed from the 'seen' table. * * The node will reside as DOWN in the 'unreachable' set until an explicit command JOIN command is sent directly * to this node and it will then go through the normal JOINING procedure. */ - @tailrec - final private[cluster] def downing(address: Address): Unit = { - val localState = state.get - val localGossip = localState.latestGossip + def downing(address: Address): Unit = { + val localGossip = latestGossip val localMembers = localGossip.members val localOverview = localGossip.overview val localSeen = localOverview.seen @@ -1080,44 +870,12 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) val newOverview = localOverview copy (seen = newSeen, unreachable = newUnreachablePlusNewlyDownedMembers) val newGossip = localGossip copy (overview = newOverview, members = newMembers) // update gossip val versionedGossip = newGossip :+ vclockNode - val newState = localState copy (latestGossip = versionedGossip seen selfAddress) + latestGossip = versionedGossip seen selfAddress - if (!state.compareAndSet(localState, newState)) downing(address) // recur if we fail the update - else { - notifyMembershipChangeListeners(localState, newState) - } + notifyListeners(localGossip) } - // Can be removed when gossip has been optimized - private val _receivedGossipCount = new AtomicLong - /** - * INTERNAL API. - */ - private[cluster] def receivedGossipCount: Long = _receivedGossipCount.get - - /** - * INTERNAL API. - */ - private[cluster] def mergeCount: Long = _mergeCount.get - - // Can be removed when gossip has been optimized - private val _mergeCount = new AtomicLong - - /** - * INTERNAL API. - */ - private[cluster] def mergeDetectedCount: Long = _mergeDetectedCount.get - - // Can be removed when gossip has been optimized - private val _mergeDetectedCount = new AtomicLong - - private val _mergeConflictCount = new AtomicLong - private def mergeRate(count: Long): Double = (count * 1000.0) / GossipInterval.toMillis - - /** - * INTERNAL API. - * * When conflicting versions of received and local [[akka.cluster.Gossip]] is detected * it's forwarded to the leader for conflict resolution. Trying to simultaneously * resolving conflicts at several nodes creates new conflicts. Therefore the leader resolves @@ -1125,9 +883,9 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) * limit of how many conflicts that are handled by second. If the limit is * exceeded the conflicting gossip messages are dropped and will reappear later. 
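 * A worked example of the limit, assuming the default
 * max-gossip-merge-rate of 5.0 (asserted in ClusterConfigSpec) and a
 * gossip-interval of one second:
 * {{{
 * def mergeRate(count: Long): Double = (count * 1000.0) / GossipInterval.toMillis
 * // 5 conflicts within a 1000 ms interval ⇒ rate 5.0, still resolved
 * // 6 conflicts within the same interval ⇒ rate 6.0 > 5.0, gossip dropped
 * }}}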
*/ - private[cluster] def receiveGossipMerge(merge: GossipMergeConflict): Unit = { - val count = _mergeConflictCount.incrementAndGet - val rate = mergeRate(count) + def receiveGossipMerge(merge: GossipMergeConflict): Unit = { + stats = stats.incrementMergeConflictCount + val rate = mergeRate(stats.mergeConflictCount) if (rate <= MaxGossipMergeRate) { receiveGossip(merge.a.copy(conversation = false)) receiveGossip(merge.b.copy(conversation = false)) @@ -1147,16 +905,12 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) } /** - * INTERNAL API. - * * Receive new gossip. */ - @tailrec - final private[cluster] def receiveGossip(envelope: GossipEnvelope): Unit = { + def receiveGossip(envelope: GossipEnvelope): Unit = { val from = envelope.from val remoteGossip = envelope.gossip - val localState = state.get - val localGossip = localState.latestGossip + val localGossip = latestGossip if (remoteGossip.overview.unreachable.exists(_.address == selfAddress)) { // FIXME how should we handle this situation? @@ -1172,13 +926,15 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) // delegate merge resolution to leader to reduce number of simultaneous resolves, // which will result in new conflicts - log.debug("Merge conflict [{}] detected [{}] <> [{}]", _mergeDetectedCount.incrementAndGet, selfAddress, from) + stats = stats.incrementMergeDetectedCount + log.debug("Merge conflict [{}] detected [{}] <> [{}]", stats.mergeDetectedCount, selfAddress, from) - val count = _mergeConflictCount.incrementAndGet - val rate = mergeRate(count) + stats = stats.incrementMergeConflictCount + val rate = mergeRate(stats.mergeConflictCount) if (rate <= MaxGossipMergeRate) { - val leaderConnection = clusterGossipConnectionFor(localGossip.leader.get) - leaderConnection ! GossipMergeConflict(GossipEnvelope(selfAddress, localGossip), envelope) + coreSender ! 
SendClusterMessage( + to = localGossip.leader.get, + msg = GossipMergeConflict(GossipEnvelope(selfAddress, localGossip), envelope)) } else { log.debug("Skipping gossip merge conflict due to rate [{}] / s ", rate) } @@ -1211,116 +967,53 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) } val newJoinInProgress = - if (localState.joinInProgress.isEmpty) localState.joinInProgress - else localState.joinInProgress -- + if (joinInProgress.isEmpty) joinInProgress + else joinInProgress -- winningGossip.members.map(_.address) -- winningGossip.overview.unreachable.map(_.address) - val newState = localState copy ( - latestGossip = winningGossip seen selfAddress, - joinInProgress = newJoinInProgress) + latestGossip = winningGossip seen selfAddress + joinInProgress = newJoinInProgress - // for all new joining nodes we optimistically remove them from the failure detector, since if we wait until - // we have won the CAS, then the node might be picked up by the reapUnreachableMembers task and moved to - // unreachable before we can remove the node from the failure detector - (newState.latestGossip.members -- localState.latestGossip.members).filter(_.status == Joining).foreach { - case node ⇒ failureDetector.remove(node.address) + // for all new joining nodes we remove them from the failure detector + (latestGossip.members -- localGossip.members).filter(_.status == Joining).foreach { + case node ⇒ cluster.failureDetector.remove(node.address) } - // if we won the race then update else try again - if (!state.compareAndSet(localState, newState)) receiveGossip(envelope) // recur if we fail the update - else { - log.debug("Cluster Node [{}] - Receiving gossip from [{}]", selfAddress, from) + log.debug("Cluster Node [{}] - Receiving gossip from [{}]", selfAddress, from) - if (conflict) { - _mergeCount.incrementAndGet - log.debug( - """Couldn't establish a causal relationship between "remote" gossip and "local" gossip - Remote[{}] - Local[{}] - merged them into [{}]""", - remoteGossip, localGossip, winningGossip) - } + if (conflict) { + stats = stats.incrementMergeCount + log.debug( + """Couldn't establish a causal relationship between "remote" gossip and "local" gossip - Remote[{}] - Local[{}] - merged them into [{}]""", + remoteGossip, localGossip, winningGossip) + } - _receivedGossipCount.incrementAndGet() - notifyMembershipChangeListeners(localState, newState) + stats = stats.incrementReceivedGossipCount + notifyListeners(localGossip) - if (envelope.conversation && - (conflict || (winningGossip ne remoteGossip) || (newState.latestGossip ne remoteGossip))) { - // send back gossip to sender when sender had different view, i.e. merge, or sender had - // older or sender had newer - gossipTo(from) - } + if (envelope.conversation && + (conflict || (winningGossip ne remoteGossip) || (latestGossip ne remoteGossip))) { + // send back gossip to sender when sender had different view, i.e. merge, or sender had + // older or sender had newer + gossipTo(from) } } } } - /** - * INTERNAL API. - */ - private[cluster] def receiveHeartbeat(from: Address): Unit = failureDetector heartbeat from + def mergeRate(count: Long): Double = (count * 1000.0) / GossipInterval.toMillis /** - * Joins the pre-configured contact points. + * Initiates a new round of gossip. */ - private def joinSeedNode(): Unit = clusterCommandDaemon ! ClusterUserAction.JoinSeedNode - - /** - * INTERNAL API. - * - * Gossips latest gossip to an address. 
- */ - private[cluster] def gossipTo(address: Address): Unit = - gossipTo(address, GossipEnvelope(selfAddress, latestGossip, conversation = true)) - - /** - * INTERNAL API. - */ - private[cluster] def oneWayGossipTo(address: Address): Unit = - gossipTo(address, GossipEnvelope(selfAddress, latestGossip, conversation = false)) - - private def gossipTo(address: Address, gossipMsg: GossipEnvelope): Unit = if (address != selfAddress) { - val connection = clusterGossipConnectionFor(address) - log.debug("Cluster Node [{}] - Gossiping to [{}]", selfAddress, connection) - connection ! gossipMsg - } - - /** - * Gossips latest gossip to a random member in the set of members passed in as argument. - * - * @return the used [[akka.actor.Address] if any - */ - private def gossipToRandomNodeOf(addresses: IndexedSeq[Address]): Option[Address] = { - log.debug("Cluster Node [{}] - Selecting random node to gossip to [{}]", selfAddress, addresses.mkString(", ")) - val peers = addresses filterNot (_ == selfAddress) // filter out myself - val peer = selectRandomNode(peers) - peer foreach gossipTo - peer - } - - /** - * INTERNAL API. - */ - private[cluster] def gossipToDeputyProbablity(membersSize: Int, unreachableSize: Int, nrOfDeputyNodes: Int): Double = { - if (nrOfDeputyNodes > membersSize) 1.0 - else if (nrOfDeputyNodes == 0) 0.0 - else (membersSize + unreachableSize) match { - case 0 ⇒ 0.0 - case sum ⇒ (nrOfDeputyNodes + unreachableSize).toDouble / sum - } - } - - /** - * INTERNAL API. - * - * Initates a new round of gossip. - */ - private[cluster] def gossip(): Unit = { - val localState = state.get - _mergeConflictCount.set(0) + def gossip(): Unit = { + stats = stats.copy(mergeConflictCount = 0) log.debug("Cluster Node [{}] - Initiating new round of gossip", selfAddress) - if (!isSingletonCluster(localState) && isAvailable(localState)) { - val localGossip = localState.latestGossip + if (!isSingletonCluster && isAvailable) { + val localGossip = latestGossip // important to not accidentally use `map` of the SortedSet, since the original order is not preserved val localMembers = localGossip.members.toIndexedSeq val localMembersSize = localMembers.size @@ -1357,94 +1050,15 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) } /** - * INTERNAL API. - */ - private[cluster] def heartbeat(): Unit = { - removeOverdueJoinInProgress() - val localState = state.get - - val beatTo = localState.latestGossip.members.toSeq.map(_.address) ++ localState.joinInProgress.keys - - val deadline = Deadline.now + HeartbeatInterval - for (address ← beatTo; if address != selfAddress) - clusterHeartbeatSender ! SendHeartbeat(selfHeartbeat, address, deadline) - } - - /** - * INTERNAL API. - * - * Reaps the unreachable members (moves them to the 'unreachable' list in the cluster overview) according to the failure detector's verdict. 
- */ - @tailrec - final private[cluster] def reapUnreachableMembers(): Unit = { - val localState = state.get - - if (!isSingletonCluster(localState) && isAvailable(localState)) { - // only scrutinize if we are a non-singleton cluster and available - - val localGossip = localState.latestGossip - val localOverview = localGossip.overview - val localMembers = localGossip.members - val localUnreachableMembers = localGossip.overview.unreachable - - val newlyDetectedUnreachableMembers = localMembers filterNot { member ⇒ failureDetector.isAvailable(member.address) } - - if (newlyDetectedUnreachableMembers.nonEmpty) { - - val newMembers = localMembers -- newlyDetectedUnreachableMembers - val newUnreachableMembers = localUnreachableMembers ++ newlyDetectedUnreachableMembers - - val newOverview = localOverview copy (unreachable = newUnreachableMembers) - val newGossip = localGossip copy (overview = newOverview, members = newMembers) - - // updating vclock and 'seen' table - val versionedGossip = newGossip :+ vclockNode - val seenVersionedGossip = versionedGossip seen selfAddress - - val newState = localState copy (latestGossip = seenVersionedGossip) - - // if we won the race then update else try again - if (!state.compareAndSet(localState, newState)) reapUnreachableMembers() // recur - else { - log.error("Cluster Node [{}] - Marking node(s) as UNREACHABLE [{}]", selfAddress, newlyDetectedUnreachableMembers.mkString(", ")) - - notifyMembershipChangeListeners(localState, newState) - } - } - } - } - - /** - * INTERNAL API. - * - * Removes overdue joinInProgress from State. - */ - @tailrec - final private[cluster] def removeOverdueJoinInProgress(): Unit = { - val localState = state.get - val overdueJoins = localState.joinInProgress collect { - case (address, deadline) if deadline.isOverdue ⇒ address - } - if (overdueJoins.nonEmpty) { - val newState = localState copy (joinInProgress = localState.joinInProgress -- overdueJoins) - if (!state.compareAndSet(localState, newState)) removeOverdueJoinInProgress() // recur - } - } - - /** - * INTERNAL API. - * * Runs periodic leader actions, such as auto-downing unreachable nodes, assigning partitions etc. */ - @tailrec - final private[cluster] def leaderActions(): Unit = { - val localState = state.get - val localGossip = localState.latestGossip + def leaderActions(): Unit = { + val localGossip = latestGossip val localMembers = localGossip.members val isLeader = localMembers.nonEmpty && (selfAddress == localMembers.head.address) - if (isLeader && isAvailable(localState)) { + if (isLeader && isAvailable) { // only run the leader actions if we are the LEADER and available val localOverview = localGossip.overview @@ -1475,7 +1089,7 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) removedMembers, unreachableButNotDownedMembers) = - if (convergence(localGossip).isDefined) { + if (localGossip.convergence) { // we have convergence - so we can't have unreachable nodes // transform the node member ring - filterNot/map/map @@ -1568,193 +1182,485 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) if (removedMembers.exists(_.address == selfAddress)) versionedGossip else versionedGossip seen selfAddress - val newState = localState copy (latestGossip = seenVersionedGossip) + // ---------------------- + // 8. Update the state with the new gossip + // ---------------------- + latestGossip = seenVersionedGossip // ---------------------- - // 8. Try to update the state with the new gossip + // 9. 
Run all the side-effecting processing // ---------------------- - if (!state.compareAndSet(localState, newState)) { - // ---------------------- - // 9. Failure - retry - // ---------------------- - leaderActions() // recur + // log the move of members from joining to up + upMembers foreach { member ⇒ log.info("Cluster Node [{}] - Leader is moving node [{}] from JOINING to UP", selfAddress, member.address) } - } else { - // ---------------------- - // 10. Success - run all the side-effecting processing - // ---------------------- - - // log the move of members from joining to up - upMembers foreach { member ⇒ log.info("Cluster Node [{}] - Leader is moving node [{}] from JOINING to UP", selfAddress, member.address) } - - // tell all removed members to remove and shut down themselves - removedMembers foreach { member ⇒ - val address = member.address - log.info("Cluster Node [{}] - Leader is moving node [{}] from EXITING to REMOVED - and removing node from node ring", selfAddress, address) - clusterCommandConnectionFor(address) ! ClusterLeaderAction.Remove(address) - } - - // tell all exiting members to exit - exitingMembers foreach { member ⇒ - val address = member.address - log.info("Cluster Node [{}] - Leader is moving node [{}] from LEAVING to EXITING", selfAddress, address) - clusterCommandConnectionFor(address) ! ClusterLeaderAction.Exit(address) // FIXME should use ? to await completion of handoff? - } - - // log the auto-downing of the unreachable nodes - unreachableButNotDownedMembers foreach { member ⇒ - log.info("Cluster Node [{}] - Leader is marking unreachable node [{}] as DOWN", selfAddress, member.address) - } - - notifyMembershipChangeListeners(localState, newState) + // tell all removed members to remove and shut down themselves + removedMembers foreach { member ⇒ + val address = member.address + log.info("Cluster Node [{}] - Leader is moving node [{}] from EXITING to REMOVED - and removing node from node ring", selfAddress, address) + coreSender ! SendClusterMessage( + to = address, + msg = ClusterLeaderAction.Remove(address)) } + + // tell all exiting members to exit + exitingMembers foreach { member ⇒ + val address = member.address + log.info("Cluster Node [{}] - Leader is moving node [{}] from LEAVING to EXITING", selfAddress, address) + coreSender ! SendClusterMessage( + to = address, + msg = ClusterLeaderAction.Exit(address)) // FIXME should use ? to await completion of handoff? + } + + // log the auto-downing of the unreachable nodes + unreachableButNotDownedMembers foreach { member ⇒ + log.info("Cluster Node [{}] - Leader is marking unreachable node [{}] as DOWN", selfAddress, member.address) + } + + notifyListeners(localGossip) } } } + def heartbeat(): Unit = { + removeOverdueJoinInProgress() + + val beatTo = latestGossip.members.toSeq.map(_.address) ++ joinInProgress.keys + + val deadline = Deadline.now + HeartbeatInterval + for (address ← beatTo; if address != selfAddress) + heartbeatSender ! SendHeartbeat(selfHeartbeat, address, deadline) + } + /** - * Checks if we have a cluster convergence. If there are any unreachable nodes then we can't have a convergence - - * waiting for user to act (issuing DOWN) or leader to act (issuing DOWN through auto-down). - * - * @returns Some(convergedGossip) if convergence have been reached and None if not + * Removes overdue joinInProgress from State. 
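 * The bookkeeping is a plain Deadline map (a sketch; the real code below
 * uses collect to the same effect):
 * {{{
 * // on join, record the attempt with the configured join-timeout (60 s by default)
 * joinInProgress += address -> (Deadline.now + JoinTimeout)
 * // on every heartbeat tick, drop the entries whose deadline has passed
 * joinInProgress = joinInProgress filterNot { case (_, deadline) ⇒ deadline.isOverdue }
 * }}}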
*/ - private def convergence(gossip: Gossip): Option[Gossip] = { - val overview = gossip.overview - val unreachable = overview.unreachable - val seen = overview.seen - - // First check that: - // 1. we don't have any members that are unreachable, or - // 2. all unreachable members in the set have status DOWN - // Else we can't continue to check for convergence - // When that is done we check that all the entries in the 'seen' table have the same vector clock version - // and that all members exists in seen table - val hasUnreachable = unreachable.nonEmpty && unreachable.exists { _.status != Down } - val allMembersInSeen = gossip.members.forall(m ⇒ seen.contains(m.address)) - - if (hasUnreachable) { - log.debug("Cluster Node [{}] - No cluster convergence, due to unreachable nodes [{}].", selfAddress, unreachable) - None - } else if (!allMembersInSeen) { - log.debug("Cluster Node [{}] - No cluster convergence, due to members not in seen table [{}].", selfAddress, - gossip.members.map(_.address) -- seen.keySet) - None - } else { - - val views = seen.values.toSet.size - - if (views == 1) { - log.debug("Cluster Node [{}] - Cluster convergence reached: [{}]", selfAddress, gossip.members.mkString(", ")) - Some(gossip) - } else { - log.debug("Cluster Node [{}] - No cluster convergence, since not all nodes have seen the same state yet. [{} of {}]", - selfAddress, views, seen.values.size) - None - } + def removeOverdueJoinInProgress(): Unit = { + val overdueJoins = joinInProgress collect { + case (address, deadline) if deadline.isOverdue ⇒ address + } + if (overdueJoins.nonEmpty) { + joinInProgress = joinInProgress -- overdueJoins } } - private def isAvailable(state: State): Boolean = !isUnavailable(state) + /** + * Reaps the unreachable members (moves them to the 'unreachable' list in the cluster overview) according to the failure detector's verdict. 
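+ *
+ * The essence of the move, as a sketch (the full implementation below also bumps the vector
+ * clock and the 'seen' table):
+ * {{{
+ * val suspects = localMembers filterNot { member ⇒ cluster.failureDetector.isAvailable(member.address) }
+ * // suspects are removed from gossip.members and added to gossip.overview.unreachable
+ * }}}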
+ */ + def reapUnreachableMembers(): Unit = { - private def isUnavailable(state: State): Boolean = { - val localGossip = state.latestGossip - val isUnreachable = localGossip.overview.unreachable exists { _.address == selfAddress } - val hasUnavailableMemberStatus = localGossip.members exists { m ⇒ (m == self) && m.status.isUnavailable } - isUnreachable || hasUnavailableMemberStatus + if (!isSingletonCluster && isAvailable) { + // only scrutinize if we are a non-singleton cluster and available + + val localGossip = latestGossip + val localOverview = localGossip.overview + val localMembers = localGossip.members + val localUnreachableMembers = localGossip.overview.unreachable + + val newlyDetectedUnreachableMembers = localMembers filterNot { member ⇒ cluster.failureDetector.isAvailable(member.address) } + + if (newlyDetectedUnreachableMembers.nonEmpty) { + + val newMembers = localMembers -- newlyDetectedUnreachableMembers + val newUnreachableMembers = localUnreachableMembers ++ newlyDetectedUnreachableMembers + + val newOverview = localOverview copy (unreachable = newUnreachableMembers) + val newGossip = localGossip copy (overview = newOverview, members = newMembers) + + // updating vclock and 'seen' table + val versionedGossip = newGossip :+ vclockNode + val seenVersionedGossip = versionedGossip seen selfAddress + + latestGossip = seenVersionedGossip + + log.error("Cluster Node [{}] - Marking node(s) as UNREACHABLE [{}]", selfAddress, newlyDetectedUnreachableMembers.mkString(", ")) + + notifyListeners(localGossip) + } + } } - private def notifyMembershipChangeListeners(oldState: State, newState: State): Unit = { - val oldMembersStatus = oldState.latestGossip.members.map(m ⇒ (m.address, m.status)) - val newMembersStatus = newState.latestGossip.members.map(m ⇒ (m.address, m.status)) - if (newMembersStatus != oldMembersStatus) - newState.memberMembershipChangeListeners foreach { _ notify newState.latestGossip.members } - } - - /** - * Looks up and returns the local cluster command connection. - */ - private def clusterCommandDaemon = system.actorFor(RootActorPath(selfAddress) / "system" / "cluster" / "commands") - - /** - * Looks up and returns the remote cluster command connection for the specific address. - */ - private def clusterCommandConnectionFor(address: Address): ActorRef = system.actorFor(RootActorPath(address) / "system" / "cluster" / "commands") - - /** - * Looks up and returns the remote cluster gossip connection for the specific address. - */ - private def clusterGossipConnectionFor(address: Address): ActorRef = system.actorFor(RootActorPath(address) / "system" / "cluster" / "gossip") - - private def clusterHeartbeatSender: ActorRef = system.actorFor(clusterDaemons.path / "heartbeatSender") - /** * Gets the addresses of a all the 'deputy' nodes - excluding this node if part of the group. */ - private def deputyNodes(addresses: IndexedSeq[Address]): IndexedSeq[Address] = + def deputyNodes(addresses: IndexedSeq[Address]): IndexedSeq[Address] = addresses filterNot (_ == selfAddress) intersect seedNodes - /** - * INTERNAL API. 
- */ - private[cluster] def selectRandomNode(addresses: IndexedSeq[Address]): Option[Address] = + def seedNodes: IndexedSeq[Address] = cluster.seedNodes + + def selectRandomNode(addresses: IndexedSeq[Address]): Option[Address] = if (addresses.isEmpty) None else Some(addresses(ThreadLocalRandom.current nextInt addresses.size)) - private def isSingletonCluster(currentState: State): Boolean = currentState.latestGossip.members.size == 1 + def isSingletonCluster: Boolean = latestGossip.isSingletonCluster + + def isAvailable: Boolean = latestGossip.isAvailable(selfAddress) /** - * Creates the cluster JMX MBean and registers it in the MBean server. + * Gossips latest gossip to a random member in the set of members passed in as argument. + * + * @return the used [[akka.actor.Address] if any */ - private def createMBean() = { - val mbean = new StandardMBean(classOf[ClusterNodeMBean]) with ClusterNodeMBean { + private def gossipToRandomNodeOf(addresses: IndexedSeq[Address]): Option[Address] = { + log.debug("Cluster Node [{}] - Selecting random node to gossip to [{}]", selfAddress, addresses.mkString(", ")) + val peers = addresses filterNot (_ == selfAddress) // filter out myself + val peer = selectRandomNode(peers) + peer foreach gossipTo + peer + } - // JMX attributes (bean-style) - - /* - * Sends a string to the JMX client that will list all nodes in the node ring as follows: - * {{{ - * Members: - * Member(address = akka://system0@localhost:5550, status = Up) - * Member(address = akka://system1@localhost:5551, status = Up) - * Unreachable: - * Member(address = akka://system2@localhost:5553, status = Down) - * }}} - */ - def getClusterStatus: String = { - val gossip = clusterNode.latestGossip - val unreachable = gossip.overview.unreachable - val metaData = gossip.meta - "\nMembers:\n\t" + gossip.members.mkString("\n\t") + - { if (unreachable.nonEmpty) "\nUnreachable:\n\t" + unreachable.mkString("\n\t") else "" } + - { if (metaData.nonEmpty) "\nMeta Data:\t" + metaData.toString else "" } - } - - def getMemberStatus: String = clusterNode.status.toString - - def getLeader: String = clusterNode.leader.toString - - def isSingleton: Boolean = clusterNode.isSingletonCluster - - def isConvergence: Boolean = clusterNode.convergence.isDefined - - def isAvailable: Boolean = clusterNode.isAvailable - - def isRunning: Boolean = clusterNode.isRunning - - // JMX commands - - def join(address: String) = clusterNode.join(AddressFromURIString(address)) - - def leave(address: String) = clusterNode.leave(AddressFromURIString(address)) - - def down(address: String) = clusterNode.down(AddressFromURIString(address)) - } - log.info("Cluster Node [{}] - registering cluster JMX MBean [{}]", selfAddress, clusterMBeanName) - try { - mBeanServer.registerMBean(mbean, clusterMBeanName) - } catch { - case e: InstanceAlreadyExistsException ⇒ // ignore - we are running multiple cluster nodes in the same JVM (probably for testing) + private[cluster] def gossipToDeputyProbablity(membersSize: Int, unreachableSize: Int, nrOfDeputyNodes: Int): Double = { + if (nrOfDeputyNodes > membersSize) 1.0 + else if (nrOfDeputyNodes == 0) 0.0 + else (membersSize + unreachableSize) match { + case 0 ⇒ 0.0 + case sum ⇒ (nrOfDeputyNodes + unreachableSize).toDouble / sum } } + + /** + * Gossips latest gossip to an address. 
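+ *
+ * Usage sketch, where `peer` is any address other than selfAddress:
+ * {{{
+ * gossipTo(peer)        // conversational round: GossipEnvelope(..., conversation = true)
+ * oneWayGossipTo(peer)  // fire-and-forget variant: conversation = false
+ * }}}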
+ */ + def gossipTo(address: Address): Unit = + gossipTo(address, GossipEnvelope(selfAddress, latestGossip, conversation = true)) + + def oneWayGossipTo(address: Address): Unit = + gossipTo(address, GossipEnvelope(selfAddress, latestGossip, conversation = false)) + + def gossipTo(address: Address, gossipMsg: GossipEnvelope): Unit = if (address != selfAddress) { + coreSender ! SendClusterMessage(address, gossipMsg) + } + + def notifyListeners(oldGossip: Gossip): Unit = { + if (PublishStateInterval == Duration.Zero) publishState + + val oldMembersStatus = oldGossip.members.map(m ⇒ (m.address, m.status)) + val newMembersStatus = latestGossip.members.map(m ⇒ (m.address, m.status)) + if (newMembersStatus != oldMembersStatus) + cluster notifyMembershipChangeListeners latestGossip.members + } + + def publishState(): Unit = { + cluster._latestGossip = latestGossip + cluster._latestStats = stats + } + + def ping(p: Ping): Unit = sender ! Pong(p) +} + +/** + * INTERNAL API. + * + * Supervisor managing the different Cluster daemons. + */ +private[cluster] final class ClusterDaemonSupervisor(cluster: Cluster) extends Actor with ActorLogging { + + val configuredDispatcher = cluster.settings.UseDispatcher + val core = context.actorOf(Props(new ClusterCore(cluster)). + withDispatcher(configuredDispatcher), name = "core") + val heartbeat = context.actorOf(Props(new ClusterHeartbeatDaemon(cluster)). + withDispatcher(configuredDispatcher), name = "heartbeat") + + def receive = Actor.emptyBehavior + + override def unhandled(unknown: Any): Unit = log.error("[{}] can not respond to messages - received [{}]", + self.path, unknown) +} + +/** + * Cluster Extension Id and factory for creating Cluster extension. + * Example: + * {{{ + * if (Cluster(system).isLeader) { ... } + * }}} + */ +object Cluster extends ExtensionId[Cluster] with ExtensionIdProvider { + override def get(system: ActorSystem): Cluster = super.get(system) + + override def lookup = Cluster + + override def createExtension(system: ExtendedActorSystem): Cluster = { + val clusterSettings = new ClusterSettings(system.settings.config, system.name) + + val failureDetector = { + import clusterSettings.{ FailureDetectorImplementationClass ⇒ fqcn } + system.dynamicAccess.createInstanceFor[FailureDetector]( + fqcn, Seq(classOf[ActorSystem] -> system, classOf[ClusterSettings] -> clusterSettings)).fold( + e ⇒ throw new ConfigurationException("Could not create custom failure detector [" + fqcn + "] due to:" + e.toString), + identity) + } + + new Cluster(system, failureDetector) + } +} + +/** + * This module is responsible for Gossiping cluster information. The abstraction maintains the list of live + * and dead members. Periodically i.e. every 1 second this module chooses a random member and initiates a round + * of Gossip with it. + *

+ * During each of these runs the member initiates gossip exchange according to the following rules:
+ *
+ *   1) Gossip to a random live member (if any).
+ *   2) If the member gossiped to at (1) was not a deputy, or the number of live members is less than the
+ *      number of deputy nodes, gossip to a random deputy with a certain probability depending on the
+ *      number of unreachable, deputy, and live members.
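+ *
+ * As a worked example of that probability, using gossipToDeputyProbablity as defined in ClusterCore:
+ * with 10 live members, 2 unreachable members, and 3 deputy nodes, the chance of the extra deputy
+ * gossip is (3 + 2).toDouble / (10 + 2) ≈ 0.42.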
+ * + * Example: + * {{{ + * if (Cluster(system).isLeader) { ... } + * }}} + */ +class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) extends Extension { clusterNode ⇒ + + /** + * Represents the state for this Cluster. Implemented using optimistic lockless concurrency. + * All state is represented by this immutable case class and managed by an AtomicReference. + */ + private case class State(memberMembershipChangeListeners: Set[MembershipChangeListener] = Set.empty) + + if (!system.provider.isInstanceOf[RemoteActorRefProvider]) + throw new ConfigurationException("ActorSystem[" + system + "] needs to have a 'RemoteActorRefProvider' enabled in the configuration") + + private val remote: RemoteActorRefProvider = system.provider.asInstanceOf[RemoteActorRefProvider] + + val remoteSettings = new RemoteSettings(system.settings.config, system.name) + val settings = new ClusterSettings(system.settings.config, system.name) + import settings._ + + val selfAddress = remote.transport.address + + private val _isRunning = new AtomicBoolean(true) + private val log = Logging(system, "Cluster") + + log.info("Cluster Node [{}] - is starting up...", selfAddress) + + private val state = new AtomicReference[State](State()) + + /** + * Read only view of cluster state, updated periodically by + * ClusterCore. Access with `latestGossip`. + */ + @volatile + private[cluster] var _latestGossip: Gossip = Gossip() + + /** + * INTERNAL API + * Read only view of internal cluster stats, updated periodically by + * ClusterCore. Access with `latestStats`. + */ + @volatile + private[cluster] var _latestStats = ClusterStats() + + // ======================================================== + // ===================== WORK DAEMONS ===================== + // ======================================================== + + /** + * INTERNAL API + */ + private[cluster] val clusterScheduler: Scheduler with Closeable = { + // FIXME consider moving clusterScheduler to ClusterCore actor + if (system.settings.SchedulerTickDuration > SchedulerTickDuration) { + log.info("Using a dedicated scheduler for cluster. 
Default scheduler can be used if configured " + + "with 'akka.scheduler.tick-duration' [{} ms] <= 'akka.cluster.scheduler.tick-duration' [{} ms].", + system.settings.SchedulerTickDuration.toMillis, SchedulerTickDuration.toMillis) + val threadFactory = system.threadFactory match { + case tf: MonitorableThreadFactory ⇒ tf.copy(name = tf.name + "-cluster-scheduler") + case tf ⇒ tf + } + val hwt = new HashedWheelTimer(log, + threadFactory, + SchedulerTickDuration, SchedulerTicksPerWheel) + new DefaultScheduler(hwt, log, system.dispatcher) + } else { + // delegate to system.scheduler, but don't close + val systemScheduler = system.scheduler + new Scheduler with Closeable { + // we are using system.scheduler, which we are not responsible for closing + def close(): Unit = () + def schedule(initialDelay: Duration, frequency: Duration, receiver: ActorRef, message: Any): Cancellable = + systemScheduler.schedule(initialDelay, frequency, receiver, message) + def schedule(initialDelay: Duration, frequency: Duration)(f: ⇒ Unit): Cancellable = + systemScheduler.schedule(initialDelay, frequency)(f) + def schedule(initialDelay: Duration, frequency: Duration, runnable: Runnable): Cancellable = + systemScheduler.schedule(initialDelay, frequency, runnable) + def scheduleOnce(delay: Duration, runnable: Runnable): Cancellable = + systemScheduler.scheduleOnce(delay, runnable) + def scheduleOnce(delay: Duration, receiver: ActorRef, message: Any): Cancellable = + systemScheduler.scheduleOnce(delay, receiver, message) + def scheduleOnce(delay: Duration)(f: ⇒ Unit): Cancellable = + systemScheduler.scheduleOnce(delay)(f) + } + } + } + + // create supervisor for daemons under path "/system/cluster" + private val clusterDaemons: ActorRef = { + implicit val timeout = Timeout(remoteSettings.RemoteSystemDaemonAckTimeout) + val createChild = CreateChild(Props(new ClusterDaemonSupervisor(this)). + withDispatcher(UseDispatcher), name = "cluster") + Await.result(system.systemGuardian ? createChild, timeout.duration) match { + case a: ActorRef ⇒ a + case e: Exception ⇒ throw e + } + } + + /** + * INTERNAL API + */ + private[cluster] def clusterCore: ActorRef = + system.actorFor(clusterDaemons.path / "core") + + system.registerOnTermination(shutdown()) + + log.info("Cluster Node [{}] - has started up successfully", selfAddress) + + // ====================================================== + // ===================== PUBLIC API ===================== + // ====================================================== + + def self: Member = latestGossip.member(selfAddress) + + /** + * Returns true if the cluster node is up and running, false if it is shut down. + */ + def isRunning: Boolean = _isRunning.get + + /** + * Latest gossip. + */ + def latestGossip: Gossip = _latestGossip + + /** + * Member status for this node ([[akka.cluster.MemberStatus]]). + * + * NOTE: If the node has been removed from the cluster (and shut down) then it's status is set to the 'REMOVED' tombstone state + * and is no longer present in the node ring or any other part of the gossiping state. However in order to maintain the + * model and the semantics the user would expect, this method will in this situation return `MemberStatus.Removed`. + */ + def status: MemberStatus = self.status + + /** + * Is this node the leader? + */ + def isLeader: Boolean = latestGossip.isLeader(selfAddress) + + /** + * Get the address of the current leader. 
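+ *
+ * Note that it throws IllegalStateException if no leader has been established yet.
+ * A usage sketch:
+ * {{{
+ * if (Cluster(system).isLeader) println("Leading from " + Cluster(system).leader)
+ * }}}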
+ */ + def leader: Address = latestGossip.leader match { + case Some(x) ⇒ x + case None ⇒ throw new IllegalStateException("There is no leader in this cluster") + } + + /** + * Is this node a singleton cluster? + */ + def isSingletonCluster: Boolean = latestGossip.isSingletonCluster + + /** + * Checks if we have a cluster convergence. + * + * @return Some(convergedGossip) if convergence have been reached and None if not + */ + def convergence: Option[Gossip] = latestGossip match { + case gossip if gossip.convergence ⇒ Some(gossip) + case _ ⇒ None + } + + /** + * Returns true if the node is UP or JOINING. + */ + def isAvailable: Boolean = latestGossip.isAvailable(selfAddress) + + /** + * Make it possible to override/configure seedNodes from tests without + * specifying in config. Addresses are unknown before startup time. + */ + def seedNodes: IndexedSeq[Address] = SeedNodes + + /** + * Registers a listener to subscribe to cluster membership changes. + */ + @tailrec + final def registerListener(listener: MembershipChangeListener): Unit = { + val localState = state.get + val newListeners = localState.memberMembershipChangeListeners + listener + val newState = localState copy (memberMembershipChangeListeners = newListeners) + if (!state.compareAndSet(localState, newState)) registerListener(listener) // recur + } + + /** + * Unsubscribes to cluster membership changes. + */ + @tailrec + final def unregisterListener(listener: MembershipChangeListener): Unit = { + val localState = state.get + val newListeners = localState.memberMembershipChangeListeners - listener + val newState = localState copy (memberMembershipChangeListeners = newListeners) + if (!state.compareAndSet(localState, newState)) unregisterListener(listener) // recur + } + + /** + * Try to join this cluster node with the node specified by 'address'. + * A 'Join(thisNodeAddress)' command is sent to the node to join. + */ + def join(address: Address): Unit = + clusterCore ! ClusterUserAction.JoinTo(address) + + /** + * Send command to issue state transition to LEAVING for the node specified by 'address'. + */ + def leave(address: Address): Unit = + clusterCore ! ClusterUserAction.Leave(address) + + /** + * Send command to DOWN the node specified by 'address'. + */ + def down(address: Address): Unit = + clusterCore ! ClusterUserAction.Down(address) + + // ======================================================== + // ===================== INTERNAL API ===================== + // ======================================================== + + /** + * INTERNAL API. + * + * Shuts down all connections to other members, the cluster daemon and the periodic gossip and cleanup tasks. + * + * Should not called by the user. The user can issue a LEAVE command which will tell the node + * to go through graceful handoff process `LEAVE -> EXITING -> REMOVED -> SHUTDOWN`. 
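+ *
+ * The graceful alternative, as a sketch:
+ * {{{
+ * val cluster = Cluster(system)
+ * cluster.leave(cluster.selfAddress) // LEAVING -> EXITING -> REMOVED, then shut down
+ * }}}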
+ */ + private[cluster] def shutdown(): Unit = { + if (_isRunning.compareAndSet(true, false)) { + log.info("Cluster Node [{}] - Shutting down cluster Node and cluster daemons...", selfAddress) + + // FIXME isTerminated check can be removed when ticket #2221 is fixed + // now it prevents logging if system is shutdown (or in progress of shutdown) + if (!clusterDaemons.isTerminated) + system.stop(clusterDaemons) + + clusterScheduler.close() + + log.info("Cluster Node [{}] - Cluster node successfully shut down", selfAddress) + } + } + + /** + * INTERNAL API + */ + private[cluster] def notifyMembershipChangeListeners(members: SortedSet[Member]): Unit = { + // FIXME run callbacks async (to not block the cluster) + state.get.memberMembershipChangeListeners foreach { _ notify members } + } + + /** + * INTERNAL API + */ + private[cluster] def latestStats: ClusterStats = _latestStats + + // FIXME add back JMX + } diff --git a/akka-cluster/src/main/scala/akka/cluster/ClusterSettings.scala b/akka-cluster/src/main/scala/akka/cluster/ClusterSettings.scala index e9d95de446..d48db5446c 100644 --- a/akka-cluster/src/main/scala/akka/cluster/ClusterSettings.scala +++ b/akka-cluster/src/main/scala/akka/cluster/ClusterSettings.scala @@ -32,8 +32,7 @@ class ClusterSettings(val config: Config, val systemName: String) { final val HeartbeatInterval: Duration = Duration(getMilliseconds("akka.cluster.heartbeat-interval"), MILLISECONDS) final val LeaderActionsInterval: Duration = Duration(getMilliseconds("akka.cluster.leader-actions-interval"), MILLISECONDS) final val UnreachableNodesReaperInterval: Duration = Duration(getMilliseconds("akka.cluster.unreachable-nodes-reaper-interval"), MILLISECONDS) - final val NrOfGossipDaemons: Int = getInt("akka.cluster.nr-of-gossip-daemons") - final val NrOfDeputyNodes: Int = getInt("akka.cluster.nr-of-deputy-nodes") + final val PublishStateInterval: Duration = Duration(getMilliseconds("akka.cluster.publish-state-interval"), MILLISECONDS) final val AutoJoin: Boolean = getBoolean("akka.cluster.auto-join") final val AutoDown: Boolean = getBoolean("akka.cluster.auto-down") final val JoinTimeout: Duration = Duration(getMilliseconds("akka.cluster.join-timeout"), MILLISECONDS) diff --git a/akka-cluster/src/main/scala/akka/cluster/FixedRateTask.scala b/akka-cluster/src/main/scala/akka/cluster/FixedRateTask.scala index 25ef058465..c7799fc5c8 100644 --- a/akka-cluster/src/main/scala/akka/cluster/FixedRateTask.scala +++ b/akka-cluster/src/main/scala/akka/cluster/FixedRateTask.scala @@ -7,9 +7,9 @@ package akka.cluster import java.util.concurrent.TimeUnit import java.util.concurrent.atomic.AtomicBoolean import java.util.concurrent.atomic.AtomicLong - import akka.actor.Scheduler import akka.util.Duration +import akka.actor.Cancellable /** * INTERNAL API @@ -27,7 +27,8 @@ private[akka] object FixedRateTask { * for inaccuracy in scheduler. It will start when constructed, using the * initialDelay. 
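 *
 * Drift compensation, as a sketch of the scheduling rule used below:
 * {{{
 * nextTime = startTime + delay.toNanos * n // absolute schedule, so per-tick jitter does not accumulate
 * }}}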
*/ -private[akka] class FixedRateTask(scheduler: Scheduler, initalDelay: Duration, delay: Duration, task: Runnable) extends Runnable { +private[akka] class FixedRateTask(scheduler: Scheduler, initalDelay: Duration, delay: Duration, task: Runnable) + extends Runnable with Cancellable { private val delayNanos = delay.toNanos private val cancelled = new AtomicBoolean(false) @@ -37,9 +38,11 @@ private[akka] class FixedRateTask(scheduler: Scheduler, initalDelay: Duration, d def cancel(): Unit = cancelled.set(true) - override final def run(): Unit = if (!cancelled.get) try { + def isCancelled: Boolean = cancelled.get + + override final def run(): Unit = if (!isCancelled) try { task.run() - } finally if (!cancelled.get) { + } finally if (!isCancelled) { val nextTime = startTime + delayNanos * counter.incrementAndGet // it's ok to schedule with negative duration, will run asap val nextDelay = Duration(nextTime - System.nanoTime, TimeUnit.NANOSECONDS) diff --git a/akka-cluster/src/multi-jvm/scala/akka/cluster/JoinSeedNodeSpec.scala b/akka-cluster/src/multi-jvm/scala/akka/cluster/JoinSeedNodeSpec.scala index 20dec26a45..ef52d9e131 100644 --- a/akka-cluster/src/multi-jvm/scala/akka/cluster/JoinSeedNodeSpec.scala +++ b/akka-cluster/src/multi-jvm/scala/akka/cluster/JoinSeedNodeSpec.scala @@ -36,6 +36,10 @@ abstract class JoinSeedNodeSpec "A cluster with configured seed nodes" must { "start the seed nodes sequentially" taggedAs LongRunningTest in { + // without looking up the addresses first there might be + // [akka://JoinSeedNodeSpec/user/TestConductorClient] cannot write GetAddress(RoleName(seed2)) while waiting for seed1 + roles foreach address + runOn(seed1) { startClusterNode() } diff --git a/akka-cluster/src/multi-jvm/scala/akka/cluster/LargeClusterSpec.scala b/akka-cluster/src/multi-jvm/scala/akka/cluster/LargeClusterSpec.scala index 014983426f..aefc9762a8 100644 --- a/akka-cluster/src/multi-jvm/scala/akka/cluster/LargeClusterSpec.scala +++ b/akka-cluster/src/multi-jvm/scala/akka/cluster/LargeClusterSpec.scala @@ -36,8 +36,8 @@ object LargeClusterMultiJvmSpec extends MultiNodeConfig { akka.cluster { gossip-interval = 500 ms auto-join = off - nr-of-gossip-daemons = 2 failure-detector.acceptable-heartbeat-pause = 10s + publish-state-interval = 0 s # always, when it happens } akka.loglevel = INFO akka.actor.default-dispatcher.fork-join-executor { @@ -133,8 +133,10 @@ abstract class LargeClusterSpec val clusterNodes = ifNode(from)(joiningClusterNodes)(systems.map(Cluster(_)).toSet) val startGossipCounts = Map.empty[Cluster, Long] ++ - clusterNodes.map(c ⇒ (c -> c.receivedGossipCount)) - def gossipCount(c: Cluster): Long = c.receivedGossipCount - startGossipCounts(c) + clusterNodes.map(c ⇒ (c -> c.latestStats.receivedGossipCount)) + def gossipCount(c: Cluster): Long = { + c.latestStats.receivedGossipCount - startGossipCounts(c) + } val startTime = System.nanoTime def tookMillis: String = TimeUnit.NANOSECONDS.toMillis(System.nanoTime - startTime) + " ms" @@ -259,8 +261,10 @@ abstract class LargeClusterSpec within(30.seconds + (3.seconds * liveNodes)) { val startGossipCounts = Map.empty[Cluster, Long] ++ - systems.map(sys ⇒ (Cluster(sys) -> Cluster(sys).receivedGossipCount)) - def gossipCount(c: Cluster): Long = c.receivedGossipCount - startGossipCounts(c) + systems.map(sys ⇒ (Cluster(sys) -> Cluster(sys).latestStats.receivedGossipCount)) + def gossipCount(c: Cluster): Long = { + c.latestStats.receivedGossipCount - startGossipCounts(c) + } val startTime = System.nanoTime def tookMillis: String = 
TimeUnit.NANOSECONDS.toMillis(System.nanoTime - startTime) + " ms" @@ -286,7 +290,7 @@ abstract class LargeClusterSpec runOn(firstDatacenter, thirdDatacenter, fourthDatacenter, fifthDatacenter) { Await.ready(latch, remaining) awaitCond(systems.forall(Cluster(_).convergence.isDefined)) - val mergeCount = systems.map(sys ⇒ Cluster(sys).mergeCount).sum + val mergeCount = systems.map(sys ⇒ Cluster(sys).latestStats.mergeCount).sum val counts = systems.map(sys ⇒ gossipCount(Cluster(sys))) val formattedStats = "mean=%s min=%s max=%s".format(counts.sum / nodesPerDatacenter, counts.min, counts.max) log.info("Convergence of [{}] nodes reached after failure, it took [{}], received [{}] gossip messages per node, merged [{}] times", diff --git a/akka-cluster/src/multi-jvm/scala/akka/cluster/MultiNodeClusterSpec.scala b/akka-cluster/src/multi-jvm/scala/akka/cluster/MultiNodeClusterSpec.scala index 3264c661b0..af0b38d447 100644 --- a/akka-cluster/src/multi-jvm/scala/akka/cluster/MultiNodeClusterSpec.scala +++ b/akka-cluster/src/multi-jvm/scala/akka/cluster/MultiNodeClusterSpec.scala @@ -27,6 +27,7 @@ object MultiNodeClusterSpec { leader-actions-interval = 200 ms unreachable-nodes-reaper-interval = 200 ms periodic-tasks-initial-delay = 300 ms + publish-state-interval = 0 s # always, when it happens } akka.test { single-expect-default = 5 s diff --git a/akka-cluster/src/multi-jvm/scala/akka/cluster/TransitionSpec.scala b/akka-cluster/src/multi-jvm/scala/akka/cluster/TransitionSpec.scala index c4e43b9abf..17c04e5ed0 100644 --- a/akka-cluster/src/multi-jvm/scala/akka/cluster/TransitionSpec.scala +++ b/akka-cluster/src/multi-jvm/scala/akka/cluster/TransitionSpec.scala @@ -9,8 +9,10 @@ import akka.remote.testkit.MultiNodeConfig import akka.remote.testkit.MultiNodeSpec import akka.testkit._ import akka.actor.Address +import akka.pattern.ask import akka.remote.testconductor.RoleName import MemberStatus._ +import InternalClusterAction._ object TransitionMultiJvmSpec extends MultiNodeConfig { val first = role("first") @@ -28,7 +30,8 @@ class TransitionMultiJvmNode3 extends TransitionSpec with FailureDetectorPuppetS abstract class TransitionSpec extends MultiNodeSpec(TransitionMultiJvmSpec) - with MultiNodeClusterSpec { + with MultiNodeClusterSpec + with ImplicitSender { import TransitionMultiJvmSpec._ @@ -67,6 +70,22 @@ abstract class TransitionSpec memberStatus(address) == status } + def leaderActions(): Unit = { + cluster.clusterCore ! LeaderActionsTick + awaitPing() + } + + def reapUnreachable(): Unit = { + cluster.clusterCore ! ReapUnreachableTick + awaitPing() + } + + def awaitPing(): Unit = { + val ping = Ping() + cluster.clusterCore ! ping + expectMsgPF() { case pong @ Pong(`ping`, _) ⇒ pong } + } + // DSL sugar for `role1 gossipTo role2` implicit def roleExtras(role: RoleName): RoleWrapper = new RoleWrapper(role) var gossipBarrierCounter = 0 @@ -83,7 +102,8 @@ abstract class TransitionSpec } runOn(fromRole) { enterBarrier("before-gossip-" + gossipBarrierCounter) - cluster.gossipTo(toRole) // send gossip + // send gossip + cluster.clusterCore ! 
InternalClusterAction.SendGossipTo(toRole) // gossip chat will synchronize the views awaitCond((Set(fromRole, toRole) -- seenLatestGossip).isEmpty) enterBarrier("after-gossip-" + gossipBarrierCounter) @@ -104,7 +124,7 @@ abstract class TransitionSpec cluster.isSingletonCluster must be(true) cluster.status must be(Joining) cluster.convergence.isDefined must be(true) - cluster.leaderActions() + leaderActions() cluster.status must be(Up) } @@ -127,7 +147,7 @@ abstract class TransitionSpec enterBarrier("convergence-joining-2") runOn(leader(first, second)) { - cluster.leaderActions() + leaderActions() memberStatus(first) must be(Up) memberStatus(second) must be(Up) } @@ -182,7 +202,7 @@ abstract class TransitionSpec enterBarrier("convergence-joining-3") runOn(leader(first, second, third)) { - cluster.leaderActions() + leaderActions() memberStatus(first) must be(Up) memberStatus(second) must be(Up) memberStatus(third) must be(Up) @@ -200,7 +220,8 @@ abstract class TransitionSpec // first non-leader gossipTo the other non-leader nonLeader(first, second, third).head gossipTo nonLeader(first, second, third).tail.head runOn(nonLeader(first, second, third).head) { - cluster.gossipTo(nonLeader(first, second, third).tail.head) + // send gossip + cluster.clusterCore ! InternalClusterAction.SendGossipTo(nonLeader(first, second, third).tail.head) } runOn(nonLeader(first, second, third).tail.head) { memberStatus(third) must be(Up) @@ -224,7 +245,7 @@ abstract class TransitionSpec "perform correct transitions when second becomes unavailble" taggedAs LongRunningTest in { runOn(third) { markNodeAsUnavailable(second) - cluster.reapUnreachableMembers() + reapUnreachable() cluster.latestGossip.overview.unreachable must contain(Member(second, Up)) seenLatestGossip must be(Set(third)) } diff --git a/akka-cluster/src/multi-jvm/scala/akka/cluster/UnreachableNodeRejoinsClusterSpec.scala b/akka-cluster/src/multi-jvm/scala/akka/cluster/UnreachableNodeRejoinsClusterSpec.scala index 34f8605af1..14f48bfbab 100644 --- a/akka-cluster/src/multi-jvm/scala/akka/cluster/UnreachableNodeRejoinsClusterSpec.scala +++ b/akka-cluster/src/multi-jvm/scala/akka/cluster/UnreachableNodeRejoinsClusterSpec.scala @@ -9,7 +9,7 @@ import akka.remote.testkit.MultiNodeSpec import akka.testkit._ import com.typesafe.config.ConfigFactory import akka.actor.Address -import akka.remote.testconductor.{RoleName, Direction} +import akka.remote.testconductor.{ RoleName, Direction } import akka.util.duration._ object UnreachableNodeRejoinsClusterMultiJvmSpec extends MultiNodeConfig { @@ -26,7 +26,6 @@ class UnreachableNodeRejoinsClusterWithFailureDetectorPuppetMultiJvmNode2 extend class UnreachableNodeRejoinsClusterWithFailureDetectorPuppetMultiJvmNode3 extends UnreachableNodeRejoinsClusterSpec with FailureDetectorPuppetStrategy class UnreachableNodeRejoinsClusterWithFailureDetectorPuppetMultiJvmNode4 extends UnreachableNodeRejoinsClusterSpec with FailureDetectorPuppetStrategy - class UnreachableNodeRejoinsClusterWithAccrualFailureDetectorMultiJvmNode1 extends UnreachableNodeRejoinsClusterSpec with AccrualFailureDetectorStrategy class UnreachableNodeRejoinsClusterWithAccrualFailureDetectorMultiJvmNode2 extends UnreachableNodeRejoinsClusterSpec with AccrualFailureDetectorStrategy class UnreachableNodeRejoinsClusterWithAccrualFailureDetectorMultiJvmNode3 extends UnreachableNodeRejoinsClusterSpec with AccrualFailureDetectorStrategy @@ -41,7 +40,6 @@ abstract class UnreachableNodeRejoinsClusterSpec roles.filterNot(_ == role) } - lazy val sortedRoles = 
roles.sorted lazy val master = sortedRoles(0) lazy val victim = sortedRoles(1) @@ -55,14 +53,14 @@ abstract class UnreachableNodeRejoinsClusterSpec "A cluster of " + roles.size + " members" must { "reach initial convergence" taggedAs LongRunningTest in { - awaitClusterUp(roles:_*) + awaitClusterUp(roles: _*) endBarrier } "mark a node as UNREACHABLE when we pull the network" taggedAs LongRunningTest in { runOn(first) { // pull network for victim node from all nodes - allBut(victim).foreach { roleName => + allBut(victim).foreach { roleName ⇒ testConductor.blackhole(victim, roleName, Direction.Both).await } } @@ -74,24 +72,28 @@ abstract class UnreachableNodeRejoinsClusterSpec allButVictim.foreach(markNodeAsUnavailable(_)) within(30 seconds) { // victim becomes all alone - awaitCond({ val gossip = cluster.latestGossip + awaitCond({ + val gossip = cluster.latestGossip gossip.overview.unreachable.size == (roles.size - 1) && gossip.members.size == 1 && - gossip.members.forall(_.status == MemberStatus.Up) }) + gossip.members.forall(_.status == MemberStatus.Up) + }) cluster.latestGossip.overview.unreachable.map(_.address) must be((allButVictim map address).toSet) cluster.convergence.isDefined must be(false) } } - runOn(allButVictim:_*) { + runOn(allButVictim: _*) { markNodeAsUnavailable(victim) within(30 seconds) { // victim becomes unreachable - awaitCond({ val gossip = cluster.latestGossip + awaitCond({ + val gossip = cluster.latestGossip gossip.overview.unreachable.size == 1 && gossip.members.size == (roles.size - 1) && - gossip.members.forall(_.status == MemberStatus.Up) }) - awaitSeenSameState(allButVictim map address:_*) + gossip.members.forall(_.status == MemberStatus.Up) + }) + awaitSeenSameState(allButVictim map address: _*) // still one unreachable cluster.latestGossip.overview.unreachable.size must be(1) cluster.latestGossip.overview.unreachable.head.address must be(node(victim).address) @@ -108,7 +110,7 @@ abstract class UnreachableNodeRejoinsClusterSpec cluster down victim } - runOn(allBut(victim):_*) { + runOn(allBut(victim): _*) { awaitUpConvergence(roles.size - 1, Seq(victim)) } @@ -118,7 +120,7 @@ abstract class UnreachableNodeRejoinsClusterSpec "allow node to REJOIN when the network is plugged back in" taggedAs LongRunningTest in { runOn(first) { // put the network back in - allBut(victim).foreach { roleName => + allBut(victim).foreach { roleName ⇒ testConductor.passThrough(victim, roleName, Direction.Both).await } } diff --git a/akka-cluster/src/test/scala/akka/cluster/ClusterConfigSpec.scala b/akka-cluster/src/test/scala/akka/cluster/ClusterConfigSpec.scala index 00af943c27..71504e6b2b 100644 --- a/akka-cluster/src/test/scala/akka/cluster/ClusterConfigSpec.scala +++ b/akka-cluster/src/test/scala/akka/cluster/ClusterConfigSpec.scala @@ -29,8 +29,8 @@ class ClusterConfigSpec extends AkkaSpec { HeartbeatInterval must be(1 second) LeaderActionsInterval must be(1 second) UnreachableNodesReaperInterval must be(1 second) + PublishStateInterval must be(1 second) JoinTimeout must be(60 seconds) - NrOfGossipDaemons must be(4) AutoJoin must be(true) AutoDown must be(false) UseDispatcher must be(Dispatchers.DefaultDispatcherId) diff --git a/akka-cluster/src/test/scala/akka/cluster/ClusterSpec.scala b/akka-cluster/src/test/scala/akka/cluster/ClusterSpec.scala index 68731b89b2..6640605bcd 100644 --- a/akka-cluster/src/test/scala/akka/cluster/ClusterSpec.scala +++ b/akka-cluster/src/test/scala/akka/cluster/ClusterSpec.scala @@ -50,6 +50,7 @@ class ClusterSpec extends 
AkkaSpec(ClusterSpec.config) with BeforeAndAfter { // 3 deputy nodes (addresses index 1, 2, 3) override def seedNodes = addresses.slice(1, 4) + /* FIXME This way of mocking is not possible any more... override def selectRandomNode(addresses: IndexedSeq[Address]): Option[Address] = { if (addresses.isEmpty) None else Some(addresses.toSeq(deterministicRandom.getAndIncrement % addresses.size)) @@ -71,24 +72,29 @@ class ClusterSpec extends AkkaSpec(ClusterSpec.config) with BeforeAndAfter { else _gossipToDeputyProbablity } + */ + } def memberStatus(address: Address): Option[MemberStatus] = cluster.latestGossip.members.collectFirst { case m if m.address == address ⇒ m.status } before { + /* FIXME cluster._gossipToDeputyProbablity = 0.0 + */ addresses foreach failureDetector.remove deterministicRandom.set(0) } + /* FIXME ignored due to actor refactoring, must be done in other way "A Cluster" must { - "use the address of the remote transport" in { + "use the address of the remote transport" ignore { cluster.selfAddress must be(selfAddress) } - "initially become singleton cluster when joining itself and reach convergence" in { + "initially become singleton cluster when joining itself and reach convergence" ignore { cluster.isSingletonCluster must be(false) // auto-join = off cluster.join(selfAddress) awaitCond(cluster.isSingletonCluster) @@ -96,11 +102,13 @@ class ClusterSpec extends AkkaSpec(ClusterSpec.config) with BeforeAndAfter { cluster.latestGossip.members.map(_.address) must be(Set(selfAddress)) memberStatus(selfAddress) must be(Some(MemberStatus.Joining)) cluster.convergence.isDefined must be(true) + /* FIXME cluster.leaderActions() + */ memberStatus(selfAddress) must be(Some(MemberStatus.Up)) } - "accept a joining node" in { + "accept a joining node" ignore { cluster.joining(addresses(1)) cluster.latestGossip.members.map(_.address) must be(Set(selfAddress, addresses(1))) memberStatus(addresses(1)) must be(Some(MemberStatus.Joining)) @@ -108,7 +116,7 @@ class ClusterSpec extends AkkaSpec(ClusterSpec.config) with BeforeAndAfter { expectMsg(GossipTo(addresses(1))) } - "accept a few more joining nodes" in { + "accept a few more joining nodes" ignore { for (a ← addresses.drop(2)) { cluster.joining(a) memberStatus(a) must be(Some(MemberStatus.Joining)) @@ -117,12 +125,12 @@ class ClusterSpec extends AkkaSpec(ClusterSpec.config) with BeforeAndAfter { cluster.latestGossip.members.map(_.address) must be(addresses.toSet) } - "order members by host and port" in { + "order members by host and port" ignore { // note the importance of using toSeq before map, otherwise it will not preserve the order cluster.latestGossip.members.toSeq.map(_.address) must be(addresses.toSeq) } - "gossip to random live node" in { + "gossip to random live node" ignore { cluster.gossip() cluster.gossip() cluster.gossip() @@ -136,7 +144,7 @@ class ClusterSpec extends AkkaSpec(ClusterSpec.config) with BeforeAndAfter { expectNoMsg(1 second) } - "use certain probability for gossiping to deputy node depending on the number of unreachable and live nodes" in { + "use certain probability for gossiping to deputy node depending on the number of unreachable and live nodes" ignore { cluster._gossipToDeputyProbablity = -1.0 // use real impl cluster.gossipToDeputyProbablity(10, 1, 2) must be < (cluster.gossipToDeputyProbablity(9, 1, 2)) cluster.gossipToDeputyProbablity(10, 1, 2) must be < (cluster.gossipToDeputyProbablity(10, 2, 2)) @@ -150,7 +158,7 @@ class ClusterSpec extends AkkaSpec(ClusterSpec.config) with BeforeAndAfter { 
cluster.gossipToDeputyProbablity(3, 7, 4) must be(1.0 plusOrMinus (0.0001)) } - "gossip to duputy node" in { + "gossip to duputy node" ignore { cluster._gossipToDeputyProbablity = 1.0 // always // we have configured 3 deputy nodes (seedNodes) @@ -170,7 +178,7 @@ class ClusterSpec extends AkkaSpec(ClusterSpec.config) with BeforeAndAfter { } - "gossip to random deputy node if number of live nodes is less than number of deputy nodes" in { + "gossip to random deputy node if number of live nodes is less than number of deputy nodes" ignore { cluster._gossipToDeputyProbablity = -1.0 // real impl // 0 and 2 still alive val dead = Set(addresses(1), addresses(3), addresses(4), addresses(5)) @@ -190,4 +198,5 @@ class ClusterSpec extends AkkaSpec(ClusterSpec.config) with BeforeAndAfter { } } + */ } From fbeb6017cc90f8488c6f43152dfc61e58c686b88 Mon Sep 17 00:00:00 2001 From: Patrik Nordwall Date: Wed, 4 Jul 2012 14:39:27 +0200 Subject: [PATCH 18/39] Remove gossip to deputy nodes, see #2310 --- .../src/main/resources/reference.conf | 5 - .../src/main/scala/akka/cluster/Cluster.scala | 34 +--- .../test/scala/akka/cluster/ClusterSpec.scala | 163 ++---------------- akka-docs/cluster/cluster.rst | 33 +--- 4 files changed, 27 insertions(+), 208 deletions(-) diff --git a/akka-cluster/src/main/resources/reference.conf b/akka-cluster/src/main/resources/reference.conf index 3ce4eca363..bcf288dfec 100644 --- a/akka-cluster/src/main/resources/reference.conf +++ b/akka-cluster/src/main/resources/reference.conf @@ -9,8 +9,6 @@ akka { cluster { # Initial contact points of the cluster. Nodes to join at startup if auto-join = on. - # The seed nodes also play the role of deputy nodes (the nodes responsible - # for breaking network partitions). # Comma separated full URIs defined by a string on the form of "akka://system@hostname:port" # Leave as empty if the node should be a singleton cluster. seed-nodes = [] @@ -27,9 +25,6 @@ akka { # network partition. auto-down = off - # the number of deputy nodes (the nodes responsible for breaking network partitions) - nr-of-deputy-nodes = 3 - # how long should the node wait before starting the periodic tasks maintenance tasks? periodic-tasks-initial-delay = 1s diff --git a/akka-cluster/src/main/scala/akka/cluster/Cluster.scala b/akka-cluster/src/main/scala/akka/cluster/Cluster.scala index 54dfd585ce..4d85d9c300 100644 --- a/akka-cluster/src/main/scala/akka/cluster/Cluster.scala +++ b/akka-cluster/src/main/scala/akka/cluster/Cluster.scala @@ -1022,7 +1022,7 @@ private[cluster] final class ClusterCore(cluster: Cluster) extends Actor with Ac val localUnreachableMembers = localGossip.overview.unreachable.toIndexedSeq val localUnreachableSize = localUnreachableMembers.size - // 1. gossip to a random alive member with preference to a member + // gossip to a random alive member with preference to a member // with older or newer gossip version val nodesWithdifferentView = { val localMemberAddressesSet = localGossip.members map { _.address } @@ -1038,14 +1038,6 @@ private[cluster] final class ClusterCore(cluster: Cluster) extends Actor with Ac else gossipToRandomNodeOf(localMemberAddresses) - // 2. 
gossip to a deputy nodes for facilitating partition healing - val deputies = deputyNodes(localMemberAddresses) - val alreadyGossipedToDeputy = gossipedToAlive.map(deputies.contains(_)).getOrElse(false) - if ((!alreadyGossipedToDeputy || localMembersSize < seedNodes.size) && deputies.nonEmpty) { - val probability = gossipToDeputyProbablity(localMembersSize, localUnreachableSize, seedNodes.size) - if (ThreadLocalRandom.current.nextDouble() < probability) - gossipToRandomNodeOf(deputies) - } } } @@ -1280,12 +1272,6 @@ private[cluster] final class ClusterCore(cluster: Cluster) extends Actor with Ac } } - /** - * Gets the addresses of a all the 'deputy' nodes - excluding this node if part of the group. - */ - def deputyNodes(addresses: IndexedSeq[Address]): IndexedSeq[Address] = - addresses filterNot (_ == selfAddress) intersect seedNodes - def seedNodes: IndexedSeq[Address] = cluster.seedNodes def selectRandomNode(addresses: IndexedSeq[Address]): Option[Address] = @@ -1309,15 +1295,6 @@ private[cluster] final class ClusterCore(cluster: Cluster) extends Actor with Ac peer } - private[cluster] def gossipToDeputyProbablity(membersSize: Int, unreachableSize: Int, nrOfDeputyNodes: Int): Double = { - if (nrOfDeputyNodes > membersSize) 1.0 - else if (nrOfDeputyNodes == 0) 0.0 - else (membersSize + unreachableSize) match { - case 0 ⇒ 0.0 - case sum ⇒ (nrOfDeputyNodes + unreachableSize).toDouble / sum - } - } - /** * Gossips latest gossip to an address. */ @@ -1399,12 +1376,9 @@ object Cluster extends ExtensionId[Cluster] with ExtensionIdProvider { * and dead members. Periodically i.e. every 1 second this module chooses a random member and initiates a round * of Gossip with it. *

- * During each of these runs the member initiates gossip exchange according to the following rules:
- *
- *   1) Gossip to a random live member (if any).
- *   2) If the member gossiped to at (1) was not a deputy, or the number of live members is less than the
- *      number of deputy nodes, gossip to a random deputy with a certain probability depending on the
- *      number of unreachable, deputy, and live members.
+ * During each round of gossip exchange it sends Gossip to random node with + * newer or older state information, if any, based on the current gossip overview, + * with some probability. Otherwise Gossip to any random live node. * * Example: * {{{ diff --git a/akka-cluster/src/test/scala/akka/cluster/ClusterSpec.scala b/akka-cluster/src/test/scala/akka/cluster/ClusterSpec.scala index 6640605bcd..5812586a3f 100644 --- a/akka-cluster/src/test/scala/akka/cluster/ClusterSpec.scala +++ b/akka-cluster/src/test/scala/akka/cluster/ClusterSpec.scala @@ -5,13 +5,14 @@ package akka.cluster import akka.testkit.AkkaSpec +import akka.testkit.ImplicitSender import akka.util.duration._ import akka.util.Duration import akka.actor.ExtendedActorSystem import akka.actor.Address import java.util.concurrent.atomic.AtomicInteger -import org.scalatest.BeforeAndAfter import akka.remote.RemoteActorRefProvider +import InternalClusterAction._ object ClusterSpec { val config = """ @@ -19,6 +20,7 @@ object ClusterSpec { auto-join = off auto-down = off periodic-tasks-initial-delay = 120 seconds // turn off scheduled tasks + publish-state-interval = 0 s # always, when it happens } akka.actor.provider = "akka.remote.RemoteActorRefProvider" akka.remote.netty.port = 0 @@ -29,174 +31,43 @@ object ClusterSpec { } @org.junit.runner.RunWith(classOf[org.scalatest.junit.JUnitRunner]) -class ClusterSpec extends AkkaSpec(ClusterSpec.config) with BeforeAndAfter { +class ClusterSpec extends AkkaSpec(ClusterSpec.config) with ImplicitSender { import ClusterSpec._ val selfAddress = system.asInstanceOf[ExtendedActorSystem].provider.asInstanceOf[RemoteActorRefProvider].transport.address - val addresses = IndexedSeq( - selfAddress, - Address("akka", system.name, selfAddress.host.get, selfAddress.port.get + 1), - Address("akka", system.name, selfAddress.host.get, selfAddress.port.get + 2), - Address("akka", system.name, selfAddress.host.get, selfAddress.port.get + 3), - Address("akka", system.name, selfAddress.host.get, selfAddress.port.get + 4), - Address("akka", system.name, selfAddress.host.get, selfAddress.port.get + 5)) - - val deterministicRandom = new AtomicInteger val failureDetector = new FailureDetectorPuppet(system) - val cluster = new Cluster(system.asInstanceOf[ExtendedActorSystem], failureDetector) { - - // 3 deputy nodes (addresses index 1, 2, 3) - override def seedNodes = addresses.slice(1, 4) - - /* FIXME This way of mocking is not possible any more... - override def selectRandomNode(addresses: IndexedSeq[Address]): Option[Address] = { - if (addresses.isEmpty) None - else Some(addresses.toSeq(deterministicRandom.getAndIncrement % addresses.size)) - } - - override def gossipTo(address: Address): Unit = { - if (address == self.address) { - super.gossipTo(address) - } - // represent the gossip with a message to be used in asserts - testActor ! GossipTo(address) - } - - @volatile - var _gossipToDeputyProbablity = 0.0 - - override def gossipToDeputyProbablity(membersSize: Int, unreachableSize: Int, deputySize: Int): Double = { - if (_gossipToDeputyProbablity < 0.0) super.gossipToDeputyProbablity(membersSize, unreachableSize, deputySize) - else _gossipToDeputyProbablity - } - - */ + val cluster = new Cluster(system.asInstanceOf[ExtendedActorSystem], failureDetector) + def leaderActions(): Unit = { + cluster.clusterCore ! 
LeaderActionsTick + awaitPing() } - def memberStatus(address: Address): Option[MemberStatus] = - cluster.latestGossip.members.collectFirst { case m if m.address == address ⇒ m.status } - - before { - /* FIXME - cluster._gossipToDeputyProbablity = 0.0 - */ - addresses foreach failureDetector.remove - deterministicRandom.set(0) + def awaitPing(): Unit = { + val ping = Ping() + cluster.clusterCore ! ping + expectMsgPF() { case pong @ Pong(`ping`, _) ⇒ pong } } - /* FIXME ignored due to actor refactoring, must be done in other way "A Cluster" must { - "use the address of the remote transport" ignore { + "use the address of the remote transport" in { cluster.selfAddress must be(selfAddress) } - "initially become singleton cluster when joining itself and reach convergence" ignore { + "initially become singleton cluster when joining itself and reach convergence" in { cluster.isSingletonCluster must be(false) // auto-join = off cluster.join(selfAddress) awaitCond(cluster.isSingletonCluster) cluster.self.address must be(selfAddress) cluster.latestGossip.members.map(_.address) must be(Set(selfAddress)) - memberStatus(selfAddress) must be(Some(MemberStatus.Joining)) + cluster.status must be(MemberStatus.Joining) cluster.convergence.isDefined must be(true) - /* FIXME - cluster.leaderActions() - */ - memberStatus(selfAddress) must be(Some(MemberStatus.Up)) + leaderActions() + cluster.status must be(MemberStatus.Up) } - "accept a joining node" ignore { - cluster.joining(addresses(1)) - cluster.latestGossip.members.map(_.address) must be(Set(selfAddress, addresses(1))) - memberStatus(addresses(1)) must be(Some(MemberStatus.Joining)) - cluster.convergence.isDefined must be(false) - expectMsg(GossipTo(addresses(1))) - } - - "accept a few more joining nodes" ignore { - for (a ← addresses.drop(2)) { - cluster.joining(a) - memberStatus(a) must be(Some(MemberStatus.Joining)) - expectMsg(GossipTo(a)) - } - cluster.latestGossip.members.map(_.address) must be(addresses.toSet) - } - - "order members by host and port" ignore { - // note the importance of using toSeq before map, otherwise it will not preserve the order - cluster.latestGossip.members.toSeq.map(_.address) must be(addresses.toSeq) - } - - "gossip to random live node" ignore { - cluster.gossip() - cluster.gossip() - cluster.gossip() - cluster.gossip() - - expectMsg(GossipTo(addresses(1))) - expectMsg(GossipTo(addresses(2))) - expectMsg(GossipTo(addresses(3))) - expectMsg(GossipTo(addresses(4))) - - expectNoMsg(1 second) - } - - "use certain probability for gossiping to deputy node depending on the number of unreachable and live nodes" ignore { - cluster._gossipToDeputyProbablity = -1.0 // use real impl - cluster.gossipToDeputyProbablity(10, 1, 2) must be < (cluster.gossipToDeputyProbablity(9, 1, 2)) - cluster.gossipToDeputyProbablity(10, 1, 2) must be < (cluster.gossipToDeputyProbablity(10, 2, 2)) - cluster.gossipToDeputyProbablity(10, 1, 2) must be < (cluster.gossipToDeputyProbablity(10, 2, 3)) - cluster.gossipToDeputyProbablity(10, 5, 5) must be < (cluster.gossipToDeputyProbablity(10, 9, 5)) - cluster.gossipToDeputyProbablity(0, 10, 0) must be <= (1.0) - cluster.gossipToDeputyProbablity(1, 10, 1) must be <= (1.0) - cluster.gossipToDeputyProbablity(10, 0, 0) must be(0.0 plusOrMinus (0.0001)) - cluster.gossipToDeputyProbablity(0, 0, 0) must be(0.0 plusOrMinus (0.0001)) - cluster.gossipToDeputyProbablity(4, 0, 4) must be(1.0 plusOrMinus (0.0001)) - cluster.gossipToDeputyProbablity(3, 7, 4) must be(1.0 plusOrMinus (0.0001)) - } - - "gossip to duputy node" 
ignore { - cluster._gossipToDeputyProbablity = 1.0 // always - - // we have configured 3 deputy nodes (seedNodes) - cluster.gossip() // 1 is deputy - cluster.gossip() // 2 is deputy - cluster.gossip() // 3 is deputy - cluster.gossip() // 4 is not deputy, and therefore a deputy is also used - - expectMsg(GossipTo(addresses(1))) - expectMsg(GossipTo(addresses(2))) - expectMsg(GossipTo(addresses(3))) - expectMsg(GossipTo(addresses(4))) - // and the extra gossip to deputy - expectMsgAnyOf(GossipTo(addresses(1)), GossipTo(addresses(2)), GossipTo(addresses(3))) - - expectNoMsg(1 second) - - } - - "gossip to random deputy node if number of live nodes is less than number of deputy nodes" ignore { - cluster._gossipToDeputyProbablity = -1.0 // real impl - // 0 and 2 still alive - val dead = Set(addresses(1), addresses(3), addresses(4), addresses(5)) - dead foreach failureDetector.markNodeAsUnavailable - - cluster.reapUnreachableMembers() - cluster.latestGossip.overview.unreachable.map(_.address) must be(dead) - - for (n ← 1 to 20) { - cluster.gossip() - expectMsg(GossipTo(addresses(2))) // the only available - // and always to one of the 3 deputies - expectMsgAnyOf(GossipTo(addresses(1)), GossipTo(addresses(2)), GossipTo(addresses(3))) - } - - expectNoMsg(1 second) - - } } - */ } diff --git a/akka-docs/cluster/cluster.rst b/akka-docs/cluster/cluster.rst index cbad3ef690..040a2fef9f 100644 --- a/akka-docs/cluster/cluster.rst +++ b/akka-docs/cluster/cluster.rst @@ -57,9 +57,6 @@ These terms are used throughout the documentation. A single node in the cluster that acts as the leader. Managing cluster convergence, partitions, fail-over, rebalancing etc. -**deputy nodes** - A set of nodes responsible for breaking logical partitions. - Membership ========== @@ -192,16 +189,6 @@ then sends join command to the one that answers first. It is possible to turn off automatic join. -Deputy Nodes -^^^^^^^^^^^^ - -The deputy nodes are the live members of the configured seed nodes. -It is preferred to use deputy nodes in different racks/data centers. - -The nodes defined as ``deputy`` nodes are just regular member nodes whose only -"special role" is to help breaking logical partitions as seen in the gossip -algorithm defined below. - Gossip Protocol ^^^^^^^^^^^^^^^ @@ -219,26 +206,18 @@ nodes involved in a gossip exchange. Periodically, the default is every 1 second, each node chooses another random node to initiate a round of gossip with. The choice of node is random but can -also include extra gossiping for ``deputy`` nodes, and nodes with -either newer or older state versions. +also include extra gossiping nodes with either newer or older state versions. The gossip overview contains the current state version for all nodes and also a list of unreachable nodes. Whenever a node receives a gossip overview it updates the `Failure Detector`_ with the liveness information. -The nodes defined as ``deputy`` nodes are just regular member nodes whose only -"special role" is to function as contact points in the cluster and to help -breaking logical partitions as seen in the gossip algorithm defined below. +The nodes defined as ``seed`` nodes are just regular member nodes whose only +"special role" is to function as contact points in the cluster. -During each round of gossip exchange the following process is used: - -1. Gossip to random node with newer or older state information, if any, based on the - current gossip overview, with some probability. Otherwise Gossip to any random - live node. - -2. 
If the node gossiped to at (1) was not a ``deputy`` node, or the number of live - nodes is less than number of ``deputy`` nodes, gossip to random ``deputy`` node with - certain probability depending on number of unreachable, ``deputy``, and live nodes. +During each round of gossip exchange it sends Gossip to random node with +newer or older state information, if any, based on the current gossip overview, +with some probability. Otherwise Gossip to any random live node. The gossiper only sends the gossip overview to the chosen node. The recipient of the gossip can use the gossip overview to determine whether: From f89285a19c917238a5706251d37d63754aaa6938 Mon Sep 17 00:00:00 2001 From: Viktor Klang Date: Wed, 4 Jul 2012 15:33:39 +0200 Subject: [PATCH 19/39] Rewriting docs for 0mq Connect --- akka-zeromq/src/main/scala/akka/zeromq/SocketOption.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/akka-zeromq/src/main/scala/akka/zeromq/SocketOption.scala b/akka-zeromq/src/main/scala/akka/zeromq/SocketOption.scala index c5d5919fb7..51f92bbbe8 100644 --- a/akka-zeromq/src/main/scala/akka/zeromq/SocketOption.scala +++ b/akka-zeromq/src/main/scala/akka/zeromq/SocketOption.scala @@ -48,7 +48,7 @@ sealed trait SocketOptionQuery extends Request /** * This socket should be a client socket and connect to the specified endpoint * - * @param endpoint an uri like tcp://127.0.0.1.5432 + * @param endpoint URI (ex. tcp://127.0.0.1:5432) */ case class Connect(endpoint: String) extends SocketConnectOption From 6e9f6b12dcdfa28a9087d7166b30e72e843231f7 Mon Sep 17 00:00:00 2001 From: Viktor Klang Date: Wed, 4 Jul 2012 15:37:10 +0200 Subject: [PATCH 20/39] Shall not do that --- .../src/main/scala/akka/zeromq/SocketOption.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/akka-zeromq/src/main/scala/akka/zeromq/SocketOption.scala b/akka-zeromq/src/main/scala/akka/zeromq/SocketOption.scala index 51f92bbbe8..f5c782e849 100644 --- a/akka-zeromq/src/main/scala/akka/zeromq/SocketOption.scala +++ b/akka-zeromq/src/main/scala/akka/zeromq/SocketOption.scala @@ -162,14 +162,14 @@ case class PollTimeoutDuration(duration: Duration = 100 millis) extends SocketMe case class Bind(endpoint: String) extends SocketConnectOption /** - * The [[akka.zeromq.Subscribe]] option shall establish a new message filter on a [[akka.zeromq.SocketType.Pub]] socket. - * Newly created [[akka.zeromq.SocketType.Sub]] sockets shall filter out all incoming messages, + * The [[akka.zeromq.Subscribe]] option establishes a new message filter on a [[akka.zeromq.SocketType.Pub]] socket. + * Newly created [[akka.zeromq.SocketType.Sub]] sockets filter out all incoming messages, * therefore you should send this option to establish an initial message filter. * - * An empty payload of length zero shall subscribe to all incoming messages. - * A non-empty payload shall subscribe to all messages beginning with the specified prefix. + * An empty payload of length zero will subscribe to all incoming messages. + * A non-empty payload will subscribe to all messages beginning with the specified prefix. * Multiple filters may be attached to a single [[akka.zeromq.SocketType.Sub]] socket, - * in which case a message shall be accepted if it matches at least one filter. + * in which case a message will be accepted if it matches at least one filter. 
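+ *
+ * For example (a sketch; `subSocket` stands for a Sub socket actor created elsewhere):
+ * {{{
+ * subSocket ! Subscribe("")        // zero-length filter: receive everything
+ * subSocket ! Subscribe("weather") // only messages whose payload starts with "weather"
+ * }}}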
* * @param payload the topic to subscribe to */ From 9368475512b969a76bc798034f93e24fbe78342c Mon Sep 17 00:00:00 2001 From: Patrik Nordwall Date: Wed, 4 Jul 2012 16:02:13 +0200 Subject: [PATCH 21/39] Deleted NodeLeavingSpec, see #2289 * Because it was also time sensitive and already covered by NodeLeavingAndExitingSpec --- .../scala/akka/cluster/NodeLeavingSpec.scala | 55 ------------------- 1 file changed, 55 deletions(-) delete mode 100644 akka-cluster/src/multi-jvm/scala/akka/cluster/NodeLeavingSpec.scala diff --git a/akka-cluster/src/multi-jvm/scala/akka/cluster/NodeLeavingSpec.scala b/akka-cluster/src/multi-jvm/scala/akka/cluster/NodeLeavingSpec.scala deleted file mode 100644 index 9ece38aae8..0000000000 --- a/akka-cluster/src/multi-jvm/scala/akka/cluster/NodeLeavingSpec.scala +++ /dev/null @@ -1,55 +0,0 @@ -/** - * Copyright (C) 2009-2012 Typesafe Inc. - */ -package akka.cluster - -import scala.collection.immutable.SortedSet -import com.typesafe.config.ConfigFactory -import akka.remote.testkit.MultiNodeConfig -import akka.remote.testkit.MultiNodeSpec -import akka.testkit._ - -object NodeLeavingMultiJvmSpec extends MultiNodeConfig { - val first = role("first") - val second = role("second") - val third = role("third") - - commonConfig( - debugConfig(on = false) - .withFallback(ConfigFactory.parseString("akka.cluster.unreachable-nodes-reaper-frequency = 30 s")) - .withFallback(MultiNodeClusterSpec.clusterConfig)) -} - -class NodeLeavingMultiJvmNode1 extends NodeLeavingSpec with FailureDetectorPuppetStrategy -class NodeLeavingMultiJvmNode2 extends NodeLeavingSpec with FailureDetectorPuppetStrategy -class NodeLeavingMultiJvmNode3 extends NodeLeavingSpec with FailureDetectorPuppetStrategy - -abstract class NodeLeavingSpec - extends MultiNodeSpec(NodeLeavingMultiJvmSpec) - with MultiNodeClusterSpec { - - import NodeLeavingMultiJvmSpec._ - - "A node that is LEAVING a non-singleton cluster" must { - - "be marked as LEAVING in the converged membership table" taggedAs LongRunningTest in { - - awaitClusterUp(first, second, third) - - runOn(first) { - cluster.leave(second) - } - enterBarrier("second-left") - - runOn(first, third) { - awaitCond(cluster.latestGossip.members.exists(_.status == MemberStatus.Leaving)) - - val hasLeft = cluster.latestGossip.members.find(_.status == MemberStatus.Leaving) - hasLeft must be('defined) - hasLeft.get.address must be(address(second)) - } - - enterBarrier("finished") - } - } -} From c1d12550a4e79b6f5eb6ae3114d4fc0d6ba366bc Mon Sep 17 00:00:00 2001 From: Patrik Nordwall Date: Wed, 4 Jul 2012 16:09:01 +0200 Subject: [PATCH 22/39] Fix bugs, see #2311 --- akka-cluster/src/main/scala/akka/cluster/Cluster.scala | 9 ++++++--- .../src/multi-jvm/scala/akka/cluster/NodeUpSpec.scala | 4 ++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/akka-cluster/src/main/scala/akka/cluster/Cluster.scala b/akka-cluster/src/main/scala/akka/cluster/Cluster.scala index 4d85d9c300..0bf7ebeba1 100644 --- a/akka-cluster/src/main/scala/akka/cluster/Cluster.scala +++ b/akka-cluster/src/main/scala/akka/cluster/Cluster.scala @@ -795,7 +795,6 @@ private[cluster] final class ClusterCore(cluster: Cluster) extends Actor with Ac latestGossip = seenVersionedGossip log.info("Cluster Node [{}] - Marked address [{}] as LEAVING", selfAddress, address) - publishState() notifyListeners(localGossip) } } @@ -819,7 +818,11 @@ private[cluster] final class ClusterCore(cluster: Cluster) extends Actor with Ac */ def removing(address: Address): Unit = { log.info("Cluster Node [{}] - Node has been 
REMOVED by the leader - shutting down...", selfAddress) - publishState() + val localGossip = latestGossip + // just cleaning up the gossip state + latestGossip = Gossip() + // make sure the final (removed) state is always published + notifyListeners(localGossip) cluster.shutdown() } @@ -1309,7 +1312,7 @@ private[cluster] final class ClusterCore(cluster: Cluster) extends Actor with Ac } def notifyListeners(oldGossip: Gossip): Unit = { - if (PublishStateInterval == Duration.Zero) publishState + if (PublishStateInterval == Duration.Zero) publishState() val oldMembersStatus = oldGossip.members.map(m ⇒ (m.address, m.status)) val newMembersStatus = latestGossip.members.map(m ⇒ (m.address, m.status)) diff --git a/akka-cluster/src/multi-jvm/scala/akka/cluster/NodeUpSpec.scala b/akka-cluster/src/multi-jvm/scala/akka/cluster/NodeUpSpec.scala index 3da6b2715a..f951da0801 100644 --- a/akka-cluster/src/multi-jvm/scala/akka/cluster/NodeUpSpec.scala +++ b/akka-cluster/src/multi-jvm/scala/akka/cluster/NodeUpSpec.scala @@ -38,7 +38,7 @@ abstract class NodeUpSpec "be unaffected when joining again" taggedAs LongRunningTest in { - val unexpected = new AtomicReference[SortedSet[Member]] + val unexpected = new AtomicReference[SortedSet[Member]](SortedSet.empty) cluster.registerListener(new MembershipChangeListener { def notify(members: SortedSet[Member]) { if (members.size != 2 || members.exists(_.status != MemberStatus.Up)) @@ -55,7 +55,7 @@ abstract class NodeUpSpec // let it run for a while to make sure that nothing bad happens for (n ← 1 to 20) { 100.millis.dilated.sleep() - unexpected.get must be(null) + unexpected.get must be(SortedSet.empty) cluster.latestGossip.members.forall(_.status == MemberStatus.Up) must be(true) } From 05015536b38f1c08095a756d3ec9821147b5080f Mon Sep 17 00:00:00 2001 From: Patrik Nordwall Date: Wed, 4 Jul 2012 17:03:59 +0200 Subject: [PATCH 23/39] Add missing auto-down=on, due to changed default (lost in merge) --- .../src/multi-jvm/scala/akka/cluster/LargeClusterSpec.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/akka-cluster/src/multi-jvm/scala/akka/cluster/LargeClusterSpec.scala b/akka-cluster/src/multi-jvm/scala/akka/cluster/LargeClusterSpec.scala index aefc9762a8..08f7ca10fa 100644 --- a/akka-cluster/src/multi-jvm/scala/akka/cluster/LargeClusterSpec.scala +++ b/akka-cluster/src/multi-jvm/scala/akka/cluster/LargeClusterSpec.scala @@ -36,6 +36,7 @@ object LargeClusterMultiJvmSpec extends MultiNodeConfig { akka.cluster { gossip-interval = 500 ms auto-join = off + auto-down = on failure-detector.acceptable-heartbeat-pause = 10s publish-state-interval = 0 s # always, when it happens } From 1968c9e6c231d57831f0f1fb798a3d2d416f0160 Mon Sep 17 00:00:00 2001 From: Viktor Klang Date: Wed, 4 Jul 2012 18:18:57 +0200 Subject: [PATCH 24/39] Aesthetics change in OSGi sauce --- .../akka/osgi/OsgiActorSystemFactory.scala | 12 ++---- .../impl/BundleDelegatingClassLoader.scala | 40 ++++++------------- 2 files changed, 16 insertions(+), 36 deletions(-) diff --git a/akka-osgi/src/main/scala/akka/osgi/OsgiActorSystemFactory.scala b/akka-osgi/src/main/scala/akka/osgi/OsgiActorSystemFactory.scala index ae36406a60..6acb766bfb 100644 --- a/akka-osgi/src/main/scala/akka/osgi/OsgiActorSystemFactory.scala +++ b/akka-osgi/src/main/scala/akka/osgi/OsgiActorSystemFactory.scala @@ -14,7 +14,7 @@ class OsgiActorSystemFactory(val context: BundleContext) { /* * Classloader that delegates to the bundle for which the factory is creating an ActorSystem */ - private val classloader = 
BundleDelegatingClassLoader.createFor(context) + private val classloader = BundleDelegatingClassLoader(context) /** * Creates the [[akka.actor.ActorSystem]], using the name specified @@ -33,25 +33,21 @@ class OsgiActorSystemFactory(val context: BundleContext) { * Strategy method to create the Config for the ActorSystem, ensuring that the default/reference configuration is * loaded from the akka-actor bundle. */ - def actorSystemConfig(context: BundleContext): Config = { - val reference = ConfigFactory.defaultReference(classOf[ActorSystem].getClassLoader) - ConfigFactory.load(classloader).withFallback(reference) - } + def actorSystemConfig(context: BundleContext): Config = + ConfigFactory.load(classloader).withFallback(ConfigFactory.defaultReference(classOf[ActorSystem].getClassLoader)) /** * Determine the name for the [[akka.actor.ActorSystem]] * Returns a default value of `bundle--ActorSystem` is no name is being specified */ def actorSystemName(name: Option[String]): String = - name.getOrElse("bundle-%s-ActorSystem".format(context.getBundle().getBundleId)) + name.getOrElse("bundle-%s-ActorSystem".format(context.getBundle.getBundleId)) } object OsgiActorSystemFactory { - /* * Create an [[OsgiActorSystemFactory]] instance to set up Akka in an OSGi environment */ def apply(context: BundleContext): OsgiActorSystemFactory = new OsgiActorSystemFactory(context) - } diff --git a/akka-osgi/src/main/scala/akka/osgi/impl/BundleDelegatingClassLoader.scala b/akka-osgi/src/main/scala/akka/osgi/impl/BundleDelegatingClassLoader.scala index 08dee0344e..5d89e234d9 100644 --- a/akka-osgi/src/main/scala/akka/osgi/impl/BundleDelegatingClassLoader.scala +++ b/akka-osgi/src/main/scala/akka/osgi/impl/BundleDelegatingClassLoader.scala @@ -13,7 +13,7 @@ object BundleDelegatingClassLoader { /* * Create a bundle delegating classloader for the bundle context's bundle */ - def createFor(context: BundleContext) = new BundleDelegatingClassLoader(context.getBundle) + def apply(context: BundleContext): BundleDelegatingClassLoader = new BundleDelegatingClassLoader(context.getBundle) } @@ -27,13 +27,11 @@ class BundleDelegatingClassLoader(bundle: Bundle, classLoader: Option[ClassLoade protected override def findClass(name: String): Class[_] = bundle.loadClass(name) - protected override def findResource(name: String): URL = { - val resource = bundle.getResource(name) - classLoader match { - case Some(loader) if resource == null ⇒ loader.getResource(name) - case _ ⇒ resource + protected override def findResource(name: String): URL = + bundle.getResource(name) match { + case null if classLoader.isDefined ⇒ classLoader.get.getResource(name) + case result ⇒ result } - } @SuppressWarnings(Array("unchecked", "rawtypes")) protected override def findResources(name: String): Enumeration[URL] = @@ -41,32 +39,18 @@ class BundleDelegatingClassLoader(bundle: Bundle, classLoader: Option[ClassLoade protected override def loadClass(name: String, resolve: Boolean): Class[_] = { val clazz = try { - findClass(name) + try findClass(name) catch { case _: ClassNotFoundException if classLoader.isDefined ⇒ classLoader.get.loadClass(name) } // First fall back to secondary loader } catch { - case cnfe: ClassNotFoundException ⇒ { - classLoader match { - case Some(loader) ⇒ loadClass(name, loader) - case None ⇒ rethrowClassNotFoundException(name, cnfe) - } - } + case cnfe: ClassNotFoundException ⇒ + throw new ClassNotFoundException("%s from bundle %s (%s)".format(name, bundle.getBundleId, bundle.getSymbolicName), cnfe) // IF we have no secondary 
loader or that failed as well, wrap and rethrow } - if (resolve) { + + if (resolve) resolveClass(clazz) - } + clazz } - private def loadClass(name: String, classLoader: ClassLoader) = - try { - classLoader.loadClass(name) - } catch { - case cnfe: ClassNotFoundException ⇒ rethrowClassNotFoundException(name, cnfe) - } - - private def rethrowClassNotFoundException(name: String, cnfe: ClassNotFoundException): Nothing = - throw new ClassNotFoundException(name + " from bundle " + bundle.getBundleId + " (" + bundle.getSymbolicName + ")", cnfe) - - override def toString: String = String.format("BundleDelegatingClassLoader(%s)", bundle) - + override val toString: String = "BundleDelegatingClassLoader(%s)".format(bundle.getBundleId) } From 05336c35ee5fc183f2dc169716cf26212c75efbb Mon Sep 17 00:00:00 2001 From: Roland Date: Wed, 4 Jul 2012 19:04:22 +0200 Subject: [PATCH 25/39] fix SupervisorSpec race (within the test code) --- .../src/test/scala/akka/actor/SupervisorSpec.scala | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/akka-actor-tests/src/test/scala/akka/actor/SupervisorSpec.scala b/akka-actor-tests/src/test/scala/akka/actor/SupervisorSpec.scala index 3db5b5b5dc..2f4863711f 100644 --- a/akka-actor-tests/src/test/scala/akka/actor/SupervisorSpec.scala +++ b/akka-actor-tests/src/test/scala/akka/actor/SupervisorSpec.scala @@ -374,8 +374,8 @@ class SupervisorSpec extends AkkaSpec with BeforeAndAfterEach with ImplicitSende val child = context.watch(context.actorOf(Props(new Actor { override def postRestart(reason: Throwable): Unit = testActor ! "child restarted" def receive = { - case "die" ⇒ throw new IllegalStateException("OHNOES") - case "test" ⇒ sender ! "child green" + case l: TestLatch ⇒ Await.ready(l, 5 seconds); throw new IllegalStateException("OHNOES") + case "test" ⇒ sender ! "child green" } }), "child")) @@ -383,14 +383,18 @@ class SupervisorSpec extends AkkaSpec with BeforeAndAfterEach with ImplicitSende def receive = { case t @ Terminated(`child`) ⇒ testActor ! "child terminated" - case "die" ⇒ child ! "die" + case l: TestLatch ⇒ child ! l case "test" ⇒ sender ! "green" case "testchild" ⇒ child forward "test" } })) - parent ! "die" + val latch = TestLatch() + parent ! latch parent ! "testchild" + EventFilter[IllegalStateException]("OHNOES", occurrences = 2) intercept { + latch.countDown() + } expectMsg("parent restarted") expectMsg("child terminated") parent ! "test" From e1c085161d5504b8219749ef9e07c6d75030cd2c Mon Sep 17 00:00:00 2001 From: Viktor Klang Date: Wed, 4 Jul 2012 19:17:01 +0200 Subject: [PATCH 26/39] Cleaning up SupervisorMiscSpec --- .../scala/akka/actor/SupervisorMiscSpec.scala | 24 +++++++------------ 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/akka-actor-tests/src/test/scala/akka/actor/SupervisorMiscSpec.scala b/akka-actor-tests/src/test/scala/akka/actor/SupervisorMiscSpec.scala index 197e749d2e..ec0c51e9ae 100644 --- a/akka-actor-tests/src/test/scala/akka/actor/SupervisorMiscSpec.scala +++ b/akka-actor-tests/src/test/scala/akka/actor/SupervisorMiscSpec.scala @@ -55,19 +55,19 @@ class SupervisorMiscSpec extends AkkaSpec(SupervisorMiscSpec.config) with Defaul actor4 ! Kill countDownLatch.await(10, TimeUnit.SECONDS) - assert(Await.result(actor1 ? "status", timeout.duration) == "OK", "actor1 is shutdown") - assert(Await.result(actor2 ? "status", timeout.duration) == "OK", "actor2 is shutdown") - assert(Await.result(actor3 ? "status", timeout.duration) == "OK", "actor3 is shutdown") - assert(Await.result(actor4 ? 
"status", timeout.duration) == "OK", "actor4 is shutdown") + + Seq("actor1" -> actor1, "actor2" -> actor2, "actor3" -> actor3, "actor4" -> actor4) map { + case (id, ref) ⇒ (id, ref ? "status") + } foreach { + case (id, f) ⇒ (id, Await.result(f, timeout.duration)) must be === ((id, "OK")) + } } } "be able to create named children in its constructor" in { val a = system.actorOf(Props(new Actor { context.actorOf(Props.empty, "bob") - def receive = { - case x: Exception ⇒ throw x - } + def receive = { case x: Exception ⇒ throw x } override def preStart(): Unit = testActor ! "preStart" })) val m = "weird message" @@ -123,20 +123,14 @@ class SupervisorMiscSpec extends AkkaSpec(SupervisorMiscSpec.config) with Defaul "be able to create a similar kid in the fault handling strategy" in { val parent = system.actorOf(Props(new Actor { - override val supervisorStrategy = new OneForOneStrategy()(SupervisorStrategy.defaultStrategy.decider) { override def handleChildTerminated(context: ActorContext, child: ActorRef, children: Iterable[ActorRef]): Unit = { val newKid = context.actorOf(Props.empty, child.path.name) - testActor ! { - if ((newKid ne child) && newKid.path == child.path) "green" - else "red" - } + testActor ! { if ((newKid ne child) && newKid.path == child.path) "green" else "red" } } } - def receive = { - case "engage" ⇒ context.stop(context.actorOf(Props.empty, "Robert")) - } + def receive = { case "engage" ⇒ context.stop(context.actorOf(Props.empty, "Robert")) } })) parent ! "engage" expectMsg("green") From 6aa5f93f6ed6ea98b10288cbf5a08384baf1e146 Mon Sep 17 00:00:00 2001 From: Patrik Nordwall Date: Thu, 5 Jul 2012 07:56:40 +0200 Subject: [PATCH 27/39] Make Cluster ready for use before constructor returns, #2311 --- .../src/main/scala/akka/cluster/Cluster.scala | 85 +++++++++---------- 1 file changed, 40 insertions(+), 45 deletions(-) diff --git a/akka-cluster/src/main/scala/akka/cluster/Cluster.scala b/akka-cluster/src/main/scala/akka/cluster/Cluster.scala index 0bf7ebeba1..43cb115b4d 100644 --- a/akka-cluster/src/main/scala/akka/cluster/Cluster.scala +++ b/akka-cluster/src/main/scala/akka/cluster/Cluster.scala @@ -57,12 +57,6 @@ sealed trait ClusterMessage extends Serializable */ object ClusterUserAction { - /** - * Command to initiate join another node (represented by 'address'). - * Join will be sent to the other node. - */ - case class JoinTo(address: Address) extends ClusterMessage - /** * Command to join the cluster. Sent when a node (represented by 'address') * wants to join another node (the receiver). @@ -84,7 +78,13 @@ object ClusterUserAction { /** * INTERNAL API */ -object InternalClusterAction { +private[cluster] object InternalClusterAction { + + /** + * Command to initiate join another node (represented by 'address'). + * Join will be sent to the other node. + */ + case class JoinTo(address: Address) extends ClusterMessage /** * Start message of the process to join one of the seed nodes. @@ -134,24 +134,22 @@ object InternalClusterAction { } /** + * INTERNAL API. + * * Cluster commands sent by the LEADER. */ -object ClusterLeaderAction { +private[cluster] object ClusterLeaderAction { /** - * INTERNAL API. - * * Command to mark a node to be removed from the cluster immediately. * Can only be sent by the leader. */ - private[cluster] case class Exit(address: Address) extends ClusterMessage + case class Exit(address: Address) extends ClusterMessage /** - * INTERNAL API. - * * Command to remove a node from the cluster immediately. 
*/ - private[cluster] case class Remove(address: Address) extends ClusterMessage + case class Remove(address: Address) extends ClusterMessage } /** @@ -679,25 +677,25 @@ private[cluster] final class ClusterCore(cluster: Cluster) extends Actor with Ac } def receive = { - case JoinSeedNode ⇒ joinSeedNode() - case InitJoin ⇒ initJoin() - case InitJoinAck(address) ⇒ join(address) - case Failure(e: AskTimeoutException) ⇒ joinSeedNodeTimeout() - case ClusterUserAction.JoinTo(address) ⇒ join(address) - case ClusterUserAction.Join(address) ⇒ joining(address) - case ClusterUserAction.Down(address) ⇒ downing(address) - case ClusterUserAction.Leave(address) ⇒ leaving(address) - case Exit(address) ⇒ exiting(address) - case Remove(address) ⇒ removing(address) - case msg: GossipEnvelope ⇒ receiveGossip(msg) - case msg: GossipMergeConflict ⇒ receiveGossipMerge(msg) - case GossipTick ⇒ gossip() - case HeartbeatTick ⇒ heartbeat() - case ReapUnreachableTick ⇒ reapUnreachableMembers() - case LeaderActionsTick ⇒ leaderActions() - case SendGossipTo(address) ⇒ gossipTo(address) - case PublishStateTick ⇒ publishState() - case p: Ping ⇒ ping(p) + case JoinSeedNode ⇒ joinSeedNode() + case InitJoin ⇒ initJoin() + case InitJoinAck(address) ⇒ join(address) + case Failure(e: AskTimeoutException) ⇒ joinSeedNodeTimeout() + case JoinTo(address) ⇒ join(address) + case ClusterUserAction.Join(address) ⇒ joining(address) + case ClusterUserAction.Down(address) ⇒ downing(address) + case ClusterUserAction.Leave(address) ⇒ leaving(address) + case Exit(address) ⇒ exiting(address) + case Remove(address) ⇒ removing(address) + case msg: GossipEnvelope ⇒ receiveGossip(msg) + case msg: GossipMergeConflict ⇒ receiveGossipMerge(msg) + case GossipTick ⇒ gossip() + case HeartbeatTick ⇒ heartbeat() + case ReapUnreachableTick ⇒ reapUnreachableMembers() + case LeaderActionsTick ⇒ leaderActions() + case SendGossipTo(address) ⇒ gossipTo(address) + case PublishStateTick ⇒ publishState() + case p: Ping ⇒ ping(p) } @@ -1341,10 +1339,10 @@ private[cluster] final class ClusterDaemonSupervisor(cluster: Cluster) extends A val heartbeat = context.actorOf(Props(new ClusterHeartbeatDaemon(cluster)). withDispatcher(configuredDispatcher), name = "heartbeat") - def receive = Actor.emptyBehavior + def receive = { + case InternalClusterAction.GetClusterCoreRef ⇒ sender ! core + } - override def unhandled(unknown: Any): Unit = log.error("[{}] can not respond to messages - received [{}]", - self.path, unknown) } /** @@ -1474,20 +1472,17 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) // create supervisor for daemons under path "/system/cluster" private val clusterDaemons: ActorRef = { - implicit val timeout = Timeout(remoteSettings.RemoteSystemDaemonAckTimeout) - val createChild = CreateChild(Props(new ClusterDaemonSupervisor(this)). + system.asInstanceOf[ActorSystemImpl].systemActorOf(Props(new ClusterDaemonSupervisor(this)). withDispatcher(UseDispatcher), name = "cluster") - Await.result(system.systemGuardian ? createChild, timeout.duration) match { - case a: ActorRef ⇒ a - case e: Exception ⇒ throw e - } } /** * INTERNAL API */ - private[cluster] def clusterCore: ActorRef = - system.actorFor(clusterDaemons.path / "core") + private[cluster] val clusterCore: ActorRef = { + implicit val timeout = system.settings.CreationTimeout + Await.result((clusterDaemons ? 
InternalClusterAction.GetClusterCoreRef).mapTo[ActorRef], timeout.duration) + } system.registerOnTermination(shutdown()) @@ -1584,7 +1579,7 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) * A 'Join(thisNodeAddress)' command is sent to the node to join. */ def join(address: Address): Unit = - clusterCore ! ClusterUserAction.JoinTo(address) + clusterCore ! InternalClusterAction.JoinTo(address) /** * Send command to issue state transition to LEAVING for the node specified by 'address'. From 37826533d3a7466141df0c58a855ad386ad6f237 Mon Sep 17 00:00:00 2001 From: Patrik Nordwall Date: Thu, 5 Jul 2012 09:50:58 +0200 Subject: [PATCH 28/39] Fix race in UnreachableNodeRejoinsClusterSpec --- .../main/scala/akka/cluster/AccrualFailureDetector.scala | 5 +++-- akka-cluster/src/main/scala/akka/cluster/Cluster.scala | 9 +++++---- .../akka/cluster/UnreachableNodeRejoinsClusterSpec.scala | 5 +++++ 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/akka-cluster/src/main/scala/akka/cluster/AccrualFailureDetector.scala b/akka-cluster/src/main/scala/akka/cluster/AccrualFailureDetector.scala index f1c761dec7..76c9595a59 100644 --- a/akka-cluster/src/main/scala/akka/cluster/AccrualFailureDetector.scala +++ b/akka-cluster/src/main/scala/akka/cluster/AccrualFailureDetector.scala @@ -167,8 +167,9 @@ class AccrualFailureDetector( val φ = phi(timeDiff, mean + acceptableHeartbeatPauseMillis, stdDeviation) // FIXME change to debug log level, when failure detector is stable - if (φ > 1.0) log.info("Phi value [{}] for connection [{}], after [{} ms], based on [{}]", - φ, connection, timeDiff, "N(" + mean + ", " + stdDeviation + ")") + if (φ > 1.0 && timeDiff < (acceptableHeartbeatPauseMillis + 5000)) + log.info("Phi value [{}] for connection [{}], after [{} ms], based on [{}]", + φ, connection, timeDiff, "N(" + mean + ", " + stdDeviation + ")") φ } diff --git a/akka-cluster/src/main/scala/akka/cluster/Cluster.scala b/akka-cluster/src/main/scala/akka/cluster/Cluster.scala index 43cb115b4d..86caf231c3 100644 --- a/akka-cluster/src/main/scala/akka/cluster/Cluster.scala +++ b/akka-cluster/src/main/scala/akka/cluster/Cluster.scala @@ -569,7 +569,6 @@ private[cluster] final class ClusterHeartbeatSenderWorker( def receive = { case SendHeartbeat(heartbeatMsg, _, deadline) ⇒ - log.debug("Cluster Node [{}] - Heartbeat to [{}]", heartbeatMsg.from, toRef) if (!deadline.isOverdue) { // the CircuitBreaker will measure elapsed time and open if too many long calls try breaker.withSyncCircuitBreaker { @@ -977,8 +976,8 @@ private[cluster] final class ClusterCore(cluster: Cluster) extends Actor with Ac joinInProgress = newJoinInProgress // for all new joining nodes we remove them from the failure detector - (latestGossip.members -- localGossip.members).filter(_.status == Joining).foreach { - case node ⇒ cluster.failureDetector.remove(node.address) + (latestGossip.members -- localGossip.members).filter(_.status == Joining).foreach { node ⇒ + cluster.failureDetector.remove(node.address) } log.debug("Cluster Node [{}] - Receiving gossip from [{}]", selfAddress, from) @@ -1250,7 +1249,9 @@ private[cluster] final class ClusterCore(cluster: Cluster) extends Actor with Ac val localMembers = localGossip.members val localUnreachableMembers = localGossip.overview.unreachable - val newlyDetectedUnreachableMembers = localMembers filterNot { member ⇒ cluster.failureDetector.isAvailable(member.address) } + val newlyDetectedUnreachableMembers = localMembers filterNot { member ⇒ + member.address == 
selfAddress || cluster.failureDetector.isAvailable(member.address) + } if (newlyDetectedUnreachableMembers.nonEmpty) { diff --git a/akka-cluster/src/multi-jvm/scala/akka/cluster/UnreachableNodeRejoinsClusterSpec.scala b/akka-cluster/src/multi-jvm/scala/akka/cluster/UnreachableNodeRejoinsClusterSpec.scala index 14f48bfbab..65a36080ff 100644 --- a/akka-cluster/src/multi-jvm/scala/akka/cluster/UnreachableNodeRejoinsClusterSpec.scala +++ b/akka-cluster/src/multi-jvm/scala/akka/cluster/UnreachableNodeRejoinsClusterSpec.scala @@ -58,6 +58,11 @@ abstract class UnreachableNodeRejoinsClusterSpec } "mark a node as UNREACHABLE when we pull the network" taggedAs LongRunningTest in { + // let them send at least one heartbeat to each other after the gossip convergence + // because for new joining nodes we remove them from the failure detector when + // receive gossip + 2.seconds.dilated.sleep + runOn(first) { // pull network for victim node from all nodes allBut(victim).foreach { roleName ⇒ From 99c4b30884203dd504e6418c9637d4823a52948f Mon Sep 17 00:00:00 2001 From: Roland Date: Thu, 5 Jul 2012 10:29:34 +0200 Subject: [PATCH 29/39] fix JavaTestKit docs (one warning too many) --- akka-docs/java/testing.rst | 2 +- akka-docs/scala/testing.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/akka-docs/java/testing.rst b/akka-docs/java/testing.rst index bab663b355..6a3b10d91c 100644 --- a/akka-docs/java/testing.rst +++ b/akka-docs/java/testing.rst @@ -109,7 +109,7 @@ One more special aspect which is overridden for single-threaded tests is the :meth:`receiveTimeout`, as including that would entail asynchronous queuing of :obj:`ReceiveTimeout` messages, violating the synchronous contract. -.. warning:: +.. note:: To summarize: :class:`TestActorRef` overwrites two fields: it sets the dispatcher to :obj:`CallingThreadDispatcher.global` and it sets the diff --git a/akka-docs/scala/testing.rst b/akka-docs/scala/testing.rst index ba05207975..8a05280580 100644 --- a/akka-docs/scala/testing.rst +++ b/akka-docs/scala/testing.rst @@ -119,7 +119,7 @@ One more special aspect which is overridden for single-threaded tests is the :meth:`receiveTimeout`, as including that would entail asynchronous queuing of :obj:`ReceiveTimeout` messages, violating the synchronous contract. -.. warning:: +.. 
note:: To summarize: :class:`TestActorRef` overwrites two fields: it sets the dispatcher to :obj:`CallingThreadDispatcher.global` and it sets the From ce9f530c325ed0bbc51f9f7649859cd2e743a2cf Mon Sep 17 00:00:00 2001 From: Patrik Nordwall Date: Thu, 5 Jul 2012 10:53:18 +0200 Subject: [PATCH 30/39] Fix bug in convergence, see #2317 --- akka-cluster/src/main/scala/akka/cluster/Cluster.scala | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/akka-cluster/src/main/scala/akka/cluster/Cluster.scala b/akka-cluster/src/main/scala/akka/cluster/Cluster.scala index 86caf231c3..3b292b0d91 100644 --- a/akka-cluster/src/main/scala/akka/cluster/Cluster.scala +++ b/akka-cluster/src/main/scala/akka/cluster/Cluster.scala @@ -419,9 +419,7 @@ case class Gossip( val hasUnreachable = unreachable.nonEmpty && unreachable.exists { _.status != Down } val allMembersInSeen = members.forall(m ⇒ seen.contains(m.address)) - if (hasUnreachable) false - else if (!allMembersInSeen) true - else seen.values.toSet.size == 1 + !hasUnreachable && allMembersInSeen && (seen.values.toSet.size == 1) } def isLeader(address: Address): Boolean = From d0febd7c9ad72adc5d55dc7f5c5dc99bf44d7f6e Mon Sep 17 00:00:00 2001 From: Viktor Klang Date: Thu, 5 Jul 2012 11:31:59 +0200 Subject: [PATCH 31/39] Cleaning & restructuring OSGi module --- .../BlueprintActorSystemFactory.scala | 21 ++-- .../aries/blueprint/NamespaceHandler.scala | 112 ++++++++---------- .../osgi/aries/blueprint/ParserHelper.scala | 17 --- .../akka/osgi/ActorSystemActivator.scala | 3 + 4 files changed, 57 insertions(+), 96 deletions(-) delete mode 100644 akka-osgi-aries/src/main/scala/akka/osgi/aries/blueprint/ParserHelper.scala diff --git a/akka-osgi-aries/src/main/scala/akka/osgi/aries/blueprint/BlueprintActorSystemFactory.scala b/akka-osgi-aries/src/main/scala/akka/osgi/aries/blueprint/BlueprintActorSystemFactory.scala index 40c9d7367b..f45f29b82a 100644 --- a/akka-osgi-aries/src/main/scala/akka/osgi/aries/blueprint/BlueprintActorSystemFactory.scala +++ b/akka-osgi-aries/src/main/scala/akka/osgi/aries/blueprint/BlueprintActorSystemFactory.scala @@ -2,7 +2,8 @@ package akka.osgi.aries.blueprint import org.osgi.framework.BundleContext import akka.osgi.OsgiActorSystemFactory -import com.typesafe.config.ConfigFactory +import akka.actor.ActorSystem +import com.typesafe.config.{ Config, ConfigFactory } /** * A set of helper/factory classes to build a Akka system using Blueprint. This class is only meant to be used by @@ -15,30 +16,22 @@ class BlueprintActorSystemFactory(context: BundleContext, name: String) extends var config: Option[String] = None - lazy val system = super.createActorSystem(stringToOption(name)) + lazy val system: ActorSystem = super.createActorSystem(if (name == null || name.isEmpty) None else Some(name)) - def setConfig(config: String) = { this.config = Some(config) } + def setConfig(config: String): Unit = this.config = Some(config) - def create = system + def create(): ActorSystem = system - def destroy = system.shutdown() - - def stringToOption(original: String) = if (original == null || original.isEmpty) { - None - } else { - Some(original) - } + def destroy(): Unit = system.shutdown() /** * Strategy method to create the Config for the ActorSystem, ensuring that the default/reference configuration is * loaded from the akka-actor bundle. 
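 *
 * The resulting layering is a chain of fallbacks, roughly (a sketch, with
 * `blueprintConfigString` standing for the config text supplied in the XML):
 * {{{
 * ConfigFactory.parseString(blueprintConfigString)
 *   .withFallback(ConfigFactory.load(classloader))
 *   .withFallback(ConfigFactory.defaultReference(classOf[ActorSystem].getClassLoader))
 * }}}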
*/ - override def actorSystemConfig(context: BundleContext) = { + override def actorSystemConfig(context: BundleContext): Config = config match { case Some(value) ⇒ ConfigFactory.parseString(value).withFallback(super.actorSystemConfig(context)) case None ⇒ super.actorSystemConfig(context) } - - } } diff --git a/akka-osgi-aries/src/main/scala/akka/osgi/aries/blueprint/NamespaceHandler.scala b/akka-osgi-aries/src/main/scala/akka/osgi/aries/blueprint/NamespaceHandler.scala index 0570a027b6..123e92fa82 100644 --- a/akka-osgi-aries/src/main/scala/akka/osgi/aries/blueprint/NamespaceHandler.scala +++ b/akka-osgi-aries/src/main/scala/akka/osgi/aries/blueprint/NamespaceHandler.scala @@ -1,18 +1,30 @@ package akka.osgi.aries.blueprint -import org.apache.aries.blueprint.ParserContext -import org.osgi.service.blueprint.container.ComponentDefinitionException -import org.apache.aries.blueprint.mutable.MutableBeanMetadata - -import collection.JavaConversions.setAsJavaSet import org.osgi.framework.BundleContext +import org.osgi.service.blueprint.container.ComponentDefinitionException +import org.osgi.service.blueprint.reflect.{ BeanMetadata, ComponentMetadata } +import org.apache.aries.blueprint.ParserContext +import org.apache.aries.blueprint.mutable.MutableBeanMetadata import org.apache.aries.blueprint.reflect.{ ValueMetadataImpl, RefMetadataImpl, BeanArgumentImpl } import org.w3c.dom.{ Element, Node } -import org.osgi.service.blueprint.reflect.{ BeanMetadata, ComponentMetadata } -import akka.actor.ActorSystem import java.util.concurrent.atomic.AtomicInteger -import ParserHelper.childElements +import akka.actor.ActorSystem +import scala.annotation.tailrec +import java.net.URL + +object NamespaceHandler { + private val ID_ATTRIBUTE = "id" + private val NAME_ATTRIBUTE = "name" + + private val BUNDLE_CONTEXT_REFID = "blueprintBundleContext" + + private val ACTORSYSTEM_ELEMENT_NAME = "actor-system" + private val CONFIG_ELEMENT_NAME = "config" + + private val DESTROY_METHOD_NAME = "destroy" + private val FACTORY_METHOD_NAME = "create" +} /** * Aries Blueprint namespace handler implementation. 
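 * A rough plain-Scala equivalent of what the parsed XML produces (an illustration
 * using [[akka.osgi.aries.blueprint.BlueprintActorSystemFactory]]; `bundleContext`
 * is assumed to be in scope):
 * {{{
 * val factory = new BlueprintActorSystemFactory(bundleContext, "BlueprintSystem")
 * factory.setConfig("some.config { key = value }")
 * val system = factory.create()
 * }}}
 *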
This namespace handler will allow users of Apache Aries' Blueprint @@ -40,18 +52,17 @@ class NamespaceHandler extends org.apache.aries.blueprint.NamespaceHandler { import NamespaceHandler._ - val idCounter = new AtomicInteger(0) + protected val idCounter = new AtomicInteger(0) - def getSchemaLocation(namespace: String) = getClass().getResource("akka.xsd") + override def getSchemaLocation(namespace: String): URL = getClass().getResource("akka.xsd") - def getManagedClasses = setAsJavaSet(Set(classOf[BlueprintActorSystemFactory])) + override def getManagedClasses = java.util.Collections.singleton(classOf[BlueprintActorSystemFactory]) - def parse(element: Element, context: ParserContext) = element.getLocalName match { - case ACTORSYSTEM_ELEMENT_NAME ⇒ parseActorSystem(element, context) - case _ ⇒ throw new ComponentDefinitionException("Unexpected element for Akka namespace: %s".format(element)) - } + override def parse(element: Element, context: ParserContext) = + if (element.getLocalName == ACTORSYSTEM_ELEMENT_NAME) parseActorSystem(element, context) + else throw new ComponentDefinitionException("Unexpected element for Akka namespace: %s".format(element)) - def decorate(node: Node, component: ComponentMetadata, context: ParserContext) = + override def decorate(node: Node, component: ComponentMetadata, context: ParserContext) = throw new ComponentDefinitionException("Bad xml syntax: node decoration is not supported") /* @@ -60,11 +71,12 @@ class NamespaceHandler extends org.apache.aries.blueprint.NamespaceHandler { def parseActorSystem(element: Element, context: ParserContext) = { val factory = createFactoryBean(context, element.getAttribute(NAME_ATTRIBUTE)) - for (child ← childElements(element)) { - child.getLocalName match { - case CONFIG_ELEMENT_NAME ⇒ parseConfig(child, context, factory) - case _ ⇒ throw new ComponentDefinitionException("Unexpected child element %s found in %s".format(child, element)) - } + val nodelist = element.getChildNodes + (0 until nodelist.getLength) collect { + case idx if nodelist.item(idx).getNodeType == Node.ELEMENT_NODE ⇒ nodelist.item(idx).asInstanceOf[Element] + } foreach { + case child if child.getLocalName == CONFIG_ELEMENT_NAME ⇒ parseConfig(child, context, factory) + case child ⇒ throw new ComponentDefinitionException("Unexpected child element %s found in %s".format(child, element)) } createActorSystemBean(context, element, factory) @@ -73,16 +85,23 @@ class NamespaceHandler extends org.apache.aries.blueprint.NamespaceHandler { /* * Parse */ - def parseConfig(node: Element, context: ParserContext, factory: MutableBeanMetadata) = { - factory.addProperty("config", new ValueMetadataImpl(node.getTextContent)) - } + protected def parseConfig(node: Element, context: ParserContext, factory: MutableBeanMetadata) = + factory.addProperty(CONFIG_ELEMENT_NAME, new ValueMetadataImpl(node.getTextContent)) + + @tailrec protected final def findAvailableId(context: ParserContext): String = + ".akka-" + idCounter.incrementAndGet() match { + case id if context.getComponentDefinitionRegistry.containsComponentDefinition(id) ⇒ findAvailableId(context) + case available ⇒ available + } /* * Create the bean definition for the ActorSystem */ - def createActorSystemBean(context: ParserContext, element: Element, factory: MutableBeanMetadata): MutableBeanMetadata = { + protected def createActorSystemBean(context: ParserContext, element: Element, factory: MutableBeanMetadata): MutableBeanMetadata = { val system = context.createMetadata(classOf[MutableBeanMetadata]) - 
system.setId(getId(context, element)) + val id = if (element.hasAttribute(ID_ATTRIBUTE)) element.getAttribute(ID_ATTRIBUTE) else findAvailableId(context) + + system.setId(id) system.setFactoryComponent(factory) system.setFactoryMethod(FACTORY_METHOD_NAME) @@ -93,7 +112,7 @@ class NamespaceHandler extends org.apache.aries.blueprint.NamespaceHandler { /* * Create the bean definition for the BlueprintActorSystemFactory */ - def createFactoryBean(context: ParserContext, name: String): MutableBeanMetadata = { + protected def createFactoryBean(context: ParserContext, name: String): MutableBeanMetadata = { val factory = context.createMetadata(classOf[MutableBeanMetadata]) factory.setId(findAvailableId(context)) factory.setScope(BeanMetadata.SCOPE_SINGLETON) @@ -108,41 +127,4 @@ class NamespaceHandler extends org.apache.aries.blueprint.NamespaceHandler { context.getComponentDefinitionRegistry.registerComponentDefinition(factory) factory } - - /* - * Get the assigned id or generate a suitable id - */ - def getId(context: ParserContext, element: Element) = { - if (element.hasAttribute(ID_ATTRIBUTE)) { - element.getAttribute(ID_ATTRIBUTE) - } else { - findAvailableId(context) - } - } - - /* - * Find the next available component id - */ - def findAvailableId(context: ParserContext): String = { - val id = ".akka-" + idCounter.incrementAndGet() - if (context.getComponentDefinitionRegistry.containsComponentDefinition(id)) { - // id already exists, let's try the next one - findAvailableId(context) - } else id - } -} - -object NamespaceHandler { - - private val ID_ATTRIBUTE = "id" - private val NAME_ATTRIBUTE = "name" - - private val BUNDLE_CONTEXT_REFID = "blueprintBundleContext" - - private val ACTORSYSTEM_ELEMENT_NAME = "actor-system" - private val CONFIG_ELEMENT_NAME = "config" - - private val DESTROY_METHOD_NAME = "destroy" - private val FACTORY_METHOD_NAME = "create" - -} +} \ No newline at end of file diff --git a/akka-osgi-aries/src/main/scala/akka/osgi/aries/blueprint/ParserHelper.scala b/akka-osgi-aries/src/main/scala/akka/osgi/aries/blueprint/ParserHelper.scala deleted file mode 100644 index 585037db09..0000000000 --- a/akka-osgi-aries/src/main/scala/akka/osgi/aries/blueprint/ParserHelper.scala +++ /dev/null @@ -1,17 +0,0 @@ -package akka.osgi.aries.blueprint - -import org.w3c.dom.{ Node, Element } - -/** - * Helper class to deal with the W3C DOM types - */ -object ParserHelper { - - def childElements(element: Element): Seq[Element] = - children(element).filter(_.getNodeType == Node.ELEMENT_NODE).asInstanceOf[Seq[Element]] - - private[this] def children(element: Element): Seq[Node] = { - val nodelist = element.getChildNodes - for (index ← 0 until nodelist.getLength) yield nodelist.item(index) - } -} diff --git a/akka-osgi/src/main/scala/akka/osgi/ActorSystemActivator.scala b/akka-osgi/src/main/scala/akka/osgi/ActorSystemActivator.scala index e279247dbc..fcd7d9aa9b 100644 --- a/akka-osgi/src/main/scala/akka/osgi/ActorSystemActivator.scala +++ b/akka-osgi/src/main/scala/akka/osgi/ActorSystemActivator.scala @@ -52,10 +52,13 @@ abstract class ActorSystemActivator extends BundleActivator { * Register the actor system in the OSGi service registry. The activator itself will ensure that this service * is unregistered again when the bundle is being stopped. * + * Only one ActorSystem can be registered at a time, so any previous registration will be unregistered prior to registering the new. 
+ * * @param context the bundle context * @param system the actor system */ def registerService(context: BundleContext, system: ActorSystem): Unit = { + registration.foreach(_.unregister()) //Cleanup val properties = new Properties() properties.put("name", system.name) registration = Some(context.registerService(classOf[ActorSystem].getName, system, From 17f0ce9f89b15a42355d902b610d55bd6988d132 Mon Sep 17 00:00:00 2001 From: Patrik Nordwall Date: Thu, 5 Jul 2012 11:56:54 +0200 Subject: [PATCH 32/39] Add back Cluster JMX, see 2311 * Separate class * Simple test --- .../src/main/scala/akka/cluster/Cluster.scala | 9 +- .../main/scala/akka/cluster/ClusterJmx.scala | 107 ++++++++++++++++++ .../test/scala/akka/cluster/ClusterSpec.scala | 9 ++ 3 files changed, 121 insertions(+), 4 deletions(-) create mode 100644 akka-cluster/src/main/scala/akka/cluster/ClusterJmx.scala diff --git a/akka-cluster/src/main/scala/akka/cluster/Cluster.scala b/akka-cluster/src/main/scala/akka/cluster/Cluster.scala index 3b292b0d91..da0d7483a8 100644 --- a/akka-cluster/src/main/scala/akka/cluster/Cluster.scala +++ b/akka-cluster/src/main/scala/akka/cluster/Cluster.scala @@ -19,11 +19,9 @@ import akka.util.duration._ import akka.util.internal.HashedWheelTimer import com.google.protobuf.ByteString import java.io.Closeable -import java.lang.management.ManagementFactory import java.util.concurrent.atomic.{ AtomicReference, AtomicBoolean } import java.util.concurrent.TimeoutException import java.util.concurrent.TimeUnit._ -import javax.management._ import MemberStatus._ import scala.annotation.tailrec import scala.collection.immutable.{ Map, SortedSet } @@ -1485,6 +1483,9 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) system.registerOnTermination(shutdown()) + private val clusterJmx = new ClusterJmx(this, log) + clusterJmx.createMBean() + log.info("Cluster Node [{}] - has started up successfully", selfAddress) // ====================================================== @@ -1615,6 +1616,8 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) clusterScheduler.close() + clusterJmx.unregisterMBean() + log.info("Cluster Node [{}] - Cluster node successfully shut down", selfAddress) } } @@ -1632,6 +1635,4 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) */ private[cluster] def latestStats: ClusterStats = _latestStats - // FIXME add back JMX - } diff --git a/akka-cluster/src/main/scala/akka/cluster/ClusterJmx.scala b/akka-cluster/src/main/scala/akka/cluster/ClusterJmx.scala new file mode 100644 index 0000000000..944d90079b --- /dev/null +++ b/akka-cluster/src/main/scala/akka/cluster/ClusterJmx.scala @@ -0,0 +1,107 @@ +/** + * Copyright (C) 2009-2012 Typesafe Inc. + */ + +package akka.cluster + +import java.lang.management.ManagementFactory +import javax.management.StandardMBean +import akka.event.LoggingAdapter +import akka.actor.AddressFromURIString +import javax.management.ObjectName +import javax.management.InstanceAlreadyExistsException +import javax.management.InstanceNotFoundException + +/** + * Interface for the cluster JMX MBean. 
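+ *
+ * The MBean is registered under the JMX name `akka:type=Cluster` (see `ClusterJmx`
+ * below), so it can be browsed with JConsole or read programmatically, e.g.
+ * (illustration only; attribute names follow the bean-style getters):
+ * {{{
+ * val server = ManagementFactory.getPlatformMBeanServer
+ * val status = server.getAttribute(new ObjectName("akka:type=Cluster"), "ClusterStatus")
+ * }}}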
+ */ +trait ClusterNodeMBean { + def getMemberStatus: String + def getClusterStatus: String + def getLeader: String + + def isSingleton: Boolean + def isConvergence: Boolean + def isAvailable: Boolean + def isRunning: Boolean + + def join(address: String) + def leave(address: String) + def down(address: String) +} + +/** + * Internal API + */ +private[akka] class ClusterJmx(clusterNode: Cluster, log: LoggingAdapter) { + + private val mBeanServer = ManagementFactory.getPlatformMBeanServer + private val clusterMBeanName = new ObjectName("akka:type=Cluster") + + /** + * Creates the cluster JMX MBean and registers it in the MBean server. + */ + def createMBean() = { + val mbean = new StandardMBean(classOf[ClusterNodeMBean]) with ClusterNodeMBean { + + // JMX attributes (bean-style) + + /* + * Sends a string to the JMX client that will list all nodes in the node ring as follows: + * {{{ + * Members: + * Member(address = akka://system0@localhost:5550, status = Up) + * Member(address = akka://system1@localhost:5551, status = Up) + * Unreachable: + * Member(address = akka://system2@localhost:5553, status = Down) + * }}} + */ + def getClusterStatus: String = { + val gossip = clusterNode.latestGossip + val unreachable = gossip.overview.unreachable + val metaData = gossip.meta + "\nMembers:\n\t" + gossip.members.mkString("\n\t") + + { if (unreachable.nonEmpty) "\nUnreachable:\n\t" + unreachable.mkString("\n\t") else "" } + + { if (metaData.nonEmpty) "\nMeta Data:\t" + metaData.toString else "" } + } + + def getMemberStatus: String = clusterNode.status.toString + + def getLeader: String = clusterNode.leader.toString + + def isSingleton: Boolean = clusterNode.isSingletonCluster + + def isConvergence: Boolean = clusterNode.convergence.isDefined + + def isAvailable: Boolean = clusterNode.isAvailable + + def isRunning: Boolean = clusterNode.isRunning + + // JMX commands + + def join(address: String) = clusterNode.join(AddressFromURIString(address)) + + def leave(address: String) = clusterNode.leave(AddressFromURIString(address)) + + def down(address: String) = clusterNode.down(AddressFromURIString(address)) + } + try { + mBeanServer.registerMBean(mbean, clusterMBeanName) + log.info("Cluster Node [{}] - registered cluster JMX MBean [{}]", clusterNode.selfAddress, clusterMBeanName) + } catch { + case e: InstanceAlreadyExistsException ⇒ // ignore - we are running multiple cluster nodes in the same JVM (probably for testing) + } + } + + /** + * Unregisters the cluster JMX MBean from MBean server. 
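+ * Any [[javax.management.InstanceNotFoundException]] is deliberately swallowed, since
+ * several cluster nodes may run in the same JVM (e.g. in tests) and only one of them
+ * has the MBean registered.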
+ */ + def unregisterMBean(): Unit = { + try { + mBeanServer.unregisterMBean(clusterMBeanName) + } catch { + case e: InstanceNotFoundException ⇒ // ignore - we are running multiple cluster nodes in the same JVM (probably for testing) + } + } + +} diff --git a/akka-cluster/src/test/scala/akka/cluster/ClusterSpec.scala b/akka-cluster/src/test/scala/akka/cluster/ClusterSpec.scala index 5812586a3f..f660af3763 100644 --- a/akka-cluster/src/test/scala/akka/cluster/ClusterSpec.scala +++ b/akka-cluster/src/test/scala/akka/cluster/ClusterSpec.scala @@ -13,6 +13,8 @@ import akka.actor.Address import java.util.concurrent.atomic.AtomicInteger import akka.remote.RemoteActorRefProvider import InternalClusterAction._ +import java.lang.management.ManagementFactory +import javax.management.ObjectName object ClusterSpec { val config = """ @@ -57,6 +59,13 @@ class ClusterSpec extends AkkaSpec(ClusterSpec.config) with ImplicitSender { cluster.selfAddress must be(selfAddress) } + "register jmx mbean" in { + val name = new ObjectName("akka:type=Cluster") + val info = ManagementFactory.getPlatformMBeanServer.getMBeanInfo(name) + info.getAttributes.length must be > (0) + info.getOperations.length must be > (0) + } + "initially become singleton cluster when joining itself and reach convergence" in { cluster.isSingletonCluster must be(false) // auto-join = off cluster.join(selfAddress) From 20a1e67575239a676cef0862bca1ef19a23c4b6e Mon Sep 17 00:00:00 2001 From: Patrik Nordwall Date: Thu, 5 Jul 2012 12:23:15 +0200 Subject: [PATCH 33/39] Incorparate feedback from review, see #2311 --- .../src/main/scala/akka/cluster/Cluster.scala | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/akka-cluster/src/main/scala/akka/cluster/Cluster.scala b/akka-cluster/src/main/scala/akka/cluster/Cluster.scala index da0d7483a8..f6055f0a23 100644 --- a/akka-cluster/src/main/scala/akka/cluster/Cluster.scala +++ b/akka-cluster/src/main/scala/akka/cluster/Cluster.scala @@ -415,9 +415,17 @@ case class Gossip( // When that is done we check that all the entries in the 'seen' table have the same vector clock version // and that all members exists in seen table val hasUnreachable = unreachable.nonEmpty && unreachable.exists { _.status != Down } - val allMembersInSeen = members.forall(m ⇒ seen.contains(m.address)) + def allMembersInSeen = members.forall(m ⇒ seen.contains(m.address)) - !hasUnreachable && allMembersInSeen && (seen.values.toSet.size == 1) + def seenSame: Boolean = + if (seen.isEmpty) false + else { + val values = seen.values + val seenHead = values.head + values.forall(_ == seenHead) + } + + !hasUnreachable && allMembersInSeen && seenSame } def isLeader(address: Address): Boolean = @@ -434,17 +442,13 @@ case class Gossip( def isUnavailable(address: Address): Boolean = { val isUnreachable = overview.unreachable exists { _.address == address } - val hasUnavailableMemberStatus = members exists { m ⇒ (m.address == address) && m.status.isUnavailable } + val hasUnavailableMemberStatus = members exists { m ⇒ m.status.isUnavailable && m.address == address } isUnreachable || hasUnavailableMemberStatus } def member(address: Address): Member = { - members.find(_.address == address) - .getOrElse { - overview.unreachable - .find(_.address == address) - .getOrElse(Member(address, Removed)) - } + members.find(_.address == address).orElse(overview.unreachable.find(_.address == address)). 
+ getOrElse(Member(address, Removed)) } override def toString = From 01ee8e8fbf45acc10e04bf3923bb7de2e5221a30 Mon Sep 17 00:00:00 2001 From: Patrik Nordwall Date: Thu, 5 Jul 2012 13:55:08 +0200 Subject: [PATCH 34/39] Separate Cluster to several files, see #2311 * Introduced ClusterEnvironment trait to make it easier to test the actors without using the extension * Incorparate more feedback from review --- .../src/main/scala/akka/cluster/Cluster.scala | 1396 +---------------- .../scala/akka/cluster/ClusterDaemon.scala | 926 +++++++++++ .../scala/akka/cluster/ClusterHeartbeat.scala | 135 ++ .../src/main/scala/akka/cluster/Gossip.scala | 212 +++ .../src/main/scala/akka/cluster/Member.scala | 117 ++ .../cluster/RemoteConnectionManager.scala | 150 -- 6 files changed, 1442 insertions(+), 1494 deletions(-) create mode 100644 akka-cluster/src/main/scala/akka/cluster/ClusterDaemon.scala create mode 100644 akka-cluster/src/main/scala/akka/cluster/ClusterHeartbeat.scala create mode 100644 akka-cluster/src/main/scala/akka/cluster/Gossip.scala create mode 100644 akka-cluster/src/main/scala/akka/cluster/Member.scala delete mode 100644 akka-cluster/src/main/scala/akka/cluster/RemoteConnectionManager.scala diff --git a/akka-cluster/src/main/scala/akka/cluster/Cluster.scala b/akka-cluster/src/main/scala/akka/cluster/Cluster.scala index f6055f0a23..a2c64b75cd 100644 --- a/akka-cluster/src/main/scala/akka/cluster/Cluster.scala +++ b/akka-cluster/src/main/scala/akka/cluster/Cluster.scala @@ -4,1347 +4,33 @@ package akka.cluster -import akka.actor._ -import akka.actor.Status._ +import java.io.Closeable +import java.util.concurrent.atomic.AtomicBoolean +import java.util.concurrent.atomic.AtomicReference + +import scala.annotation.tailrec +import scala.collection.immutable.SortedSet + import akka.ConfigurationException +import akka.actor.ActorRef +import akka.actor.ActorSystem +import akka.actor.ActorSystemImpl +import akka.actor.Address +import akka.actor.Cancellable +import akka.actor.DefaultScheduler +import akka.actor.ExtendedActorSystem +import akka.actor.Extension +import akka.actor.ExtensionId +import akka.actor.ExtensionIdProvider +import akka.actor.Props +import akka.actor.Scheduler import akka.dispatch.Await import akka.dispatch.MonitorableThreadFactory import akka.event.Logging -import akka.jsr166y.ThreadLocalRandom -import akka.pattern._ -import akka.remote._ -import akka.routing._ -import akka.util._ -import akka.util.duration._ +import akka.pattern.ask +import akka.remote.RemoteActorRefProvider +import akka.util.Duration import akka.util.internal.HashedWheelTimer -import com.google.protobuf.ByteString -import java.io.Closeable -import java.util.concurrent.atomic.{ AtomicReference, AtomicBoolean } -import java.util.concurrent.TimeoutException -import java.util.concurrent.TimeUnit._ -import MemberStatus._ -import scala.annotation.tailrec -import scala.collection.immutable.{ Map, SortedSet } -import scala.collection.GenTraversableOnce -import java.util.concurrent.atomic.AtomicLong -import java.security.MessageDigest - -/** - * Interface for membership change listener. - */ -trait MembershipChangeListener { - def notify(members: SortedSet[Member]): Unit -} - -/** - * Interface for meta data change listener. - */ -trait MetaDataChangeListener { - def notify(meta: Map[String, Array[Byte]]): Unit -} - -/** - * Base trait for all cluster messages. All ClusterMessage's are serializable. 
- * - * FIXME Protobuf all ClusterMessages - */ -sealed trait ClusterMessage extends Serializable - -/** - * Cluster commands sent by the USER. - */ -object ClusterUserAction { - - /** - * Command to join the cluster. Sent when a node (represented by 'address') - * wants to join another node (the receiver). - */ - case class Join(address: Address) extends ClusterMessage - - /** - * Command to leave the cluster. - */ - case class Leave(address: Address) extends ClusterMessage - - /** - * Command to mark node as temporary down. - */ - case class Down(address: Address) extends ClusterMessage - -} - -/** - * INTERNAL API - */ -private[cluster] object InternalClusterAction { - - /** - * Command to initiate join another node (represented by 'address'). - * Join will be sent to the other node. - */ - case class JoinTo(address: Address) extends ClusterMessage - - /** - * Start message of the process to join one of the seed nodes. - * The node sends `InitJoin` to all seed nodes, which replies - * with `InitJoinAck`. The first reply is used others are discarded. - * The node sends `Join` command to the seed node that replied first. - */ - case object JoinSeedNode extends ClusterMessage - - /** - * @see JoinSeedNode - */ - case object InitJoin extends ClusterMessage - - /** - * @see JoinSeedNode - */ - case class InitJoinAck(address: Address) extends ClusterMessage - - /** - * - * Command to [akka.cluster.ClusterHeartbeatSender]], which will send [[akka.cluster.Heartbeat]] - * to the other node. - * Local only, no need to serialize. - */ - case class SendHeartbeat(heartbeatMsg: Heartbeat, to: Address, deadline: Deadline) - - case object GossipTick - - case object HeartbeatTick - - case object ReapUnreachableTick - - case object LeaderActionsTick - - case object PublishStateTick - - case class SendClusterMessage(to: Address, msg: ClusterMessage) - - case class SendGossipTo(address: Address) - - case object GetClusterCoreRef - - case class Ping(timestamp: Long = System.currentTimeMillis) extends ClusterMessage - case class Pong(ping: Ping, timestamp: Long = System.currentTimeMillis) extends ClusterMessage - -} - -/** - * INTERNAL API. - * - * Cluster commands sent by the LEADER. - */ -private[cluster] object ClusterLeaderAction { - - /** - * Command to mark a node to be removed from the cluster immediately. - * Can only be sent by the leader. - */ - case class Exit(address: Address) extends ClusterMessage - - /** - * Command to remove a node from the cluster immediately. - */ - case class Remove(address: Address) extends ClusterMessage -} - -/** - * Represents the address and the current status of a cluster member node. - * - * Note: `hashCode` and `equals` are solely based on the underlying `Address`, not its `MemberStatus`. - */ -class Member(val address: Address, val status: MemberStatus) extends ClusterMessage { - override def hashCode = address.## - override def equals(other: Any) = Member.unapply(this) == Member.unapply(other) - override def toString = "Member(address = %s, status = %s)" format (address, status) - def copy(address: Address = this.address, status: MemberStatus = this.status): Member = new Member(address, status) -} - -/** - * Module with factory and ordering methods for Member instances. - */ -object Member { - - /** - * `Address` ordering type class, sorts addresses by host and port. 
- */ - implicit val addressOrdering: Ordering[Address] = Ordering.fromLessThan[Address] { (a, b) ⇒ - if (a.host != b.host) a.host.getOrElse("").compareTo(b.host.getOrElse("")) < 0 - else if (a.port != b.port) a.port.getOrElse(0) < b.port.getOrElse(0) - else false - } - - /** - * `Member` ordering type class, sorts members by host and port with the exception that - * it puts all members that are in MemberStatus.EXITING last. - */ - implicit val ordering: Ordering[Member] = Ordering.fromLessThan[Member] { (a, b) ⇒ - if (a.status == Exiting && b.status != Exiting) false - else if (a.status != Exiting && b.status == Exiting) true - else addressOrdering.compare(a.address, b.address) < 0 - } - - def apply(address: Address, status: MemberStatus): Member = new Member(address, status) - - def unapply(other: Any) = other match { - case m: Member ⇒ Some(m.address) - case _ ⇒ None - } - - def pickHighestPriority(a: Set[Member], b: Set[Member]): Set[Member] = { - // group all members by Address => Seq[Member] - val groupedByAddress = (a.toSeq ++ b.toSeq).groupBy(_.address) - // pick highest MemberStatus - (Set.empty[Member] /: groupedByAddress) { - case (acc, (_, members)) ⇒ acc + members.reduceLeft(highestPriorityOf) - } - } - - /** - * Picks the Member with the highest "priority" MemberStatus. - */ - def highestPriorityOf(m1: Member, m2: Member): Member = (m1.status, m2.status) match { - case (Removed, _) ⇒ m1 - case (_, Removed) ⇒ m2 - case (Down, _) ⇒ m1 - case (_, Down) ⇒ m2 - case (Exiting, _) ⇒ m1 - case (_, Exiting) ⇒ m2 - case (Leaving, _) ⇒ m1 - case (_, Leaving) ⇒ m2 - case (Up, Joining) ⇒ m2 - case (Joining, Up) ⇒ m1 - case (Joining, Joining) ⇒ m1 - case (Up, Up) ⇒ m1 - } - - // FIXME Workaround for https://issues.scala-lang.org/browse/SI-5986 - // SortedSet + and ++ operators replaces existing element - // Use these :+ and :++ operators for the Gossip members - implicit def sortedSetWorkaround(sortedSet: SortedSet[Member]): SortedSetWorkaround = new SortedSetWorkaround(sortedSet) - class SortedSetWorkaround(sortedSet: SortedSet[Member]) { - implicit def :+(elem: Member): SortedSet[Member] = { - if (sortedSet.contains(elem)) sortedSet - else sortedSet + elem - } - - implicit def :++(elems: GenTraversableOnce[Member]): SortedSet[Member] = - sortedSet ++ (elems.toSet diff sortedSet) - } -} - -/** - * Envelope adding a sender address to the gossip. - */ -case class GossipEnvelope(from: Address, gossip: Gossip, conversation: Boolean = true) extends ClusterMessage - -/** - * When conflicting versions of received and local [[akka.cluster.Gossip]] is detected - * it's forwarded to the leader for conflict resolution. - */ -case class GossipMergeConflict(a: GossipEnvelope, b: GossipEnvelope) extends ClusterMessage - -/** - * Defines the current status of a cluster member node - * - * Can be one of: Joining, Up, Leaving, Exiting and Down. - */ -sealed trait MemberStatus extends ClusterMessage { - - /** - * Using the same notion for 'unavailable' as 'non-convergence': DOWN - */ - def isUnavailable: Boolean = this == Down -} - -object MemberStatus { - case object Joining extends MemberStatus - case object Up extends MemberStatus - case object Leaving extends MemberStatus - case object Exiting extends MemberStatus - case object Down extends MemberStatus - case object Removed extends MemberStatus -} - -/** - * Represents the overview of the cluster, holds the cluster convergence table and set with unreachable nodes. 
- */ -case class GossipOverview( - seen: Map[Address, VectorClock] = Map.empty, - unreachable: Set[Member] = Set.empty) { - - def isNonDownUnreachable(address: Address): Boolean = - unreachable.exists { m ⇒ m.address == address && m.status != Down } - - override def toString = - "GossipOverview(seen = [" + seen.mkString(", ") + - "], unreachable = [" + unreachable.mkString(", ") + - "])" -} - -object Gossip { - val emptyMembers: SortedSet[Member] = SortedSet.empty - -} - -/** - * Represents the state of the cluster; cluster ring membership, ring convergence, meta data - - * all versioned by a vector clock. - * - * When a node is joining the `Member`, with status `Joining`, is added to `members`. - * If the joining node was downed it is moved from `overview.unreachable` (status `Down`) - * to `members` (status `Joining`). It cannot rejoin if not first downed. - * - * When convergence is reached the leader change status of `members` from `Joining` - * to `Up`. - * - * When failure detector consider a node as unavailable it will be moved from - * `members` to `overview.unreachable`. - * - * When a node is downed, either manually or automatically, its status is changed to `Down`. - * It is also removed from `overview.seen` table. The node will reside as `Down` in the - * `overview.unreachable` set until joining again and it will then go through the normal - * joining procedure. - * - * When a `Gossip` is received the version (vector clock) is used to determine if the - * received `Gossip` is newer or older than the current local `Gossip`. The received `Gossip` - * and local `Gossip` is merged in case of conflicting version, i.e. vector clocks without - * same history. When merged the seen table is cleared. - * - * When a node is told by the user to leave the cluster the leader will move it to `Leaving` - * and then rebalance and repartition the cluster and start hand-off by migrating the actors - * from the leaving node to the new partitions. Once this process is complete the leader will - * move the node to the `Exiting` state and once a convergence is complete move the node to - * `Removed` by removing it from the `members` set and sending a `Removed` command to the - * removed node telling it to shut itself down. 
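Distilling the life cycle described above into a transition table (an illustrative sketch of the documented rules, not code from the patch):

    import MemberStatus._

    // Legal status transitions as described in the scaladoc above.
    val transitions: Map[MemberStatus, Set[MemberStatus]] = Map(
      Joining -> Set(Up, Down),      // leader moves Joining -> Up on convergence
      Up      -> Set(Leaving, Down), // user-initiated leave, or manual/auto down
      Leaving -> Set(Exiting, Down), // leader moves Leaving -> Exiting after hand-off
      Exiting -> Set(Removed, Down), // leader removes the node once convergence is reached
      Down    -> Set(Joining),       // a downed node can only come back by rejoining
      Removed -> Set.empty)          // terminal: the node has been told to shut down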
- */ -case class Gossip( - overview: GossipOverview = GossipOverview(), - members: SortedSet[Member] = Gossip.emptyMembers, // sorted set of members with their status, sorted by address - meta: Map[String, Array[Byte]] = Map.empty, - version: VectorClock = VectorClock()) // vector clock version - extends ClusterMessage // is a serializable cluster message - with Versioned[Gossip] { - - // FIXME can be disabled as optimization - assertInvariants - - private def assertInvariants: Unit = { - val unreachableAndLive = members.intersect(overview.unreachable) - if (unreachableAndLive.nonEmpty) - throw new IllegalArgumentException("Same nodes in both members and unreachable is not allowed, got [%s]" - format unreachableAndLive.mkString(", ")) - - val allowedLiveMemberStatuses: Set[MemberStatus] = Set(Joining, Up, Leaving, Exiting) - def hasNotAllowedLiveMemberStatus(m: Member) = !allowedLiveMemberStatuses.contains(m.status) - if (members exists hasNotAllowedLiveMemberStatus) - throw new IllegalArgumentException("Live members must have status [%s], got [%s]" - format (allowedLiveMemberStatuses.mkString(", "), - (members filter hasNotAllowedLiveMemberStatus).mkString(", "))) - - val seenButNotMember = overview.seen.keySet -- members.map(_.address) -- overview.unreachable.map(_.address) - if (seenButNotMember.nonEmpty) - throw new IllegalArgumentException("Nodes not part of cluster have marked the Gossip as seen, got [%s]" - format seenButNotMember.mkString(", ")) - - } - - /** - * Increments the version for this 'Node'. - */ - def :+(node: VectorClock.Node): Gossip = copy(version = version :+ node) - - /** - * Adds a member to the member node ring. - */ - def :+(member: Member): Gossip = { - if (members contains member) this - else this copy (members = members :+ member) - } - - /** - * Marks the gossip as seen by this node (address) by updating the address entry in the 'gossip.overview.seen' - * Map with the VectorClock (version) for the new gossip. - */ - def seen(address: Address): Gossip = { - if (overview.seen.contains(address) && overview.seen(address) == version) this - else this copy (overview = overview copy (seen = overview.seen + (address -> version))) - } - - /** - * Merges two Gossip instances including membership tables, meta-data tables and the VectorClock histories. - */ - def merge(that: Gossip): Gossip = { - import Member.ordering - - // 1. merge vector clocks - val mergedVClock = this.version merge that.version - - // 2. merge meta-data - val mergedMeta = this.meta ++ that.meta - - // 3. merge unreachable by selecting the single Member with highest MemberStatus out of the Member groups - val mergedUnreachable = Member.pickHighestPriority(this.overview.unreachable, that.overview.unreachable) - - // 4. merge members by selecting the single Member with highest MemberStatus out of the Member groups, - // and exclude unreachable - val mergedMembers = Gossip.emptyMembers :++ Member.pickHighestPriority(this.members, that.members).filterNot(mergedUnreachable.contains) - - // 5. fresh seen table - val mergedSeen = Map.empty[Address, VectorClock] - - Gossip(GossipOverview(mergedSeen, mergedUnreachable), mergedMembers, mergedMeta, mergedVClock) - } - - /** - * Checks if we have a cluster convergence. If there are any unreachable nodes then we can't have a convergence - - * waiting for user to act (issuing DOWN) or leader to act (issuing DOWN through auto-down). 
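To make the merge semantics above concrete, a hedged sketch (the addresses are made up; `import Member._` brings the `SortedSet` workaround and ordering into scope):

    import Member._

    val a1 = Address("akka", "sys", "host1", 2552) // hypothetical
    val a2 = Address("akka", "sys", "host2", 2552) // hypothetical

    // two gossips with concurrent histories: each node only knows itself
    val g1 = (Gossip(members = Gossip.emptyMembers :+ Member(a1, Up)) :+ VectorClock.Node(a1.toString)) seen a1
    val g2 = (Gossip(members = Gossip.emptyMembers :+ Member(a2, Up)) :+ VectorClock.Node(a2.toString)) seen a2

    val merged = g1 merge g2
    // merged.members contains both a1 and a2 (highest status per address wins),
    // merged.version is the merge of both vector clocks,
    // merged.overview.seen is empty - every node must mark the merged gossip as seen anew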
- * - * @returns Some(convergedGossip) if convergence have been reached and None if not - */ - def convergence: Boolean = { - val unreachable = overview.unreachable - val seen = overview.seen - - // First check that: - // 1. we don't have any members that are unreachable, or - // 2. all unreachable members in the set have status DOWN - // Else we can't continue to check for convergence - // When that is done we check that all the entries in the 'seen' table have the same vector clock version - // and that all members exists in seen table - val hasUnreachable = unreachable.nonEmpty && unreachable.exists { _.status != Down } - def allMembersInSeen = members.forall(m ⇒ seen.contains(m.address)) - - def seenSame: Boolean = - if (seen.isEmpty) false - else { - val values = seen.values - val seenHead = values.head - values.forall(_ == seenHead) - } - - !hasUnreachable && allMembersInSeen && seenSame - } - - def isLeader(address: Address): Boolean = - members.nonEmpty && (address == members.head.address) - - def leader: Option[Address] = members.headOption.map(_.address) - - def isSingletonCluster: Boolean = members.size == 1 - - /** - * Returns true if the node is UP or JOINING. - */ - def isAvailable(address: Address): Boolean = !isUnavailable(address) - - def isUnavailable(address: Address): Boolean = { - val isUnreachable = overview.unreachable exists { _.address == address } - val hasUnavailableMemberStatus = members exists { m ⇒ m.status.isUnavailable && m.address == address } - isUnreachable || hasUnavailableMemberStatus - } - - def member(address: Address): Member = { - members.find(_.address == address).orElse(overview.unreachable.find(_.address == address)). - getOrElse(Member(address, Removed)) - } - - override def toString = - "Gossip(" + - "overview = " + overview + - ", members = [" + members.mkString(", ") + - "], meta = [" + meta.mkString(", ") + - "], version = " + version + - ")" -} - -/** - * Sent at regular intervals for failure detection. - */ -case class Heartbeat(from: Address) extends ClusterMessage - -/** - * INTERNAL API - */ -private[cluster] case class ClusterStats( - receivedGossipCount: Long = 0L, - mergeConflictCount: Long = 0L, - mergeCount: Long = 0L, - mergeDetectedCount: Long = 0L) { - - def incrementReceivedGossipCount(): ClusterStats = - copy(receivedGossipCount = receivedGossipCount + 1) - - def incrementMergeConflictCount(): ClusterStats = - copy(mergeConflictCount = mergeConflictCount + 1) - - def incrementMergeCount(): ClusterStats = - copy(mergeCount = mergeCount + 1) - - def incrementMergeDetectedCount(): ClusterStats = - copy(mergeDetectedCount = mergeDetectedCount + 1) -} - -/** - * INTERNAL API. - * - * Receives Heartbeat messages and delegates to Cluster. - * Instantiated as a single instance for each Cluster - e.g. heartbeats are serialized - * to Cluster message after message, but concurrent with other types of messages. - */ -private[cluster] final class ClusterHeartbeatDaemon(cluster: Cluster) extends Actor with ActorLogging { - - def receive = { - case Heartbeat(from) ⇒ cluster.failureDetector heartbeat from - } - -} - -/* - * This actor is responsible for sending the heartbeat messages to - * other nodes. Netty blocks when sending to broken connections. This actor - * isolates sending to different nodes by using child workers for each target - * address and thereby reduce the risk of irregular heartbeats to healty - * nodes due to broken connections to other nodes. 
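The per-target isolation mentioned above boils down to a look-up-or-create pattern for child workers; stripped to its bones (names are illustrative):

    // Route each target address to a dedicated child actor, creating it on demand,
    // so that one blocked connection cannot delay heartbeats to healthy nodes.
    def workerFor(workerName: String, props: Props): ActorRef =
      context.actorFor(workerName) match {
        case notFound if notFound.isTerminated ⇒ context.actorOf(props, workerName)
        case child                             ⇒ child
      }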
- */ -private[cluster] final class ClusterHeartbeatSender(cluster: Cluster) extends Actor with ActorLogging { - - import InternalClusterAction._ - - /** - * Looks up and returns the remote cluster heartbeat connection for the specific address. - */ - def clusterHeartbeatConnectionFor(address: Address): ActorRef = - context.system.actorFor(RootActorPath(address) / "system" / "cluster" / "heartbeat") - - val digester = MessageDigest.getInstance("MD5") - - /** - * Child name is MD5 hash of the address. - * FIXME Change to URLEncode when ticket #2123 has been fixed - */ - def encodeChildName(name: String): String = { - digester update name.getBytes("UTF-8") - digester.digest.map { h ⇒ "%02x".format(0xFF & h) }.mkString - } - - def receive = { - case msg @ SendHeartbeat(from, to, deadline) ⇒ - val workerName = encodeChildName(to.toString) - val worker = context.actorFor(workerName) match { - case notFound if notFound.isTerminated ⇒ - context.actorOf(Props(new ClusterHeartbeatSenderWorker( - cluster.settings.SendCircuitBreakerSettings, clusterHeartbeatConnectionFor(to))), workerName) - case child ⇒ child - } - worker ! msg - } - -} - -/** - * Responsible for sending [[akka.cluster.Heartbeat]] to one specific address. - * - * Netty blocks when sending to broken connections, and this actor uses - * a configurable circuit breaker to reduce connect attempts to broken - * connections. - * - * @see ClusterHeartbeatSender - */ -private[cluster] final class ClusterHeartbeatSenderWorker( - cbSettings: CircuitBreakerSettings, toRef: ActorRef) - extends Actor with ActorLogging { - - import InternalClusterAction._ - - val breaker = CircuitBreaker(context.system.scheduler, - cbSettings.maxFailures, cbSettings.callTimeout, cbSettings.resetTimeout). - onHalfOpen(log.debug("CircuitBreaker Half-Open for: [{}]", toRef)). - onOpen(log.debug("CircuitBreaker Open for [{}]", toRef)). - onClose(log.debug("CircuitBreaker Closed for [{}]", toRef)) - - context.setReceiveTimeout(30 seconds) - - def receive = { - case SendHeartbeat(heartbeatMsg, _, deadline) ⇒ - if (!deadline.isOverdue) { - // the CircuitBreaker will measure elapsed time and open if too many long calls - try breaker.withSyncCircuitBreaker { - log.debug("Cluster Node [{}] - Heartbeat to [{}]", heartbeatMsg.from, toRef) - toRef ! heartbeatMsg - if (deadline.isOverdue) log.debug("Sending heartbeat to [{}] took longer than expected", toRef) - } catch { case e: CircuitBreakerOpenException ⇒ /* skip sending heartbeat to broken connection */ } - - // make sure it will cleanup when not used any more - context.setReceiveTimeout(30 seconds) - } - - case ReceiveTimeout ⇒ context.stop(self) // cleanup when not used - - } -} - -/** - * INTERNAL API. - */ -private[cluster] final class ClusterCoreSender(selfAddress: Address) extends Actor with ActorLogging { - import InternalClusterAction._ - - /** - * Looks up and returns the remote cluster command connection for the specific address. - */ - private def clusterCoreConnectionFor(address: Address): ActorRef = - context.system.actorFor(RootActorPath(address) / "system" / "cluster" / "core") - - def receive = { - case SendClusterMessage(to, msg) ⇒ - log.debug("Cluster Node [{}] - Trying to send [{}] to [{}]", selfAddress, msg.getClass.getSimpleName, to) - clusterCoreConnectionFor(to) ! msg - } -} - -/** - * INTERNAL API. 
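A condensed sketch of the circuit-breaker guard used by the heartbeat worker above (the settings values are illustrative; the pattern is the one from akka.pattern):

    import akka.pattern.{ CircuitBreaker, CircuitBreakerOpenException }
    import akka.util.duration._

    val breaker = CircuitBreaker(context.system.scheduler,
      3 /* maxFailures */, 1 second /* callTimeout */, 10 seconds /* resetTimeout */)

    try breaker.withSyncCircuitBreaker {
      toRef ! heartbeatMsg // measured call: too many failed or slow sends open the breaker
    } catch {
      case _: CircuitBreakerOpenException ⇒ // connection considered broken: skip this beat
    }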
- */ -private[cluster] final class ClusterCore(cluster: Cluster) extends Actor with ActorLogging { - // FIXME break up the cluster constructor parameter into something that is easier to test without Cluster - import ClusterLeaderAction._ - import InternalClusterAction._ - - import cluster.settings._ - import cluster.selfAddress - import cluster.clusterScheduler - - val vclockNode = VectorClock.Node(selfAddress.toString) - val selfHeartbeat = Heartbeat(selfAddress) - - // note that self is not initially member, - // and the Gossip is not versioned for this 'Node' yet - var latestGossip: Gossip = Gossip() - var joinInProgress: Map[Address, Deadline] = Map.empty - - var stats = ClusterStats() - - val heartbeatSender = context.actorOf(Props(new ClusterHeartbeatSender(cluster)). - withDispatcher(UseDispatcher), name = "heartbeatSender") - val coreSender = context.actorOf(Props(new ClusterCoreSender(selfAddress)). - withDispatcher(UseDispatcher), name = "coreSender") - - // start periodic gossip to random nodes in cluster - val gossipTask = - FixedRateTask(clusterScheduler, PeriodicTasksInitialDelay.max(GossipInterval), GossipInterval) { - self ! GossipTick - } - - // start periodic heartbeat to all nodes in cluster - val heartbeatTask = - FixedRateTask(clusterScheduler, PeriodicTasksInitialDelay.max(HeartbeatInterval), HeartbeatInterval) { - self ! HeartbeatTick - } - - // start periodic cluster failure detector reaping (moving nodes condemned by the failure detector to unreachable list) - val failureDetectorReaperTask = - FixedRateTask(clusterScheduler, PeriodicTasksInitialDelay.max(UnreachableNodesReaperInterval), UnreachableNodesReaperInterval) { - self ! ReapUnreachableTick - } - - // start periodic leader action management (only applies for the current leader) - private val leaderActionsTask = - FixedRateTask(clusterScheduler, PeriodicTasksInitialDelay.max(LeaderActionsInterval), LeaderActionsInterval) { - self ! LeaderActionsTick - } - - // start periodic publish of current state - private val publishStateTask: Option[Cancellable] = - if (PublishStateInterval == Duration.Zero) None - else Some(FixedRateTask(clusterScheduler, PeriodicTasksInitialDelay.max(PublishStateInterval), PublishStateInterval) { - self ! PublishStateTick - }) - - override def preStart(): Unit = { - if (AutoJoin) self ! 
InternalClusterAction.JoinSeedNode - } - - override def postStop(): Unit = { - gossipTask.cancel() - heartbeatTask.cancel() - failureDetectorReaperTask.cancel() - leaderActionsTask.cancel() - publishStateTask foreach { _.cancel() } - } - - def receive = { - case JoinSeedNode ⇒ joinSeedNode() - case InitJoin ⇒ initJoin() - case InitJoinAck(address) ⇒ join(address) - case Failure(e: AskTimeoutException) ⇒ joinSeedNodeTimeout() - case JoinTo(address) ⇒ join(address) - case ClusterUserAction.Join(address) ⇒ joining(address) - case ClusterUserAction.Down(address) ⇒ downing(address) - case ClusterUserAction.Leave(address) ⇒ leaving(address) - case Exit(address) ⇒ exiting(address) - case Remove(address) ⇒ removing(address) - case msg: GossipEnvelope ⇒ receiveGossip(msg) - case msg: GossipMergeConflict ⇒ receiveGossipMerge(msg) - case GossipTick ⇒ gossip() - case HeartbeatTick ⇒ heartbeat() - case ReapUnreachableTick ⇒ reapUnreachableMembers() - case LeaderActionsTick ⇒ leaderActions() - case SendGossipTo(address) ⇒ gossipTo(address) - case PublishStateTick ⇒ publishState() - case p: Ping ⇒ ping(p) - - } - - def joinSeedNode(): Unit = { - val seedRoutees = for (address ← cluster.seedNodes; if address != cluster.selfAddress) - yield self.path.toStringWithAddress(address) - if (seedRoutees.isEmpty) { - cluster join cluster.selfAddress - } else { - implicit val within = Timeout(cluster.settings.SeedNodeTimeout) - val seedRouter = context.actorOf( - Props.empty.withRouter(ScatterGatherFirstCompletedRouter( - routees = seedRoutees, within = within.duration))) - seedRouter ? InitJoin pipeTo self - seedRouter ! PoisonPill - } - } - - def initJoin(): Unit = sender ! InitJoinAck(cluster.selfAddress) - - def joinSeedNodeTimeout(): Unit = cluster join cluster.selfAddress - - /** - * Try to join this cluster node with the node specified by 'address'. - * A 'Join(thisNodeAddress)' command is sent to the node to join. - */ - def join(address: Address): Unit = { - val localGossip = latestGossip - // wipe our state since a node that joins a cluster must be empty - latestGossip = Gossip() - joinInProgress = Map.empty + (address -> (Deadline.now + JoinTimeout)) - - // wipe the failure detector since we are starting fresh and shouldn't care about the past - cluster.failureDetector.reset() - - notifyListeners(localGossip) - - val command = ClusterUserAction.Join(selfAddress) - coreSender ! SendClusterMessage(address, command) - } - - /** - * State transition to JOINING - new node joining. 
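The seed-node handshake above is the ask/pipe pattern over a ScatterGatherFirstCompletedRouter; reduced to its essentials (`seedPaths` is a stand-in for the computed routee paths, the timeout value is illustrative):

    import akka.pattern.{ ask, pipe }

    implicit val within = Timeout(5 seconds)
    val seedRouter = context.actorOf(Props.empty.withRouter(
      ScatterGatherFirstCompletedRouter(routees = seedPaths, within = within.duration)))

    seedRouter ? InitJoin pipeTo self // first InitJoinAck wins; a timeout arrives as Failure(AskTimeoutException)
    seedRouter ! PoisonPill           // the router is one-shot, discard it right away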
- */ - def joining(node: Address): Unit = { - val localGossip = latestGossip - val localMembers = localGossip.members - val localUnreachable = localGossip.overview.unreachable - - val alreadyMember = localMembers.exists(_.address == node) - val isUnreachable = localGossip.overview.isNonDownUnreachable(node) - - if (!alreadyMember && !isUnreachable) { - - // remove the node from the 'unreachable' set in case it is a DOWN node that is rejoining cluster - val (rejoiningMember, newUnreachableMembers) = localUnreachable partition { _.address == node } - val newOverview = localGossip.overview copy (unreachable = newUnreachableMembers) - - // remove the node from the failure detector if it is a DOWN node that is rejoining cluster - if (rejoiningMember.nonEmpty) cluster.failureDetector.remove(node) - - // add joining node as Joining - // add self in case someone else joins before self has joined (Set discards duplicates) - val newMembers = localMembers :+ Member(node, Joining) :+ Member(selfAddress, Joining) - val newGossip = localGossip copy (overview = newOverview, members = newMembers) - - val versionedGossip = newGossip :+ vclockNode - val seenVersionedGossip = versionedGossip seen selfAddress - - latestGossip = seenVersionedGossip - - log.debug("Cluster Node [{}] - Node [{}] is JOINING", selfAddress, node) - // treat join as initial heartbeat, so that it becomes unavailable if nothing more happens - if (node != selfAddress) { - cluster.failureDetector heartbeat node - gossipTo(node) - } - - notifyListeners(localGossip) - } - } - - /** - * State transition to LEAVING. - */ - def leaving(address: Address): Unit = { - val localGossip = latestGossip - if (localGossip.members.exists(_.address == address)) { // only try to update if the node is available (in the member ring) - val newMembers = localGossip.members map { member ⇒ if (member.address == address) Member(address, Leaving) else member } // mark node as LEAVING - val newGossip = localGossip copy (members = newMembers) - - val versionedGossip = newGossip :+ vclockNode - val seenVersionedGossip = versionedGossip seen selfAddress - - latestGossip = seenVersionedGossip - - log.info("Cluster Node [{}] - Marked address [{}] as LEAVING", selfAddress, address) - notifyListeners(localGossip) - } - } - - /** - * State transition to EXITING. - */ - def exiting(address: Address): Unit = { - log.info("Cluster Node [{}] - Marked node [{}] as EXITING", selfAddress, address) - // FIXME implement when we implement hand-off - } - - /** - * State transition to REMOVED. - * - * This method is for now only called after the LEADER have sent a Removed message - telling the node - * to shut down himself. - * - * In the future we might change this to allow the USER to send a Removed(address) message telling an - * arbitrary node to be moved direcly from UP -> REMOVED. - */ - def removing(address: Address): Unit = { - log.info("Cluster Node [{}] - Node has been REMOVED by the leader - shutting down...", selfAddress) - val localGossip = latestGossip - // just cleaning up the gossip state - latestGossip = Gossip() - // make sure the final (removed) state is always published - notifyListeners(localGossip) - cluster.shutdown() - } - - /** - * The node to DOWN is removed from the 'members' set and put in the 'unreachable' set (if not already there) - * and its status is set to DOWN. The node is also removed from the 'seen' table. 
- * - * The node will reside as DOWN in the 'unreachable' set until an explicit command JOIN command is sent directly - * to this node and it will then go through the normal JOINING procedure. - */ - def downing(address: Address): Unit = { - val localGossip = latestGossip - val localMembers = localGossip.members - val localOverview = localGossip.overview - val localSeen = localOverview.seen - val localUnreachableMembers = localOverview.unreachable - - // 1. check if the node to DOWN is in the 'members' set - val downedMember: Option[Member] = localMembers.collectFirst { - case m if m.address == address ⇒ m.copy(status = Down) - } - val newMembers = downedMember match { - case Some(m) ⇒ - log.info("Cluster Node [{}] - Marking node [{}] as DOWN", selfAddress, m.address) - localMembers - m - case None ⇒ localMembers - } - - // 2. check if the node to DOWN is in the 'unreachable' set - val newUnreachableMembers = - localUnreachableMembers.map { member ⇒ - // no need to DOWN members already DOWN - if (member.address == address && member.status != Down) { - log.info("Cluster Node [{}] - Marking unreachable node [{}] as DOWN", selfAddress, member.address) - member copy (status = Down) - } else member - } - - // 3. add the newly DOWNED members from the 'members' (in step 1.) to the 'newUnreachableMembers' set. - val newUnreachablePlusNewlyDownedMembers = newUnreachableMembers ++ downedMember - - // 4. remove nodes marked as DOWN from the 'seen' table - val newSeen = localSeen -- newUnreachablePlusNewlyDownedMembers.collect { - case m if m.status == Down ⇒ m.address - } - - // update gossip overview - val newOverview = localOverview copy (seen = newSeen, unreachable = newUnreachablePlusNewlyDownedMembers) - val newGossip = localGossip copy (overview = newOverview, members = newMembers) // update gossip - val versionedGossip = newGossip :+ vclockNode - latestGossip = versionedGossip seen selfAddress - - notifyListeners(localGossip) - } - - /** - * When conflicting versions of received and local [[akka.cluster.Gossip]] is detected - * it's forwarded to the leader for conflict resolution. Trying to simultaneously - * resolving conflicts at several nodes creates new conflicts. Therefore the leader resolves - * conflicts to limit divergence. To avoid overload there is also a configurable rate - * limit of how many conflicts that are handled by second. If the limit is - * exceeded the conflicting gossip messages are dropped and will reappear later. - */ - def receiveGossipMerge(merge: GossipMergeConflict): Unit = { - stats = stats.incrementMergeConflictCount - val rate = mergeRate(stats.mergeConflictCount) - if (rate <= MaxGossipMergeRate) { - receiveGossip(merge.a.copy(conversation = false)) - receiveGossip(merge.b.copy(conversation = false)) - - // use one-way gossip from leader to reduce load of leader - def sendBack(to: Address): Unit = { - if (to != selfAddress && !latestGossip.overview.unreachable.exists(_.address == to)) - oneWayGossipTo(to) - } - - sendBack(merge.a.from) - sendBack(merge.b.from) - - } else { - log.debug("Dropping gossip merge conflict due to rate [{}] / s ", rate) - } - } - - /** - * Receive new gossip. - */ - def receiveGossip(envelope: GossipEnvelope): Unit = { - val from = envelope.from - val remoteGossip = envelope.gossip - val localGossip = latestGossip - - if (remoteGossip.overview.unreachable.exists(_.address == selfAddress)) { - // FIXME how should we handle this situation? 
- log.debug("Received gossip with self as unreachable, from [{}]", from) - - } else if (!localGossip.overview.isNonDownUnreachable(from)) { - - // leader handles merge conflicts, or when they have different views of how is leader - val handleMerge = localGossip.leader == Some(selfAddress) || localGossip.leader != remoteGossip.leader - val conflict = remoteGossip.version <> localGossip.version - - if (conflict && !handleMerge) { - // delegate merge resolution to leader to reduce number of simultaneous resolves, - // which will result in new conflicts - - stats = stats.incrementMergeDetectedCount - log.debug("Merge conflict [{}] detected [{}] <> [{}]", stats.mergeDetectedCount, selfAddress, from) - - stats = stats.incrementMergeConflictCount - val rate = mergeRate(stats.mergeConflictCount) - if (rate <= MaxGossipMergeRate) { - coreSender ! SendClusterMessage( - to = localGossip.leader.get, - msg = GossipMergeConflict(GossipEnvelope(selfAddress, localGossip), envelope)) - } else { - log.debug("Skipping gossip merge conflict due to rate [{}] / s ", rate) - } - - } else { - - val winningGossip = - - if (conflict) { - // conflicting versions, merge, and new version - val mergedGossip = remoteGossip merge localGossip - mergedGossip :+ vclockNode - - } else if (remoteGossip.version < localGossip.version) { - // local gossip is newer - localGossip - - } else if (!remoteGossip.members.exists(_.address == selfAddress)) { - // FIXME This is a very strange. It can happen when many nodes join at the same time. - // It's not detected as an ordinary version conflict <> - // If we don't handle this situation there will be IllegalArgumentException when marking this as seen - // merge, and new version - val mergedGossip = remoteGossip merge (localGossip :+ Member(selfAddress, Joining)) - mergedGossip :+ vclockNode - - } else { - // remote gossip is newer - remoteGossip - - } - - val newJoinInProgress = - if (joinInProgress.isEmpty) joinInProgress - else joinInProgress -- - winningGossip.members.map(_.address) -- - winningGossip.overview.unreachable.map(_.address) - - latestGossip = winningGossip seen selfAddress - joinInProgress = newJoinInProgress - - // for all new joining nodes we remove them from the failure detector - (latestGossip.members -- localGossip.members).filter(_.status == Joining).foreach { node ⇒ - cluster.failureDetector.remove(node.address) - } - - log.debug("Cluster Node [{}] - Receiving gossip from [{}]", selfAddress, from) - - if (conflict) { - stats = stats.incrementMergeCount - log.debug( - """Couldn't establish a causal relationship between "remote" gossip and "local" gossip - Remote[{}] - Local[{}] - merged them into [{}]""", - remoteGossip, localGossip, winningGossip) - } - - stats = stats.incrementReceivedGossipCount - notifyListeners(localGossip) - - if (envelope.conversation && - (conflict || (winningGossip ne remoteGossip) || (latestGossip ne remoteGossip))) { - // send back gossip to sender when sender had different view, i.e. merge, or sender had - // older or sender had newer - gossipTo(from) - } - } - } - } - - def mergeRate(count: Long): Double = (count * 1000.0) / GossipInterval.toMillis - - /** - * Initiates a new round of gossip. 
- */ - def gossip(): Unit = { - stats = stats.copy(mergeConflictCount = 0) - - log.debug("Cluster Node [{}] - Initiating new round of gossip", selfAddress) - - if (!isSingletonCluster && isAvailable) { - val localGossip = latestGossip - // important to not accidentally use `map` of the SortedSet, since the original order is not preserved - val localMembers = localGossip.members.toIndexedSeq - val localMembersSize = localMembers.size - val localMemberAddresses = localMembers map { _.address } - - val localUnreachableMembers = localGossip.overview.unreachable.toIndexedSeq - val localUnreachableSize = localUnreachableMembers.size - - // gossip to a random alive member with preference to a member - // with older or newer gossip version - val nodesWithdifferentView = { - val localMemberAddressesSet = localGossip.members map { _.address } - for { - (address, version) ← localGossip.overview.seen - if localMemberAddressesSet contains address - if version != localGossip.version - } yield address - } - val gossipedToAlive = - if (nodesWithdifferentView.nonEmpty && ThreadLocalRandom.current.nextDouble() < GossipDifferentViewProbability) - gossipToRandomNodeOf(nodesWithdifferentView.toIndexedSeq) - else - gossipToRandomNodeOf(localMemberAddresses) - - } - } - - /** - * Runs periodic leader actions, such as auto-downing unreachable nodes, assigning partitions etc. - */ - def leaderActions(): Unit = { - val localGossip = latestGossip - val localMembers = localGossip.members - - val isLeader = localMembers.nonEmpty && (selfAddress == localMembers.head.address) - - if (isLeader && isAvailable) { - // only run the leader actions if we are the LEADER and available - - val localOverview = localGossip.overview - val localSeen = localOverview.seen - val localUnreachableMembers = localOverview.unreachable - val hasPartionHandoffCompletedSuccessfully: Boolean = { - // FIXME implement partion handoff and a check if it is completed - now just returns TRUE - e.g. has completed successfully - true - } - - // Leader actions are as follows: - // 1. Move EXITING => REMOVED -- When all nodes have seen that the node is EXITING (convergence) - remove the nodes from the node ring and seen table - // 2. Move JOINING => UP -- When a node joins the cluster - // 3. Move LEAVING => EXITING -- When all partition handoff has completed - // 4. Move UNREACHABLE => DOWN -- When the node is in the UNREACHABLE set it can be auto-down by leader - // 5. Store away all stuff needed for the side-effecting processing in 10. - // 6. Updating the vclock version for the changes - // 7. Updating the 'seen' table - // 8. Try to update the state with the new gossip - // 9. If failure - retry - // 10. If success - run all the side-effecting processing - - val ( - newGossip: Gossip, - hasChangedState: Boolean, - upMembers, - exitingMembers, - removedMembers, - unreachableButNotDownedMembers) = - - if (localGossip.convergence) { - // we have convergence - so we can't have unreachable nodes - - // transform the node member ring - filterNot/map/map - val newMembers = - localMembers filterNot { member ⇒ - // ---------------------- - // 1. Move EXITING => REMOVED - e.g. remove the nodes from the 'members' set/node ring and seen table - // ---------------------- - member.status == MemberStatus.Exiting - - } map { member ⇒ - // ---------------------- - // 2. Move JOINING => UP (once all nodes have seen that this node is JOINING e.g. 
we have a convergence) - // ---------------------- - if (member.status == Joining) member copy (status = Up) - else member - - } map { member ⇒ - // ---------------------- - // 3. Move LEAVING => EXITING (once we have a convergence on LEAVING *and* if we have a successful partition handoff) - // ---------------------- - if (member.status == Leaving && hasPartionHandoffCompletedSuccessfully) member copy (status = Exiting) - else member - } - - // ---------------------- - // 5. Store away all stuff needed for the side-effecting processing in 10. - // ---------------------- - - // Check for the need to do side-effecting on successful state change - // Repeat the checking for transitions between JOINING -> UP, LEAVING -> EXITING, EXITING -> REMOVED - // to check for state-changes and to store away removed and exiting members for later notification - // 1. check for state-changes to update - // 2. store away removed and exiting members so we can separate the pure state changes (that can be retried on collision) and the side-effecting message sending - val (removedMembers, newMembers1) = localMembers partition (_.status == Exiting) - - val (upMembers, newMembers2) = newMembers1 partition (_.status == Joining) - - val (exitingMembers, newMembers3) = newMembers2 partition (_.status == Leaving && hasPartionHandoffCompletedSuccessfully) - - val hasChangedState = removedMembers.nonEmpty || upMembers.nonEmpty || exitingMembers.nonEmpty - - // removing REMOVED nodes from the 'seen' table - val newSeen = localSeen -- removedMembers.map(_.address) - - // removing REMOVED nodes from the 'unreachable' set - val newUnreachableMembers = localUnreachableMembers -- removedMembers - - val newOverview = localOverview copy (seen = newSeen, unreachable = newUnreachableMembers) // update gossip overview - val newGossip = localGossip copy (members = newMembers, overview = newOverview) // update gossip - - (newGossip, hasChangedState, upMembers, exitingMembers, removedMembers, Set.empty[Member]) - - } else if (AutoDown) { - // we don't have convergence - so we might have unreachable nodes - - // if 'auto-down' is turned on, then try to auto-down any unreachable nodes - val newUnreachableMembers = localUnreachableMembers.map { member ⇒ - // ---------------------- - // 5. Move UNREACHABLE => DOWN (auto-downing by leader) - // ---------------------- - if (member.status == Down) member // no need to DOWN members already DOWN - else member copy (status = Down) - } - - // Check for the need to do side-effecting on successful state change - val (unreachableButNotDownedMembers, _) = localUnreachableMembers partition (_.status != Down) - - // removing nodes marked as DOWN from the 'seen' table - val newSeen = localSeen -- newUnreachableMembers.collect { case m if m.status == Down ⇒ m.address } - - val newOverview = localOverview copy (seen = newSeen, unreachable = newUnreachableMembers) // update gossip overview - val newGossip = localGossip copy (overview = newOverview) // update gossip - - (newGossip, unreachableButNotDownedMembers.nonEmpty, Set.empty[Member], Set.empty[Member], Set.empty[Member], unreachableButNotDownedMembers) - - } else (localGossip, false, Set.empty[Member], Set.empty[Member], Set.empty[Member], Set.empty[Member]) - - if (hasChangedState) { // we have a change of state - version it and try to update - // ---------------------- - // 6. Updating the vclock version for the changes - // ---------------------- - val versionedGossip = newGossip :+ vclockNode - - // ---------------------- - // 7. 
Updating the 'seen' table - // Unless the leader (this node) is part of the removed members, i.e. the leader have moved himself from EXITING -> REMOVED - // ---------------------- - val seenVersionedGossip = - if (removedMembers.exists(_.address == selfAddress)) versionedGossip - else versionedGossip seen selfAddress - - // ---------------------- - // 8. Update the state with the new gossip - // ---------------------- - latestGossip = seenVersionedGossip - - // ---------------------- - // 9. Run all the side-effecting processing - // ---------------------- - - // log the move of members from joining to up - upMembers foreach { member ⇒ log.info("Cluster Node [{}] - Leader is moving node [{}] from JOINING to UP", selfAddress, member.address) } - - // tell all removed members to remove and shut down themselves - removedMembers foreach { member ⇒ - val address = member.address - log.info("Cluster Node [{}] - Leader is moving node [{}] from EXITING to REMOVED - and removing node from node ring", selfAddress, address) - coreSender ! SendClusterMessage( - to = address, - msg = ClusterLeaderAction.Remove(address)) - } - - // tell all exiting members to exit - exitingMembers foreach { member ⇒ - val address = member.address - log.info("Cluster Node [{}] - Leader is moving node [{}] from LEAVING to EXITING", selfAddress, address) - coreSender ! SendClusterMessage( - to = address, - msg = ClusterLeaderAction.Exit(address)) // FIXME should use ? to await completion of handoff? - } - - // log the auto-downing of the unreachable nodes - unreachableButNotDownedMembers foreach { member ⇒ - log.info("Cluster Node [{}] - Leader is marking unreachable node [{}] as DOWN", selfAddress, member.address) - } - - notifyListeners(localGossip) - } - } - } - - def heartbeat(): Unit = { - removeOverdueJoinInProgress() - - val beatTo = latestGossip.members.toSeq.map(_.address) ++ joinInProgress.keys - - val deadline = Deadline.now + HeartbeatInterval - for (address ← beatTo; if address != selfAddress) - heartbeatSender ! SendHeartbeat(selfHeartbeat, address, deadline) - } - - /** - * Removes overdue joinInProgress from State. - */ - def removeOverdueJoinInProgress(): Unit = { - val overdueJoins = joinInProgress collect { - case (address, deadline) if deadline.isOverdue ⇒ address - } - if (overdueJoins.nonEmpty) { - joinInProgress = joinInProgress -- overdueJoins - } - } - - /** - * Reaps the unreachable members (moves them to the 'unreachable' list in the cluster overview) according to the failure detector's verdict. 
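The join-in-progress bookkeeping above is plain Deadline arithmetic; self-contained (the address and interval are illustrative):

    import akka.util.duration._

    val someAddress = Address("akka", "sys", "host1", 2552) // hypothetical
    var joinInProgress: Map[Address, Deadline] = Map(someAddress -> (Deadline.now + 10.seconds))

    // on each HeartbeatTick: drop entries whose deadline has passed
    val overdue = joinInProgress collect { case (address, deadline) if deadline.isOverdue ⇒ address }
    if (overdue.nonEmpty) joinInProgress = joinInProgress -- overdue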
- */ - def reapUnreachableMembers(): Unit = { - - if (!isSingletonCluster && isAvailable) { - // only scrutinize if we are a non-singleton cluster and available - - val localGossip = latestGossip - val localOverview = localGossip.overview - val localMembers = localGossip.members - val localUnreachableMembers = localGossip.overview.unreachable - - val newlyDetectedUnreachableMembers = localMembers filterNot { member ⇒ - member.address == selfAddress || cluster.failureDetector.isAvailable(member.address) - } - - if (newlyDetectedUnreachableMembers.nonEmpty) { - - val newMembers = localMembers -- newlyDetectedUnreachableMembers - val newUnreachableMembers = localUnreachableMembers ++ newlyDetectedUnreachableMembers - - val newOverview = localOverview copy (unreachable = newUnreachableMembers) - val newGossip = localGossip copy (overview = newOverview, members = newMembers) - - // updating vclock and 'seen' table - val versionedGossip = newGossip :+ vclockNode - val seenVersionedGossip = versionedGossip seen selfAddress - - latestGossip = seenVersionedGossip - - log.error("Cluster Node [{}] - Marking node(s) as UNREACHABLE [{}]", selfAddress, newlyDetectedUnreachableMembers.mkString(", ")) - - notifyListeners(localGossip) - } - } - } - - def seedNodes: IndexedSeq[Address] = cluster.seedNodes - - def selectRandomNode(addresses: IndexedSeq[Address]): Option[Address] = - if (addresses.isEmpty) None - else Some(addresses(ThreadLocalRandom.current nextInt addresses.size)) - - def isSingletonCluster: Boolean = latestGossip.isSingletonCluster - - def isAvailable: Boolean = latestGossip.isAvailable(selfAddress) - - /** - * Gossips latest gossip to a random member in the set of members passed in as argument. - * - * @return the used [[akka.actor.Address] if any - */ - private def gossipToRandomNodeOf(addresses: IndexedSeq[Address]): Option[Address] = { - log.debug("Cluster Node [{}] - Selecting random node to gossip to [{}]", selfAddress, addresses.mkString(", ")) - val peers = addresses filterNot (_ == selfAddress) // filter out myself - val peer = selectRandomNode(peers) - peer foreach gossipTo - peer - } - - /** - * Gossips latest gossip to an address. - */ - def gossipTo(address: Address): Unit = - gossipTo(address, GossipEnvelope(selfAddress, latestGossip, conversation = true)) - - def oneWayGossipTo(address: Address): Unit = - gossipTo(address, GossipEnvelope(selfAddress, latestGossip, conversation = false)) - - def gossipTo(address: Address, gossipMsg: GossipEnvelope): Unit = if (address != selfAddress) { - coreSender ! SendClusterMessage(address, gossipMsg) - } - - def notifyListeners(oldGossip: Gossip): Unit = { - if (PublishStateInterval == Duration.Zero) publishState() - - val oldMembersStatus = oldGossip.members.map(m ⇒ (m.address, m.status)) - val newMembersStatus = latestGossip.members.map(m ⇒ (m.address, m.status)) - if (newMembersStatus != oldMembersStatus) - cluster notifyMembershipChangeListeners latestGossip.members - } - - def publishState(): Unit = { - cluster._latestGossip = latestGossip - cluster._latestStats = stats - } - - def ping(p: Ping): Unit = sender ! Pong(p) -} - -/** - * INTERNAL API. - * - * Supervisor managing the different Cluster daemons. - */ -private[cluster] final class ClusterDaemonSupervisor(cluster: Cluster) extends Actor with ActorLogging { - - val configuredDispatcher = cluster.settings.UseDispatcher - val core = context.actorOf(Props(new ClusterCore(cluster)). 
- withDispatcher(configuredDispatcher), name = "core") - val heartbeat = context.actorOf(Props(new ClusterHeartbeatDaemon(cluster)). - withDispatcher(configuredDispatcher), name = "heartbeat") - - def receive = { - case InternalClusterAction.GetClusterCoreRef ⇒ sender ! core - } - -} /** * Cluster Extension Id and factory for creating Cluster extension. @@ -1387,7 +73,7 @@ object Cluster extends ExtensionId[Cluster] with ExtensionIdProvider { * if (Cluster(system).isLeader) { ... } * }}} */ -class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) extends Extension { clusterNode ⇒ +class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) extends Extension with ClusterEnvironment { /** * Represents the state for this Cluster. Implemented using optimistic lockless concurrency. @@ -1400,7 +86,6 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) private val remote: RemoteActorRefProvider = system.provider.asInstanceOf[RemoteActorRefProvider] - val remoteSettings = new RemoteSettings(system.settings.config, system.name) val settings = new ClusterSettings(system.settings.config, system.name) import settings._ @@ -1415,7 +100,7 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) /** * Read only view of cluster state, updated periodically by - * ClusterCore. Access with `latestGossip`. + * ClusterCoreDaemon. Access with `latestGossip`. */ @volatile private[cluster] var _latestGossip: Gossip = Gossip() @@ -1423,7 +108,7 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) /** * INTERNAL API * Read only view of internal cluster stats, updated periodically by - * ClusterCore. Access with `latestStats`. + * ClusterCoreDaemon. Access with `latestStats`. */ @volatile private[cluster] var _latestStats = ClusterStats() @@ -1435,8 +120,7 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) /** * INTERNAL API */ - private[cluster] val clusterScheduler: Scheduler with Closeable = { - // FIXME consider moving clusterScheduler to ClusterCore actor + private[cluster] val scheduler: Scheduler with Closeable = { if (system.settings.SchedulerTickDuration > SchedulerTickDuration) { log.info("Using a dedicated scheduler for cluster. Default scheduler can be used if configured " + "with 'akka.scheduler.tick-duration' [{} ms] <= 'akka.cluster.scheduler.tick-duration' [{} ms].", @@ -1473,7 +157,7 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) // create supervisor for daemons under path "/system/cluster" private val clusterDaemons: ActorRef = { - system.asInstanceOf[ActorSystemImpl].systemActorOf(Props(new ClusterDaemonSupervisor(this)). + system.asInstanceOf[ActorSystemImpl].systemActorOf(Props(new ClusterDaemon(this)). 
withDispatcher(UseDispatcher), name = "cluster") } @@ -1618,7 +302,7 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) if (!clusterDaemons.isTerminated) system.stop(clusterDaemons) - clusterScheduler.close() + scheduler.close() clusterJmx.unregisterMBean() @@ -1639,4 +323,28 @@ class Cluster(system: ExtendedActorSystem, val failureDetector: FailureDetector) */ private[cluster] def latestStats: ClusterStats = _latestStats + /** + * INTERNAL API + */ + private[cluster] def publishLatestGossip(gossip: Gossip): Unit = _latestGossip = gossip + + /** + * INTERNAL API + */ + private[cluster] def publishLatestStats(stats: ClusterStats): Unit = _latestStats = stats + } + +/** + * Interface for membership change listener. + */ +trait MembershipChangeListener { + def notify(members: SortedSet[Member]): Unit +} + +/** + * Interface for meta data change listener. + */ +trait MetaDataChangeListener { + def notify(meta: Map[String, Array[Byte]]): Unit +} \ No newline at end of file diff --git a/akka-cluster/src/main/scala/akka/cluster/ClusterDaemon.scala b/akka-cluster/src/main/scala/akka/cluster/ClusterDaemon.scala new file mode 100644 index 0000000000..95fab750ca --- /dev/null +++ b/akka-cluster/src/main/scala/akka/cluster/ClusterDaemon.scala @@ -0,0 +1,926 @@ +/** + * Copyright (C) 2009-2012 Typesafe Inc. + */ +package akka.cluster + +import scala.collection.immutable.SortedSet + +import akka.actor.Actor +import akka.actor.ActorLogging +import akka.actor.ActorRef +import akka.actor.Address +import akka.actor.Cancellable +import akka.actor.Props +import akka.actor.RootActorPath +import akka.actor.Status.Failure +import akka.actor.PoisonPill +import akka.actor.Scheduler +import akka.routing.ScatterGatherFirstCompletedRouter +import akka.util.Deadline +import akka.util.Duration +import akka.util.Timeout +import akka.jsr166y.ThreadLocalRandom +import akka.pattern.AskTimeoutException +import akka.pattern.ask +import akka.pattern.pipe +import MemberStatus._ + +/** + * Base trait for all cluster messages. All ClusterMessage's are serializable. + * + * FIXME Protobuf all ClusterMessages + */ +trait ClusterMessage extends Serializable + +/** + * Cluster commands sent by the USER. + */ +object ClusterUserAction { + + /** + * Command to join the cluster. Sent when a node (represented by 'address') + * wants to join another node (the receiver). + */ + case class Join(address: Address) extends ClusterMessage + + /** + * Command to leave the cluster. + */ + case class Leave(address: Address) extends ClusterMessage + + /** + * Command to mark node as temporary down. + */ + case class Down(address: Address) extends ClusterMessage + +} + +/** + * INTERNAL API + */ +private[cluster] object InternalClusterAction { + + /** + * Command to initiate join another node (represented by 'address'). + * Join will be sent to the other node. + */ + case class JoinTo(address: Address) extends ClusterMessage + + /** + * Start message of the process to join one of the seed nodes. + * The node sends `InitJoin` to all seed nodes, which replies + * with `InitJoinAck`. The first reply is used others are discarded. + * The node sends `Join` command to the seed node that replied first. 
+ */ + case object JoinSeedNode extends ClusterMessage + + /** + * @see JoinSeedNode + */ + case object InitJoin extends ClusterMessage + + /** + * @see JoinSeedNode + */ + case class InitJoinAck(address: Address) extends ClusterMessage + + case object GossipTick + + case object HeartbeatTick + + case object ReapUnreachableTick + + case object LeaderActionsTick + + case object PublishStateTick + + case class SendClusterMessage(to: Address, msg: ClusterMessage) + + case class SendGossipTo(address: Address) + + case object GetClusterCoreRef + + case class Ping(timestamp: Long = System.currentTimeMillis) extends ClusterMessage + case class Pong(ping: Ping, timestamp: Long = System.currentTimeMillis) extends ClusterMessage + +} + +/** + * INTERNAL API. + * + * Cluster commands sent by the LEADER. + */ +private[cluster] object ClusterLeaderAction { + + /** + * Command to mark a node to be removed from the cluster immediately. + * Can only be sent by the leader. + */ + case class Exit(address: Address) extends ClusterMessage + + /** + * Command to remove a node from the cluster immediately. + */ + case class Remove(address: Address) extends ClusterMessage +} + +/** + * INTERNAL API + * + * The contextual pieces that ClusterDaemon actors need. + * Makes it easier to test the actors without using the Cluster extension. + */ +private[cluster] trait ClusterEnvironment { + private[cluster] def settings: ClusterSettings + private[cluster] def failureDetector: FailureDetector + private[cluster] def selfAddress: Address + private[cluster] def scheduler: Scheduler + private[cluster] def seedNodes: IndexedSeq[Address] + private[cluster] def notifyMembershipChangeListeners(members: SortedSet[Member]): Unit + private[cluster] def publishLatestGossip(gossip: Gossip): Unit + private[cluster] def publishLatestStats(stats: ClusterStats): Unit + private[cluster] def shutdown(): Unit +} + +/** + * INTERNAL API. + * + * Supervisor managing the different Cluster daemons. + */ +private[cluster] final class ClusterDaemon(environment: ClusterEnvironment) extends Actor with ActorLogging { + + val configuredDispatcher = environment.settings.UseDispatcher + val core = context.actorOf(Props(new ClusterCoreDaemon(environment)). + withDispatcher(configuredDispatcher), name = "core") + val heartbeat = context.actorOf(Props(new ClusterHeartbeatDaemon(environment)). + withDispatcher(configuredDispatcher), name = "heartbeat") + + def receive = { + case InternalClusterAction.GetClusterCoreRef ⇒ sender ! core + } + +} + +/** + * INTERNAL API. + */ +private[cluster] final class ClusterCoreDaemon(environment: ClusterEnvironment) extends Actor with ActorLogging { + // FIXME break up the cluster constructor parameter into something that is easier to test without Cluster + import ClusterLeaderAction._ + import InternalClusterAction._ + import ClusterHeartbeatSender._ + + def selfAddress = environment.selfAddress + def clusterScheduler = environment.scheduler + def failureDetector = environment.failureDetector + val settings = environment.settings + import settings._ + + val vclockNode = VectorClock.Node(selfAddress.toString) + val selfHeartbeat = Heartbeat(selfAddress) + + // note that self is not initially member, + // and the Gossip is not versioned for this 'Node' yet + var latestGossip: Gossip = Gossip() + var joinInProgress: Map[Address, Deadline] = Map.empty + + var stats = ClusterStats() + + val heartbeatSender = context.actorOf(Props(new ClusterHeartbeatSender(environment)). 
+ withDispatcher(UseDispatcher), name = "heartbeatSender") + val coreSender = context.actorOf(Props(new ClusterCoreSender(selfAddress)). + withDispatcher(UseDispatcher), name = "coreSender") + + // start periodic gossip to random nodes in cluster + val gossipTask = + FixedRateTask(clusterScheduler, PeriodicTasksInitialDelay.max(GossipInterval), GossipInterval) { + self ! GossipTick + } + + // start periodic heartbeat to all nodes in cluster + val heartbeatTask = + FixedRateTask(clusterScheduler, PeriodicTasksInitialDelay.max(HeartbeatInterval), HeartbeatInterval) { + self ! HeartbeatTick + } + + // start periodic cluster failure detector reaping (moving nodes condemned by the failure detector to unreachable list) + val failureDetectorReaperTask = + FixedRateTask(clusterScheduler, PeriodicTasksInitialDelay.max(UnreachableNodesReaperInterval), UnreachableNodesReaperInterval) { + self ! ReapUnreachableTick + } + + // start periodic leader action management (only applies for the current leader) + private val leaderActionsTask = + FixedRateTask(clusterScheduler, PeriodicTasksInitialDelay.max(LeaderActionsInterval), LeaderActionsInterval) { + self ! LeaderActionsTick + } + + // start periodic publish of current state + private val publishStateTask: Option[Cancellable] = + if (PublishStateInterval == Duration.Zero) None + else Some(FixedRateTask(clusterScheduler, PeriodicTasksInitialDelay.max(PublishStateInterval), PublishStateInterval) { + self ! PublishStateTick + }) + + override def preStart(): Unit = { + if (AutoJoin) self ! InternalClusterAction.JoinSeedNode + } + + override def postStop(): Unit = { + gossipTask.cancel() + heartbeatTask.cancel() + failureDetectorReaperTask.cancel() + leaderActionsTask.cancel() + publishStateTask foreach { _.cancel() } + } + + def receive = { + case msg: GossipEnvelope ⇒ receiveGossip(msg) + case msg: GossipMergeConflict ⇒ receiveGossipMerge(msg) + case GossipTick ⇒ gossip() + case HeartbeatTick ⇒ heartbeat() + case ReapUnreachableTick ⇒ reapUnreachableMembers() + case LeaderActionsTick ⇒ leaderActions() + case PublishStateTick ⇒ publishState() + case JoinSeedNode ⇒ joinSeedNode() + case InitJoin ⇒ initJoin() + case InitJoinAck(address) ⇒ join(address) + case Failure(e: AskTimeoutException) ⇒ joinSeedNodeTimeout() + case JoinTo(address) ⇒ join(address) + case ClusterUserAction.Join(address) ⇒ joining(address) + case ClusterUserAction.Down(address) ⇒ downing(address) + case ClusterUserAction.Leave(address) ⇒ leaving(address) + case Exit(address) ⇒ exiting(address) + case Remove(address) ⇒ removing(address) + case SendGossipTo(address) ⇒ gossipTo(address) + case p: Ping ⇒ ping(p) + + } + + def joinSeedNode(): Unit = { + val seedRoutees = environment.seedNodes.collect { case a if a != selfAddress ⇒ self.path.toStringWithAddress(a) } + if (seedRoutees.isEmpty) { + join(selfAddress) + } else { + implicit val within = Timeout(SeedNodeTimeout) + val seedRouter = context.actorOf( + Props.empty.withRouter(ScatterGatherFirstCompletedRouter( + routees = seedRoutees, within = within.duration))) + seedRouter ? InitJoin pipeTo self + seedRouter ! PoisonPill + } + } + + def initJoin(): Unit = sender ! InitJoinAck(selfAddress) + + def joinSeedNodeTimeout(): Unit = join(selfAddress) + + /** + * Try to join this cluster node with the node specified by 'address'. + * A 'Join(thisNodeAddress)' command is sent to the node to join. 
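The ClusterEnvironment indirection above exists so these daemons can be exercised without the full Cluster extension; a stub for tests might look roughly like this (a sketch only - it has to live in the akka.cluster package because the members are private[cluster], and the wiring is assumed):

    class StubEnvironment(system: ActorSystem, fd: FailureDetector, address: Address)
      extends ClusterEnvironment {

      private[cluster] def settings = new ClusterSettings(system.settings.config, system.name)
      private[cluster] def failureDetector = fd
      private[cluster] def selfAddress = address
      private[cluster] def scheduler = system.scheduler
      private[cluster] def seedNodes = IndexedSeq.empty[Address]
      private[cluster] def notifyMembershipChangeListeners(members: SortedSet[Member]): Unit = ()
      private[cluster] def publishLatestGossip(gossip: Gossip): Unit = ()
      private[cluster] def publishLatestStats(stats: ClusterStats): Unit = ()
      private[cluster] def shutdown(): Unit = ()
    }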
+ */
+  def join(address: Address): Unit = {
+    val localGossip = latestGossip
+    // wipe our state since a node that joins a cluster must be empty
+    latestGossip = Gossip()
+    joinInProgress = Map(address -> (Deadline.now + JoinTimeout))
+
+    // wipe the failure detector since we are starting fresh and shouldn't care about the past
+    failureDetector.reset()
+
+    notifyListeners(localGossip)
+
+    val command = ClusterUserAction.Join(selfAddress)
+    coreSender ! SendClusterMessage(address, command)
+  }
+
+  /**
+   * State transition to JOINING - new node joining.
+   */
+  def joining(node: Address): Unit = {
+    val localGossip = latestGossip
+    val localMembers = localGossip.members
+    val localUnreachable = localGossip.overview.unreachable
+
+    val alreadyMember = localMembers.exists(_.address == node)
+    val isUnreachable = localGossip.overview.isNonDownUnreachable(node)
+
+    if (!alreadyMember && !isUnreachable) {
+
+      // remove the node from the 'unreachable' set in case it is a DOWN node that is rejoining cluster
+      val (rejoiningMember, newUnreachableMembers) = localUnreachable partition { _.address == node }
+      val newOverview = localGossip.overview copy (unreachable = newUnreachableMembers)
+
+      // remove the node from the failure detector if it is a DOWN node that is rejoining cluster
+      if (rejoiningMember.nonEmpty) failureDetector.remove(node)
+
+      // add joining node as Joining
+      // add self in case someone else joins before self has joined (Set discards duplicates)
+      val newMembers = localMembers :+ Member(node, Joining) :+ Member(selfAddress, Joining)
+      val newGossip = localGossip copy (overview = newOverview, members = newMembers)
+
+      val versionedGossip = newGossip :+ vclockNode
+      val seenVersionedGossip = versionedGossip seen selfAddress
+
+      latestGossip = seenVersionedGossip
+
+      log.debug("Cluster Node [{}] - Node [{}] is JOINING", selfAddress, node)
+      // treat join as initial heartbeat, so that it becomes unavailable if nothing more happens
+      if (node != selfAddress) {
+        failureDetector heartbeat node
+        gossipTo(node)
+      }
+
+      notifyListeners(localGossip)
+    }
+  }
+
+  /**
+   * State transition to LEAVING.
+   */
+  def leaving(address: Address): Unit = {
+    val localGossip = latestGossip
+    if (localGossip.members.exists(_.address == address)) { // only try to update if the node is available (in the member ring)
+      val newMembers = localGossip.members map { member ⇒ if (member.address == address) Member(address, Leaving) else member } // mark node as LEAVING
+      val newGossip = localGossip copy (members = newMembers)
+
+      val versionedGossip = newGossip :+ vclockNode
+      val seenVersionedGossip = versionedGossip seen selfAddress
+
+      latestGossip = seenVersionedGossip
+
+      log.info("Cluster Node [{}] - Marked address [{}] as LEAVING", selfAddress, address)
+      notifyListeners(localGossip)
+    }
+  }
+
+  /**
+   * State transition to EXITING.
+   */
+  def exiting(address: Address): Unit = {
+    log.info("Cluster Node [{}] - Marked node [{}] as EXITING", selfAddress, address)
+    // FIXME implement when we implement hand-off
+  }
+
+  /**
+   * State transition to REMOVED.
+   *
+   * This method is for now only called after the LEADER has sent a Removed message - telling the node
+   * to shut itself down.
+   *
+   * In the future we might change this to allow the USER to send a Removed(address) message telling an
+   * arbitrary node to be moved directly from UP -> REMOVED.
+   */
+  def removing(address: Address): Unit = {
+    log.info("Cluster Node [{}] - Node has been REMOVED by the leader - shutting down...", selfAddress)
+    val localGossip = latestGossip
+    // just cleaning up the gossip state
+    latestGossip = Gossip()
+    // make sure the final (removed) state is always published
+    notifyListeners(localGossip)
+    environment.shutdown()
+  }
+
+  /**
+   * The node to DOWN is removed from the 'members' set and put in the 'unreachable' set (if not already there)
+   * and its status is set to DOWN. The node is also removed from the 'seen' table.
+   *
+   * The node will reside as DOWN in the 'unreachable' set until an explicit JOIN command is sent directly
+   * to this node and it will then go through the normal JOINING procedure.
+   */
+  def downing(address: Address): Unit = {
+    val localGossip = latestGossip
+    val localMembers = localGossip.members
+    val localOverview = localGossip.overview
+    val localSeen = localOverview.seen
+    val localUnreachableMembers = localOverview.unreachable
+
+    // 1. check if the node to DOWN is in the 'members' set
+    val downedMember: Option[Member] = localMembers.collectFirst {
+      case m if m.address == address ⇒ m.copy(status = Down)
+    }
+    val newMembers = downedMember match {
+      case Some(m) ⇒
+        log.info("Cluster Node [{}] - Marking node [{}] as DOWN", selfAddress, m.address)
+        localMembers - m
+      case None ⇒ localMembers
+    }
+
+    // 2. check if the node to DOWN is in the 'unreachable' set
+    val newUnreachableMembers =
+      localUnreachableMembers.map { member ⇒
+        // no need to DOWN members already DOWN
+        if (member.address == address && member.status != Down) {
+          log.info("Cluster Node [{}] - Marking unreachable node [{}] as DOWN", selfAddress, member.address)
+          member copy (status = Down)
+        } else member
+      }
+
+    // 3. add the newly DOWNED members from the 'members' (in step 1.) to the 'newUnreachableMembers' set.
+    val newUnreachablePlusNewlyDownedMembers = newUnreachableMembers ++ downedMember
+
+    // 4. remove nodes marked as DOWN from the 'seen' table
+    val newSeen = localSeen -- newUnreachablePlusNewlyDownedMembers.collect {
+      case m if m.status == Down ⇒ m.address
+    }
+
+    // update gossip overview
+    val newOverview = localOverview copy (seen = newSeen, unreachable = newUnreachablePlusNewlyDownedMembers)
+    val newGossip = localGossip copy (overview = newOverview, members = newMembers) // update gossip
+    val versionedGossip = newGossip :+ vclockNode
+    latestGossip = versionedGossip seen selfAddress
+
+    notifyListeners(localGossip)
+  }
+
+  /**
+   * When conflicting versions of the received and local [[akka.cluster.Gossip]] are detected
+   * the gossip is forwarded to the leader for conflict resolution. Trying to resolve conflicts
+   * simultaneously at several nodes creates new conflicts. Therefore the leader resolves
+   * conflicts to limit divergence. To avoid overload there is also a configurable rate
+   * limit of how many conflicts are handled per second. If the limit is
+   * exceeded, the conflicting gossip messages are dropped and will reappear later.
+   */
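The rate limit described above is simple arithmetic over the gossip interval; a worked example (interval and limit values are illustrative):

    // normalises the per-round conflict count to conflicts per second,
    // exactly as def mergeRate below: (count * 1000.0) / GossipInterval.toMillis
    def mergeRate(count: Long, gossipIntervalMillis: Long): Double =
      (count * 1000.0) / gossipIntervalMillis

    mergeRate(3, 1000) // 3.0 - under an assumed MaxGossipMergeRate of 5.0: processed
    mergeRate(6, 1000) // 6.0 - over the limit: dropped, the conflict will reappear later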
+ */
+ def receiveGossipMerge(merge: GossipMergeConflict): Unit = {
+ stats = stats.incrementMergeConflictCount
+ val rate = mergeRate(stats.mergeConflictCount)
+ if (rate <= MaxGossipMergeRate) {
+ receiveGossip(merge.a.copy(conversation = false))
+ receiveGossip(merge.b.copy(conversation = false))
+
+ // use one-way gossip from the leader to reduce the load on the leader
+ def sendBack(to: Address): Unit = {
+ if (to != selfAddress && !latestGossip.overview.unreachable.exists(_.address == to))
+ oneWayGossipTo(to)
+ }
+
+ sendBack(merge.a.from)
+ sendBack(merge.b.from)
+
+ } else {
+ log.debug("Dropping gossip merge conflict due to rate [{}] / s ", rate)
+ }
+ }
+
+ /**
+ * Receive new gossip.
+ */
+ def receiveGossip(envelope: GossipEnvelope): Unit = {
+ val from = envelope.from
+ val remoteGossip = envelope.gossip
+ val localGossip = latestGossip
+
+ if (remoteGossip.overview.unreachable.exists(_.address == selfAddress)) {
+ // FIXME how should we handle this situation?
+ log.debug("Received gossip with self as unreachable, from [{}]", from)
+
+ } else if (!localGossip.overview.isNonDownUnreachable(from)) {
+
+ // leader handles merge conflicts, or when they have different views of who is the leader
+ val handleMerge = localGossip.leader == Some(selfAddress) || localGossip.leader != remoteGossip.leader
+ val conflict = remoteGossip.version <> localGossip.version
+
+ if (conflict && !handleMerge) {
+ // delegate merge resolution to the leader to reduce the number of simultaneous resolves,
+ // which would otherwise result in new conflicts
+
+ stats = stats.incrementMergeDetectedCount
+ log.debug("Merge conflict [{}] detected [{}] <> [{}]", stats.mergeDetectedCount, selfAddress, from)
+
+ stats = stats.incrementMergeConflictCount
+ val rate = mergeRate(stats.mergeConflictCount)
+ if (rate <= MaxGossipMergeRate) {
+ coreSender ! SendClusterMessage(
+ to = localGossip.leader.get,
+ msg = GossipMergeConflict(GossipEnvelope(selfAddress, localGossip), envelope))
+ } else {
+ log.debug("Skipping gossip merge conflict due to rate [{}] / s ", rate)
+ }
+
+ } else {
+
+ val winningGossip =
+
+ if (conflict) {
+ // conflicting versions, merge, and new version
+ val mergedGossip = remoteGossip merge localGossip
+ mergedGossip :+ vclockNode
+
+ } else if (remoteGossip.version < localGossip.version) {
+ // local gossip is newer
+ localGossip
+
+ } else if (!remoteGossip.members.exists(_.address == selfAddress)) {
+ // FIXME This is very strange. It can happen when many nodes join at the same time.
+ // It's not detected as an ordinary version conflict <> + // If we don't handle this situation there will be IllegalArgumentException when marking this as seen + // merge, and new version + val mergedGossip = remoteGossip merge (localGossip :+ Member(selfAddress, Joining)) + mergedGossip :+ vclockNode + + } else { + // remote gossip is newer + remoteGossip + + } + + val newJoinInProgress = + if (joinInProgress.isEmpty) joinInProgress + else joinInProgress -- + winningGossip.members.map(_.address) -- + winningGossip.overview.unreachable.map(_.address) + + latestGossip = winningGossip seen selfAddress + joinInProgress = newJoinInProgress + + // for all new joining nodes we remove them from the failure detector + (latestGossip.members -- localGossip.members).filter(_.status == Joining).foreach { node ⇒ + failureDetector.remove(node.address) + } + + log.debug("Cluster Node [{}] - Receiving gossip from [{}]", selfAddress, from) + + if (conflict) { + stats = stats.incrementMergeCount + log.debug( + """Couldn't establish a causal relationship between "remote" gossip and "local" gossip - Remote[{}] - Local[{}] - merged them into [{}]""", + remoteGossip, localGossip, winningGossip) + } + + stats = stats.incrementReceivedGossipCount + notifyListeners(localGossip) + + if (envelope.conversation && + (conflict || (winningGossip ne remoteGossip) || (latestGossip ne remoteGossip))) { + // send back gossip to sender when sender had different view, i.e. merge, or sender had + // older or sender had newer + gossipTo(from) + } + } + } + } + + def mergeRate(count: Long): Double = (count * 1000.0) / GossipInterval.toMillis + + /** + * Initiates a new round of gossip. + */ + def gossip(): Unit = { + stats = stats.copy(mergeConflictCount = 0) + + log.debug("Cluster Node [{}] - Initiating new round of gossip", selfAddress) + + if (!isSingletonCluster && isAvailable) { + val localGossip = latestGossip + // important to not accidentally use `map` of the SortedSet, since the original order is not preserved + val localMembers = localGossip.members.toIndexedSeq + val localMembersSize = localMembers.size + val localMemberAddresses = localMembers map { _.address } + + val localUnreachableMembers = localGossip.overview.unreachable.toIndexedSeq + val localUnreachableSize = localUnreachableMembers.size + + // gossip to a random alive member with preference to a member + // with older or newer gossip version + val nodesWithdifferentView = { + val localMemberAddressesSet = localGossip.members map { _.address } + for { + (address, version) ← localGossip.overview.seen + if localMemberAddressesSet contains address + if version != localGossip.version + } yield address + } + val gossipedToAlive = + if (nodesWithdifferentView.nonEmpty && ThreadLocalRandom.current.nextDouble() < GossipDifferentViewProbability) + gossipToRandomNodeOf(nodesWithdifferentView.toIndexedSeq) + else + gossipToRandomNodeOf(localMemberAddresses) + + } + } + + /** + * Runs periodic leader actions, such as auto-downing unreachable nodes, assigning partitions etc. 
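+ *
+ * In summary (the detailed numbered steps are listed in the method body): the
+ * leader is the first member in the sorted member ring, and on each tick it may
+ *  - move JOINING members to UP (requires convergence),
+ *  - move LEAVING members to EXITING (once partition hand-off has completed),
+ *  - move EXITING members to REMOVED (requires convergence), and
+ *  - mark UNREACHABLE members as DOWN (only if auto-down is enabled).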
+ */
+ def leaderActions(): Unit = {
+ val localGossip = latestGossip
+ val localMembers = localGossip.members
+
+ val isLeader = localMembers.nonEmpty && (selfAddress == localMembers.head.address)
+
+ if (isLeader && isAvailable) {
+ // only run the leader actions if we are the LEADER and available
+
+ val localOverview = localGossip.overview
+ val localSeen = localOverview.seen
+ val localUnreachableMembers = localOverview.unreachable
+ val hasPartionHandoffCompletedSuccessfully: Boolean = {
+ // FIXME implement partition handoff and a check if it is completed - for now just returns TRUE - i.e. as if it had completed successfully
+ true
+ }
+
+ // Leader actions are as follows:
+ // 1. Move EXITING => REMOVED -- When all nodes have seen that the node is EXITING (convergence) - remove the nodes from the node ring and seen table
+ // 2. Move JOINING => UP -- When a node joins the cluster
+ // 3. Move LEAVING => EXITING -- When all partition handoff has completed
+ // 4. Move UNREACHABLE => DOWN -- When the node is in the UNREACHABLE set it can be auto-downed by the leader
+ // 5. Store away all stuff needed for the side-effecting processing in 10.
+ // 6. Updating the vclock version for the changes
+ // 7. Updating the 'seen' table
+ // 8. Try to update the state with the new gossip
+ // 9. If failure - retry
+ // 10. If success - run all the side-effecting processing
+
+ val (
+ newGossip: Gossip,
+ hasChangedState: Boolean,
+ upMembers,
+ exitingMembers,
+ removedMembers,
+ unreachableButNotDownedMembers) =
+
+ if (localGossip.convergence) {
+ // we have convergence - so we can't have unreachable nodes
+
+ // transform the node member ring - filterNot/map/map
+ val newMembers =
+ localMembers filterNot { member ⇒
+ // ----------------------
+ // 1. Move EXITING => REMOVED - i.e. remove the nodes from the 'members' set/node ring and seen table
+ // ----------------------
+ member.status == MemberStatus.Exiting
+
+ } map { member ⇒
+ // ----------------------
+ // 2. Move JOINING => UP (once all nodes have seen that this node is JOINING, i.e. we have convergence)
+ // ----------------------
+ if (member.status == Joining) member copy (status = Up)
+ else member
+
+ } map { member ⇒
+ // ----------------------
+ // 3. Move LEAVING => EXITING (once we have convergence on LEAVING *and* a successful partition handoff)
+ // ----------------------
+ if (member.status == Leaving && hasPartionHandoffCompletedSuccessfully) member copy (status = Exiting)
+ else member
+ }
+
+ // ----------------------
+ // 5. Store away all stuff needed for the side-effecting processing in 10.
+ // ----------------------
+
+ // Check for the need to do side-effecting on successful state change
+ // Repeat the checking for transitions between JOINING -> UP, LEAVING -> EXITING, EXITING -> REMOVED
+ // to check for state-changes and to store away removed and exiting members for later notification
+ // 1. check for state-changes to update
+ // 2. 
store away removed and exiting members so we can separate the pure state changes (that can be retried on collision) and the side-effecting message sending
+ val (removedMembers, newMembers1) = localMembers partition (_.status == Exiting)
+
+ val (upMembers, newMembers2) = newMembers1 partition (_.status == Joining)
+
+ val (exitingMembers, newMembers3) = newMembers2 partition (_.status == Leaving && hasPartionHandoffCompletedSuccessfully)
+
+ val hasChangedState = removedMembers.nonEmpty || upMembers.nonEmpty || exitingMembers.nonEmpty
+
+ // removing REMOVED nodes from the 'seen' table
+ val newSeen = localSeen -- removedMembers.map(_.address)
+
+ // removing REMOVED nodes from the 'unreachable' set
+ val newUnreachableMembers = localUnreachableMembers -- removedMembers
+
+ val newOverview = localOverview copy (seen = newSeen, unreachable = newUnreachableMembers) // update gossip overview
+ val newGossip = localGossip copy (members = newMembers, overview = newOverview) // update gossip
+
+ (newGossip, hasChangedState, upMembers, exitingMembers, removedMembers, Set.empty[Member])
+
+ } else if (AutoDown) {
+ // we don't have convergence - so we might have unreachable nodes
+
+ // if 'auto-down' is turned on, then try to auto-down any unreachable nodes
+ val newUnreachableMembers = localUnreachableMembers.map { member ⇒
+ // ----------------------
+ // 4. Move UNREACHABLE => DOWN (auto-downing by leader)
+ // ----------------------
+ if (member.status == Down) member // no need to DOWN members already DOWN
+ else member copy (status = Down)
+ }
+
+ // Check for the need to do side-effecting on successful state change
+ val (unreachableButNotDownedMembers, _) = localUnreachableMembers partition (_.status != Down)
+
+ // removing nodes marked as DOWN from the 'seen' table
+ val newSeen = localSeen -- newUnreachableMembers.collect { case m if m.status == Down ⇒ m.address }
+
+ val newOverview = localOverview copy (seen = newSeen, unreachable = newUnreachableMembers) // update gossip overview
+ val newGossip = localGossip copy (overview = newOverview) // update gossip
+
+ (newGossip, unreachableButNotDownedMembers.nonEmpty, Set.empty[Member], Set.empty[Member], Set.empty[Member], unreachableButNotDownedMembers)
+
+ } else (localGossip, false, Set.empty[Member], Set.empty[Member], Set.empty[Member], Set.empty[Member])
+
+ if (hasChangedState) { // we have a change of state - version it and try to update
+ // ----------------------
+ // 6. Updating the vclock version for the changes
+ // ----------------------
+ val versionedGossip = newGossip :+ vclockNode
+
+ // ----------------------
+ // 7. Updating the 'seen' table
+ // Unless the leader (this node) is part of the removed members, i.e. the leader has moved itself from EXITING -> REMOVED
+ // ----------------------
+ val seenVersionedGossip =
+ if (removedMembers.exists(_.address == selfAddress)) versionedGossip
+ else versionedGossip seen selfAddress
+
+ // ----------------------
+ // 8. Update the state with the new gossip
+ // ----------------------
+ latestGossip = seenVersionedGossip
+
+ // ----------------------
+ // 9. 
Run all the side-effecting processing
+ // ----------------------
+
+ // log the move of members from joining to up
+ upMembers foreach { member ⇒ log.info("Cluster Node [{}] - Leader is moving node [{}] from JOINING to UP", selfAddress, member.address) }
+
+ // tell all removed members to shut themselves down
+ removedMembers foreach { member ⇒
+ val address = member.address
+ log.info("Cluster Node [{}] - Leader is moving node [{}] from EXITING to REMOVED - and removing node from node ring", selfAddress, address)
+ coreSender ! SendClusterMessage(
+ to = address,
+ msg = ClusterLeaderAction.Remove(address))
+ }
+
+ // tell all exiting members to exit
+ exitingMembers foreach { member ⇒
+ val address = member.address
+ log.info("Cluster Node [{}] - Leader is moving node [{}] from LEAVING to EXITING", selfAddress, address)
+ coreSender ! SendClusterMessage(
+ to = address,
+ msg = ClusterLeaderAction.Exit(address)) // FIXME should use ? to await completion of handoff?
+ }
+
+ // log the auto-downing of the unreachable nodes
+ unreachableButNotDownedMembers foreach { member ⇒
+ log.info("Cluster Node [{}] - Leader is marking unreachable node [{}] as DOWN", selfAddress, member.address)
+ }
+
+ notifyListeners(localGossip)
+ }
+ }
+ }
+
+ def heartbeat(): Unit = {
+ removeOverdueJoinInProgress()
+
+ val beatTo = latestGossip.members.toSeq.map(_.address) ++ joinInProgress.keys
+
+ val deadline = Deadline.now + HeartbeatInterval
+ for (address ← beatTo; if address != selfAddress)
+ heartbeatSender ! SendHeartbeat(selfHeartbeat, address, deadline)
+ }
+
+ /**
+ * Removes overdue entries from joinInProgress.
+ */
+ def removeOverdueJoinInProgress(): Unit = {
+ val overdueJoins = joinInProgress collect {
+ case (address, deadline) if deadline.isOverdue ⇒ address
+ }
+ if (overdueJoins.nonEmpty) {
+ joinInProgress = joinInProgress -- overdueJoins
+ }
+ }
+
+ /**
+ * Reaps the unreachable members (moves them to the 'unreachable' list in the cluster overview) according to the failure detector's verdict.
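+ *
+ * Like the other state transitions the move is versioned with the vector clock and
+ * marked as seen by this node; it only runs for an available node in a non-singleton
+ * cluster.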
+ */
+ def reapUnreachableMembers(): Unit = {
+
+ if (!isSingletonCluster && isAvailable) {
+ // only scrutinize if we are a non-singleton cluster and available
+
+ val localGossip = latestGossip
+ val localOverview = localGossip.overview
+ val localMembers = localGossip.members
+ val localUnreachableMembers = localGossip.overview.unreachable
+
+ val newlyDetectedUnreachableMembers = localMembers filterNot { member ⇒
+ member.address == selfAddress || failureDetector.isAvailable(member.address)
+ }
+
+ if (newlyDetectedUnreachableMembers.nonEmpty) {
+
+ val newMembers = localMembers -- newlyDetectedUnreachableMembers
+ val newUnreachableMembers = localUnreachableMembers ++ newlyDetectedUnreachableMembers
+
+ val newOverview = localOverview copy (unreachable = newUnreachableMembers)
+ val newGossip = localGossip copy (overview = newOverview, members = newMembers)
+
+ // updating vclock and 'seen' table
+ val versionedGossip = newGossip :+ vclockNode
+ val seenVersionedGossip = versionedGossip seen selfAddress
+
+ latestGossip = seenVersionedGossip
+
+ log.error("Cluster Node [{}] - Marking node(s) as UNREACHABLE [{}]", selfAddress, newlyDetectedUnreachableMembers.mkString(", "))
+
+ notifyListeners(localGossip)
+ }
+ }
+ }
+
+ def seedNodes: IndexedSeq[Address] = environment.seedNodes
+
+ def selectRandomNode(addresses: IndexedSeq[Address]): Option[Address] =
+ if (addresses.isEmpty) None
+ else Some(addresses(ThreadLocalRandom.current nextInt addresses.size))
+
+ def isSingletonCluster: Boolean = latestGossip.isSingletonCluster
+
+ def isAvailable: Boolean = latestGossip.isAvailable(selfAddress)
+
+ /**
+ * Gossips latest gossip to a random member in the set of members passed in as argument.
+ *
+ * @return the used [[akka.actor.Address]] if any
+ */
+ private def gossipToRandomNodeOf(addresses: IndexedSeq[Address]): Option[Address] = {
+ log.debug("Cluster Node [{}] - Selecting random node to gossip to [{}]", selfAddress, addresses.mkString(", "))
+ // filter out myself
+ val peer = selectRandomNode(addresses filterNot (_ == selfAddress))
+ peer foreach gossipTo
+ peer
+ }
+
+ /**
+ * Gossips latest gossip to an address.
+ */
+ def gossipTo(address: Address): Unit =
+ gossipTo(address, GossipEnvelope(selfAddress, latestGossip, conversation = true))
+
+ def oneWayGossipTo(address: Address): Unit =
+ gossipTo(address, GossipEnvelope(selfAddress, latestGossip, conversation = false))
+
+ def gossipTo(address: Address, gossipMsg: GossipEnvelope): Unit = if (address != selfAddress)
+ coreSender ! SendClusterMessage(address, gossipMsg)
+
+ def notifyListeners(oldGossip: Gossip): Unit = {
+ if (PublishStateInterval == Duration.Zero) publishState()
+
+ val oldMembersStatus = oldGossip.members.map(m ⇒ (m.address, m.status))
+ val newMembersStatus = latestGossip.members.map(m ⇒ (m.address, m.status))
+ if (newMembersStatus != oldMembersStatus)
+ environment notifyMembershipChangeListeners latestGossip.members
+ }
+
+ def publishState(): Unit = {
+ environment.publishLatestGossip(latestGossip)
+ environment.publishLatestStats(stats)
+ }
+
+ def ping(p: Ping): Unit = sender ! Pong(p)
+}
+
+/**
+ * INTERNAL API.
+ */
+private[cluster] final class ClusterCoreSender(selfAddress: Address) extends Actor with ActorLogging {
+ import InternalClusterAction._
+
+ /**
+ * Looks up and returns the remote cluster command connection for the specific address.
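+ *
+ * For a node at, say, `akka://sys@host1:2552` (illustrative address) this resolves
+ * to the actor at `akka://sys@host1:2552/system/cluster/core`.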
+ */
+ private def clusterCoreConnectionFor(address: Address): ActorRef =
+ context.system.actorFor(RootActorPath(address) / "system" / "cluster" / "core")
+
+ def receive = {
+ case SendClusterMessage(to, msg) ⇒
+ log.debug("Cluster Node [{}] - Trying to send [{}] to [{}]", selfAddress, msg.getClass.getSimpleName, to)
+ clusterCoreConnectionFor(to) ! msg
+ }
+}
+
+/**
+ * INTERNAL API
+ */
+private[cluster] case class ClusterStats(
+ receivedGossipCount: Long = 0L,
+ mergeConflictCount: Long = 0L,
+ mergeCount: Long = 0L,
+ mergeDetectedCount: Long = 0L) {
+
+ def incrementReceivedGossipCount(): ClusterStats =
+ copy(receivedGossipCount = receivedGossipCount + 1)
+
+ def incrementMergeConflictCount(): ClusterStats =
+ copy(mergeConflictCount = mergeConflictCount + 1)
+
+ def incrementMergeCount(): ClusterStats =
+ copy(mergeCount = mergeCount + 1)
+
+ def incrementMergeDetectedCount(): ClusterStats =
+ copy(mergeDetectedCount = mergeDetectedCount + 1)
+}
\ No newline at end of file
diff --git a/akka-cluster/src/main/scala/akka/cluster/ClusterHeartbeat.scala b/akka-cluster/src/main/scala/akka/cluster/ClusterHeartbeat.scala
new file mode 100644
index 0000000000..29c4a8f562
--- /dev/null
+++ b/akka-cluster/src/main/scala/akka/cluster/ClusterHeartbeat.scala
@@ -0,0 +1,135 @@
+/**
+ * Copyright (C) 2009-2012 Typesafe Inc.
+ */
+package akka.cluster
+
+import akka.actor.ReceiveTimeout
+import akka.actor.ActorLogging
+import java.security.MessageDigest
+import akka.pattern.CircuitBreaker
+import akka.actor.ActorRef
+import akka.pattern.CircuitBreakerOpenException
+import akka.actor.Address
+import akka.actor.Actor
+import akka.actor.RootActorPath
+import akka.actor.Props
+import akka.util.duration._
+import akka.util.Deadline
+
+/**
+ * Sent at regular intervals for failure detection.
+ */
+case class Heartbeat(from: Address) extends ClusterMessage
+
+/**
+ * INTERNAL API.
+ *
+ * Receives Heartbeat messages and delegates to Cluster.
+ * Instantiated as a single instance for each Cluster - i.e. heartbeats are
+ * delivered to Cluster serially, message after message, but concurrently with
+ * other types of messages.
+ */
+private[cluster] final class ClusterHeartbeatDaemon(environment: ClusterEnvironment) extends Actor with ActorLogging {
+
+ def receive = {
+ case Heartbeat(from) ⇒ environment.failureDetector heartbeat from
+ }
+
+}
+
+/**
+ * INTERNAL API
+ */
+private[cluster] object ClusterHeartbeatSender {
+ /**
+ * Command to [[akka.cluster.ClusterHeartbeatSender]], which will send [[akka.cluster.Heartbeat]]
+ * to the other node.
+ * Local only, no need to serialize.
+ */
+ case class SendHeartbeat(heartbeatMsg: Heartbeat, to: Address, deadline: Deadline)
+}
+
+/*
+ * INTERNAL API
+ *
+ * This actor is responsible for sending the heartbeat messages to
+ * other nodes. Netty blocks when sending to broken connections. This actor
+ * isolates sending to different nodes by using child workers for each target
+ * address and thereby reduces the risk of irregular heartbeats to healthy
+ * nodes due to broken connections to other nodes.
+ */
+private[cluster] final class ClusterHeartbeatSender(environment: ClusterEnvironment) extends Actor with ActorLogging {
+ import ClusterHeartbeatSender._
+
+ /**
+ * Looks up and returns the remote cluster heartbeat connection for the specific address.
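+ *
+ * This is the `ClusterHeartbeatDaemon` (see above) registered at
+ * `/system/cluster/heartbeat` on the target node.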
+ */
+ def clusterHeartbeatConnectionFor(address: Address): ActorRef =
+ context.system.actorFor(RootActorPath(address) / "system" / "cluster" / "heartbeat")
+
+ val digester = MessageDigest.getInstance("MD5")
+
+ /**
+ * Child name is MD5 hash of the address.
+ * FIXME Change to URLEncode when ticket #2123 has been fixed
+ */
+ def encodeChildName(name: String): String = {
+ digester update name.getBytes("UTF-8")
+ digester.digest.map { h ⇒ "%02x".format(0xFF & h) }.mkString
+ }
+
+ def receive = {
+ case msg @ SendHeartbeat(_, to, _) ⇒
+ val workerName = encodeChildName(to.toString)
+ val worker = context.actorFor(workerName) match {
+ case notFound if notFound.isTerminated ⇒
+ context.actorOf(Props(new ClusterHeartbeatSenderWorker(
+ environment.settings.SendCircuitBreakerSettings, clusterHeartbeatConnectionFor(to))), workerName)
+ case child ⇒ child
+ }
+ worker ! msg
+ }
+
+}
+
+/**
+ * Responsible for sending [[akka.cluster.Heartbeat]] to one specific address.
+ *
+ * Netty blocks when sending to broken connections, and this actor uses
+ * a configurable circuit breaker to reduce connect attempts to broken
+ * connections.
+ *
+ * @see ClusterHeartbeatSender
+ */
+private[cluster] final class ClusterHeartbeatSenderWorker(
+ cbSettings: CircuitBreakerSettings, toRef: ActorRef)
+ extends Actor with ActorLogging {
+
+ import ClusterHeartbeatSender._
+
+ val breaker = CircuitBreaker(context.system.scheduler,
+ cbSettings.maxFailures, cbSettings.callTimeout, cbSettings.resetTimeout).
+ onHalfOpen(log.debug("CircuitBreaker Half-Open for: [{}]", toRef)).
+ onOpen(log.debug("CircuitBreaker Open for [{}]", toRef)).
+ onClose(log.debug("CircuitBreaker Closed for [{}]", toRef))
+
+ context.setReceiveTimeout(30 seconds)
+
+ def receive = {
+ case SendHeartbeat(heartbeatMsg, _, deadline) ⇒
+ if (!deadline.isOverdue) {
+ // the CircuitBreaker will measure elapsed time and open if too many long calls
+ try breaker.withSyncCircuitBreaker {
+ log.debug("Cluster Node [{}] - Heartbeat to [{}]", heartbeatMsg.from, toRef)
+ toRef ! heartbeatMsg
+ if (deadline.isOverdue) log.debug("Sending heartbeat to [{}] took longer than expected", toRef)
+ } catch { case e: CircuitBreakerOpenException ⇒ /* skip sending heartbeat to broken connection */ }
+
+ // make sure it will cleanup when not used any more
+ context.setReceiveTimeout(30 seconds)
+ }
+
+ case ReceiveTimeout ⇒ context.stop(self) // cleanup when not used
+
+ }
+}
\ No newline at end of file
diff --git a/akka-cluster/src/main/scala/akka/cluster/Gossip.scala b/akka-cluster/src/main/scala/akka/cluster/Gossip.scala
new file mode 100644
index 0000000000..aa28cc2c03
--- /dev/null
+++ b/akka-cluster/src/main/scala/akka/cluster/Gossip.scala
@@ -0,0 +1,212 @@
+/**
+ * Copyright (C) 2009-2012 Typesafe Inc.
+ */
+
+package akka.cluster
+
+import akka.actor.Address
+import scala.collection.immutable.SortedSet
+import MemberStatus._
+
+object Gossip {
+ val emptyMembers: SortedSet[Member] = SortedSet.empty
+}
+
+/**
+ * Represents the state of the cluster; cluster ring membership, ring convergence, meta data -
+ * all versioned by a vector clock.
+ *
+ * When a node is joining, the `Member`, with status `Joining`, is added to `members`.
+ * If the joining node was downed it is moved from `overview.unreachable` (status `Down`)
+ * to `members` (status `Joining`). It cannot rejoin unless it has first been downed.
+ *
+ * When convergence is reached the leader changes the status of `members` from `Joining`
+ * to `Up`.
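+ *
+ * The resulting member lifecycle, spelled out in the paragraphs below, is
+ * Joining -> Up -> Leaving -> Exiting -> Removed, with Down (and the
+ * `overview.unreachable` set) as a side track for failed or downed nodes.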
+ *
+ * When the failure detector considers a node unavailable it will be moved from
+ * `members` to `overview.unreachable`.
+ *
+ * When a node is downed, either manually or automatically, its status is changed to `Down`.
+ * It is also removed from the `overview.seen` table. The node will reside as `Down` in the
+ * `overview.unreachable` set until joining again and it will then go through the normal
+ * joining procedure.
+ *
+ * When a `Gossip` is received the version (vector clock) is used to determine if the
+ * received `Gossip` is newer or older than the current local `Gossip`. The received `Gossip`
+ * and local `Gossip` are merged in case of conflicting versions, i.e. vector clocks without
+ * a common history. When merged the seen table is cleared.
+ *
+ * When a node is told by the user to leave the cluster the leader will move it to `Leaving`
+ * and then rebalance and repartition the cluster and start hand-off by migrating the actors
+ * from the leaving node to the new partitions. Once this process is complete the leader will
+ * move the node to the `Exiting` state and once a convergence is complete move the node to
+ * `Removed` by removing it from the `members` set and sending a `Removed` command to the
+ * removed node telling it to shut itself down.
+ */
+case class Gossip(
+ overview: GossipOverview = GossipOverview(),
+ members: SortedSet[Member] = Gossip.emptyMembers, // sorted set of members with their status, sorted by address
+ meta: Map[String, Array[Byte]] = Map.empty,
+ version: VectorClock = VectorClock()) // vector clock version
+ extends ClusterMessage // is a serializable cluster message
+ with Versioned[Gossip] {
+
+ // FIXME can be disabled as optimization
+ assertInvariants
+
+ private def assertInvariants: Unit = {
+ val unreachableAndLive = members.intersect(overview.unreachable)
+ if (unreachableAndLive.nonEmpty)
+ throw new IllegalArgumentException("Same nodes in both members and unreachable is not allowed, got [%s]"
+ format unreachableAndLive.mkString(", "))
+
+ val allowedLiveMemberStatuses: Set[MemberStatus] = Set(Joining, Up, Leaving, Exiting)
+ def hasNotAllowedLiveMemberStatus(m: Member) = !allowedLiveMemberStatuses.contains(m.status)
+ if (members exists hasNotAllowedLiveMemberStatus)
+ throw new IllegalArgumentException("Live members must have status [%s], got [%s]"
+ format (allowedLiveMemberStatuses.mkString(", "),
+ (members filter hasNotAllowedLiveMemberStatus).mkString(", ")))
+
+ val seenButNotMember = overview.seen.keySet -- members.map(_.address) -- overview.unreachable.map(_.address)
+ if (seenButNotMember.nonEmpty)
+ throw new IllegalArgumentException("Nodes not part of cluster have marked the Gossip as seen, got [%s]"
+ format seenButNotMember.mkString(", "))
+
+ }
+
+ /**
+ * Increments the version for this 'Node'.
+ */
+ def :+(node: VectorClock.Node): Gossip = copy(version = version :+ node)
+
+ /**
+ * Adds a member to the member node ring.
+ */
+ def :+(member: Member): Gossip = {
+ if (members contains member) this
+ else this copy (members = members :+ member)
+ }
+
+ /**
+ * Marks the gossip as seen by this node (address) by updating the address entry in the 'gossip.overview.seen'
+ * Map with the VectorClock (version) for the new gossip.
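+ *
+ * This is the pattern used throughout `ClusterCoreDaemon` after a state change,
+ * bumping the vector clock and then marking the result as seen, e.g.:
+ * {{{
+ * val versionedGossip = newGossip :+ vclockNode
+ * latestGossip = versionedGossip seen selfAddress
+ * }}}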
+ */
+ def seen(address: Address): Gossip = {
+ if (overview.seen.contains(address) && overview.seen(address) == version) this
+ else this copy (overview = overview copy (seen = overview.seen + (address -> version)))
+ }
+
+ /**
+ * Merges two Gossip instances including membership tables, meta-data tables and the VectorClock histories.
+ */
+ def merge(that: Gossip): Gossip = {
+ import Member.ordering
+
+ // 1. merge vector clocks
+ val mergedVClock = this.version merge that.version
+
+ // 2. merge meta-data
+ val mergedMeta = this.meta ++ that.meta
+
+ // 3. merge unreachable by selecting the single Member with highest MemberStatus out of the Member groups
+ val mergedUnreachable = Member.pickHighestPriority(this.overview.unreachable, that.overview.unreachable)
+
+ // 4. merge members by selecting the single Member with highest MemberStatus out of the Member groups,
+ // and exclude unreachable
+ val mergedMembers = Gossip.emptyMembers :++ Member.pickHighestPriority(this.members, that.members).filterNot(mergedUnreachable.contains)
+
+ // 5. fresh seen table
+ val mergedSeen = Map.empty[Address, VectorClock]
+
+ Gossip(GossipOverview(mergedSeen, mergedUnreachable), mergedMembers, mergedMeta, mergedVClock)
+ }
+
+ /**
+ * Checks if we have a cluster convergence. If there are any unreachable nodes then we can't have
+ * convergence - we are waiting for the user to act (issuing DOWN) or the leader to act (issuing
+ * DOWN through auto-down).
+ *
+ * @return true if convergence has been reached and false if not
+ */
+ def convergence: Boolean = {
+ val unreachable = overview.unreachable
+ val seen = overview.seen
+
+ // First check that:
+ // 1. we don't have any members that are unreachable, or
+ // 2. all unreachable members in the set have status DOWN
+ // Else we can't continue to check for convergence
+ // When that is done we check that all the entries in the 'seen' table have the same vector clock version
+ // and that all members exist in the seen table
+ val hasUnreachable = unreachable.nonEmpty && unreachable.exists { _.status != Down }
+ def allMembersInSeen = members.forall(m ⇒ seen.contains(m.address))
+
+ def seenSame: Boolean =
+ if (seen.isEmpty) false
+ else {
+ val values = seen.values
+ val seenHead = values.head
+ values.forall(_ == seenHead)
+ }
+
+ !hasUnreachable && allMembersInSeen && seenSame
+ }
+
+ def isLeader(address: Address): Boolean =
+ members.nonEmpty && (address == members.head.address)
+
+ def leader: Option[Address] = members.headOption.map(_.address)
+
+ def isSingletonCluster: Boolean = members.size == 1
+
+ /**
+ * Returns true if the node is reachable and not `Down`.
+ */
+ def isAvailable(address: Address): Boolean = !isUnavailable(address)
+
+ def isUnavailable(address: Address): Boolean = {
+ val isUnreachable = overview.unreachable exists { _.address == address }
+ val hasUnavailableMemberStatus = members exists { m ⇒ m.status.isUnavailable && m.address == address }
+ isUnreachable || hasUnavailableMemberStatus
+ }
+
+ def member(address: Address): Member = {
+ members.find(_.address == address).orElse(overview.unreachable.find(_.address == address)).
+ getOrElse(Member(address, Removed))
+ }
+
+ override def toString =
+ "Gossip(" +
+ "overview = " + overview +
+ ", members = [" + members.mkString(", ") +
+ "], meta = [" + meta.mkString(", ") +
+ "], version = " + version +
+ ")"
+}
+
+/**
+ * Represents the overview of the cluster, holds the cluster convergence table and set with unreachable nodes.
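+ *
+ * The 'seen' table maps each address to the latest gossip version that node has
+ * marked as seen; `Gossip.convergence` requires all members to be present in it
+ * with one and the same version.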
+ */
+case class GossipOverview(
+ seen: Map[Address, VectorClock] = Map.empty,
+ unreachable: Set[Member] = Set.empty) {
+
+ def isNonDownUnreachable(address: Address): Boolean =
+ unreachable.exists { m ⇒ m.address == address && m.status != Down }
+
+ override def toString =
+ "GossipOverview(seen = [" + seen.mkString(", ") +
+ "], unreachable = [" + unreachable.mkString(", ") +
+ "])"
+}
+
+/**
+ * Envelope adding a sender address to the gossip.
+ */
+case class GossipEnvelope(from: Address, gossip: Gossip, conversation: Boolean = true) extends ClusterMessage
+
+/**
+ * When conflicting versions of received and local [[akka.cluster.Gossip]] are detected
+ * they are forwarded to the leader for conflict resolution.
+ */
+case class GossipMergeConflict(a: GossipEnvelope, b: GossipEnvelope) extends ClusterMessage
+
diff --git a/akka-cluster/src/main/scala/akka/cluster/Member.scala b/akka-cluster/src/main/scala/akka/cluster/Member.scala
new file mode 100644
index 0000000000..e4fa9f379e
--- /dev/null
+++ b/akka-cluster/src/main/scala/akka/cluster/Member.scala
@@ -0,0 +1,117 @@
+/**
+ * Copyright (C) 2009-2012 Typesafe Inc.
+ */
+
+package akka.cluster
+
+import scala.collection.immutable.SortedSet
+import scala.collection.GenTraversableOnce
+import akka.actor.Address
+import MemberStatus._
+
+/**
+ * Represents the address and the current status of a cluster member node.
+ *
+ * Note: `hashCode` and `equals` are solely based on the underlying `Address`, not its `MemberStatus`.
+ */
+class Member(val address: Address, val status: MemberStatus) extends ClusterMessage {
+ override def hashCode = address.##
+ override def equals(other: Any) = Member.unapply(this) == Member.unapply(other)
+ override def toString = "Member(address = %s, status = %s)" format (address, status)
+ def copy(address: Address = this.address, status: MemberStatus = this.status): Member = new Member(address, status)
+}
+
+/**
+ * Module with factory and ordering methods for Member instances.
+ */
+object Member {
+
+ /**
+ * `Address` ordering type class, sorts addresses by host and port.
+ */
+ implicit val addressOrdering: Ordering[Address] = Ordering.fromLessThan[Address] { (a, b) ⇒
+ if (a.host != b.host) a.host.getOrElse("").compareTo(b.host.getOrElse("")) < 0
+ else if (a.port != b.port) a.port.getOrElse(0) < b.port.getOrElse(0)
+ else false
+ }
+
+ /**
+ * `Member` ordering type class, sorts members by host and port with the exception that
+ * it puts all members that are in MemberStatus.EXITING last.
+ */
+ implicit val ordering: Ordering[Member] = Ordering.fromLessThan[Member] { (a, b) ⇒
+ if (a.status == Exiting && b.status != Exiting) false
+ else if (a.status != Exiting && b.status == Exiting) true
+ else addressOrdering.compare(a.address, b.address) < 0
+ }
+
+ def apply(address: Address, status: MemberStatus): Member = new Member(address, status)
+
+ def unapply(other: Any) = other match {
+ case m: Member ⇒ Some(m.address)
+ case _ ⇒ None
+ }
+
+ def pickHighestPriority(a: Set[Member], b: Set[Member]): Set[Member] = {
+ // group all members by Address => Seq[Member]
+ val groupedByAddress = (a.toSeq ++ b.toSeq).groupBy(_.address)
+ // pick highest MemberStatus
+ (Set.empty[Member] /: groupedByAddress) {
+ case (acc, (_, members)) ⇒ acc + members.reduceLeft(highestPriorityOf)
+ }
+ }
+
+ /**
+ * Picks the Member with the highest "priority" MemberStatus.
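+ *
+ * The priority order implemented below is, from highest to lowest:
+ * Removed, Down, Exiting, Leaving, Joining, Up. Note that Joining wins over Up,
+ * presumably so that a rejoining (previously downed) node's fresh Joining state
+ * wins over a stale Up entry.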
+ */
+ def highestPriorityOf(m1: Member, m2: Member): Member = (m1.status, m2.status) match {
+ case (Removed, _) ⇒ m1
+ case (_, Removed) ⇒ m2
+ case (Down, _) ⇒ m1
+ case (_, Down) ⇒ m2
+ case (Exiting, _) ⇒ m1
+ case (_, Exiting) ⇒ m2
+ case (Leaving, _) ⇒ m1
+ case (_, Leaving) ⇒ m2
+ case (Up, Joining) ⇒ m2
+ case (Joining, Up) ⇒ m1
+ case (Joining, Joining) ⇒ m1
+ case (Up, Up) ⇒ m1
+ }
+
+ // FIXME Workaround for https://issues.scala-lang.org/browse/SI-5986
+ // SortedSet + and ++ operators replace the existing element
+ // Use these :+ and :++ operators for the Gossip members
+ implicit def sortedSetWorkaround(sortedSet: SortedSet[Member]): SortedSetWorkaround = new SortedSetWorkaround(sortedSet)
+ class SortedSetWorkaround(sortedSet: SortedSet[Member]) {
+ implicit def :+(elem: Member): SortedSet[Member] = {
+ if (sortedSet.contains(elem)) sortedSet
+ else sortedSet + elem
+ }
+
+ implicit def :++(elems: GenTraversableOnce[Member]): SortedSet[Member] =
+ sortedSet ++ (elems.toSet diff sortedSet)
+ }
+}
+
+/**
+ * Defines the current status of a cluster member node.
+ *
+ * Can be one of: Joining, Up, Leaving, Exiting, Down and Removed.
+ */
+sealed trait MemberStatus extends ClusterMessage {
+
+ /**
+ * Using the same notion for 'unavailable' as 'non-convergence': DOWN
+ */
+ def isUnavailable: Boolean = this == Down
+}
+
+object MemberStatus {
+ case object Joining extends MemberStatus
+ case object Up extends MemberStatus
+ case object Leaving extends MemberStatus
+ case object Exiting extends MemberStatus
+ case object Down extends MemberStatus
+ case object Removed extends MemberStatus
+}
\ No newline at end of file
diff --git a/akka-cluster/src/main/scala/akka/cluster/RemoteConnectionManager.scala b/akka-cluster/src/main/scala/akka/cluster/RemoteConnectionManager.scala
deleted file mode 100644
index 63020367a5..0000000000
--- a/akka-cluster/src/main/scala/akka/cluster/RemoteConnectionManager.scala
+++ /dev/null
@@ -1,150 +0,0 @@
-/**
- * Copyright (C) 2009-2012 Typesafe Inc.
- */
-
-package akka.cluster
-
-import akka.actor._
-import akka.remote._
-import akka.routing._
-import akka.event.Logging
-
-import scala.collection.immutable.Map
-import scala.annotation.tailrec
-
-import java.util.concurrent.atomic.AtomicReference
-
-/**
- * Remote connection manager, manages remote connections, e.g. RemoteActorRef's.
- */
-class RemoteConnectionManager(
- system: ActorSystemImpl,
- remote: RemoteActorRefProvider,
- failureDetector: AccrualFailureDetector,
- initialConnections: Map[Address, ActorRef] = Map.empty[Address, ActorRef])
- extends ConnectionManager {
-
- val log = Logging(system, "RemoteConnectionManager")
-
- // FIXME is this VersionedIterable really needed? It is not used I think. Complicates API. See 'def connections' etc.
- case class State(version: Long, connections: Map[Address, ActorRef])
- extends VersionedIterable[ActorRef] {
- def iterable: Iterable[ActorRef] = connections.values
- }
-
- private val state: AtomicReference[State] = new AtomicReference[State](newState())
-
- /**
- * This method is using the FailureDetector to filter out connections that are considered not available.
- */ - private def filterAvailableConnections(current: State): State = { - val availableConnections = current.connections filter { entry ⇒ failureDetector.isAvailable(entry._1) } - current copy (version = current.version, connections = availableConnections) - } - - private def newState() = State(Long.MinValue, initialConnections) - - def version: Long = state.get.version - - // FIXME should not return State value but a Seq with connections - def connections = filterAvailableConnections(state.get) - - def size: Int = connections.connections.size - - def connectionFor(address: Address): Option[ActorRef] = connections.connections.get(address) - - def isEmpty: Boolean = connections.connections.isEmpty - - def shutdown() { - state.get.iterable foreach (system.stop(_)) // shut down all remote connections - } - - @tailrec - final def failOver(from: Address, to: Address) { - log.debug("Failing over connection from [{}] to [{}]", from, to) - - val oldState = state.get - var changed = false - - val newMap = oldState.connections map { - case (`from`, actorRef) ⇒ - changed = true - //actorRef.stop() - (to, newConnection(to, actorRef.path)) - case other ⇒ other - } - - if (changed) { - //there was a state change, so we are now going to update the state. - val newState = oldState copy (version = oldState.version + 1, connections = newMap) - - //if we are not able to update, the state, we are going to try again. - if (!state.compareAndSet(oldState, newState)) { - failOver(from, to) // recur - } - } - } - - @tailrec - final def remove(faultyConnection: ActorRef) { - - val oldState = state.get() - var changed = false - - var faultyAddress: Address = null - var newConnections = Map.empty[Address, ActorRef] - - oldState.connections.keys foreach { address ⇒ - val actorRef: ActorRef = oldState.connections.get(address).get - if (actorRef ne faultyConnection) { - newConnections = newConnections + ((address, actorRef)) - } else { - faultyAddress = address - changed = true - } - } - - if (changed) { - //one or more occurrances of the actorRef were removed, so we need to update the state. - val newState = oldState copy (version = oldState.version + 1, connections = newConnections) - - //if we are not able to update the state, we just try again. - if (!state.compareAndSet(oldState, newState)) { - remove(faultyConnection) // recur - } else { - log.debug("Removing connection [{}]", faultyAddress) - } - } - } - - @tailrec - final def putIfAbsent(address: Address, newConnectionFactory: () ⇒ ActorRef): ActorRef = { - - val oldState = state.get() - val oldConnections = oldState.connections - - oldConnections.get(address) match { - case Some(connection) ⇒ connection // we already had the connection, return it - case None ⇒ // we need to create it - val newConnection = newConnectionFactory() - val newConnections = oldConnections + (address -> newConnection) - - //one or more occurrances of the actorRef were removed, so we need to update the state. - val newState = oldState copy (version = oldState.version + 1, connections = newConnections) - - //if we are not able to update the state, we just try again. 
- if (!state.compareAndSet(oldState, newState)) { - // we failed, need compensating action - system.stop(newConnection) // stop the new connection actor and try again - putIfAbsent(address, newConnectionFactory) // recur - } else { - // we succeeded - log.debug("Adding connection [{}]", address) - newConnection // return new connection actor - } - } - } - - private[cluster] def newConnection(remoteAddress: Address, actorPath: ActorPath) = - new RemoteActorRef(remote, remote.transport, actorPath, Nobody) -} From 6aa36ea70440dda1243b98772e9dbe5f20e03cc1 Mon Sep 17 00:00:00 2001 From: Viktor Klang Date: Thu, 5 Jul 2012 15:22:55 +0200 Subject: [PATCH 35/39] Rewriting the CircuitBreakerMTSpec to be less hysterical --- .../akka/pattern/CircuitBreakerMTSpec.scala | 108 ++++++------------ .../scala/akka/pattern/CircuitBreaker.scala | 15 ++- 2 files changed, 43 insertions(+), 80 deletions(-) diff --git a/akka-actor-tests/src/test/scala/akka/pattern/CircuitBreakerMTSpec.scala b/akka-actor-tests/src/test/scala/akka/pattern/CircuitBreakerMTSpec.scala index 35f55d703d..c43531e493 100644 --- a/akka-actor-tests/src/test/scala/akka/pattern/CircuitBreakerMTSpec.scala +++ b/akka-actor-tests/src/test/scala/akka/pattern/CircuitBreakerMTSpec.scala @@ -5,117 +5,81 @@ package akka.pattern import akka.testkit._ import akka.util.duration._ -import org.scalatest.BeforeAndAfter import akka.dispatch.{ Promise, Await, Future } +import akka.actor.ActorSystem -class CircuitBreakerMTSpec extends AkkaSpec with BeforeAndAfter { - - @volatile - var breakers: BreakerState = null - - class BreakerState { - - val halfOpenLatch = new TestLatch(1) - - val breaker = new CircuitBreaker(system.scheduler, 5, 100.millis.dilated, 500.millis.dilated) - .onHalfOpen(halfOpenLatch.countDown()) - - } - - before { - breakers = new BreakerState() - } - - def unreliableCall(param: String) = { - param match { - case "fail" ⇒ throw new RuntimeException("FAIL") - case _ ⇒ param - } - } - - def openBreaker: Unit = { - for (i ← 1 to 5) - Await.result(breakers.breaker.withCircuitBreaker(Future(unreliableCall("fail"))) recoverWith { - case _ ⇒ Promise.successful("OK") - }, 1.second.dilated) - } +class CircuitBreakerMTSpec extends AkkaSpec { "A circuit breaker being called by many threads" must { + val breaker = new CircuitBreaker(system.scheduler, 5, 100.millis.dilated, 500.millis.dilated) + + def openBreaker(): Unit = + Await.ready(Future.sequence((1 to 5).map(_ ⇒ breaker.withCircuitBreaker(Future(throw new RuntimeException("FAIL"))).failed)), 1.second.dilated) + "allow many calls while in closed state with no errors" in { - val futures = for (i ← 1 to 100) yield breakers.breaker.withCircuitBreaker(Future { Thread.sleep(10); unreliableCall("succeed") }) + val futures = for (i ← 1 to 100) yield breaker.withCircuitBreaker(Future { Thread.sleep(10); "succeed" }) - val futureList = Future.sequence(futures) - - val result = Await.result(futureList, 1.second.dilated) + val result = Await.result(Future.sequence(futures), 5.second.dilated) result.size must be(100) - result.distinct.size must be(1) - result.distinct must contain("succeed") + result.toSet must be === Set("succeed") } "transition to open state upon reaching failure limit and fail-fast" in { + openBreaker() - openBreaker - - val futures = for (i ← 1 to 100) yield breakers.breaker.withCircuitBreaker(Future { - Thread.sleep(10); unreliableCall("success") + val futures = for (i ← 1 to 100) yield breaker.withCircuitBreaker(Future { + Thread.sleep(10); "success" }) recoverWith { case _: 
CircuitBreakerOpenException ⇒ Promise.successful("CBO") } - val futureList = Future.sequence(futures) - - val result = Await.result(futureList, 1.second.dilated) + val result = Await.result(Future.sequence(futures), 5.second.dilated) result.size must be(100) - result.distinct.size must be(1) - result.distinct must contain("CBO") + result.toSet must be === Set("CBO") } "allow a single call through in half-open state" in { - openBreaker + val halfOpenLatch = new TestLatch(1) + breaker.onHalfOpen(halfOpenLatch.countDown()) - Await.ready(breakers.halfOpenLatch, 2.seconds.dilated) + openBreaker() - val futures = for (i ← 1 to 100) yield breakers.breaker.withCircuitBreaker(Future { - Thread.sleep(10); unreliableCall("succeed") + Await.ready(halfOpenLatch, 2.seconds.dilated) + + val futures = for (i ← 1 to 100) yield breaker.withCircuitBreaker(Future { + Thread.sleep(10); "succeed" }) recoverWith { case _: CircuitBreakerOpenException ⇒ Promise.successful("CBO") } - val futureList = Future.sequence(futures) - - val result = Await.result(futureList, 1.second.dilated) + val result = Await.result(Future.sequence(futures), 5.second.dilated) result.size must be(100) - result.distinct.size must be(2) - result.distinct must contain("succeed") - result.distinct must contain("CBO") + result.toSet must be === Set("succeed", "CBO") } "recover and reset the breaker after the reset timeout" in { - openBreaker + val halfOpenLatch = new TestLatch(1) + breaker.onHalfOpen(halfOpenLatch.countDown()) + openBreaker() + Await.ready(halfOpenLatch, 5.seconds.dilated) + Await.ready(breaker.withCircuitBreaker(Future("succeed")), 1.second.dilated) - Await.ready(breakers.halfOpenLatch, 2.seconds.dilated) - - Await.ready(breakers.breaker.withCircuitBreaker(Future(unreliableCall("succeed"))), 1.second.dilated) - - val futures = for (i ← 1 to 100) yield breakers.breaker.withCircuitBreaker(Future { - Thread.sleep(10); unreliableCall("succeed") - }) recoverWith { - case _: CircuitBreakerOpenException ⇒ Promise.successful("CBO") + val futures = (1 to 100) map { + i ⇒ + breaker.withCircuitBreaker(Future { Thread.sleep(10); "succeed" }) recoverWith { + case _: CircuitBreakerOpenException ⇒ Promise.successful("CBO") + } } - val futureList = Future.sequence(futures) - - val result = Await.result(futureList, 1.second.dilated) + val result = Await.result(Future.sequence(futures), 5.second.dilated) result.size must be(100) - result.distinct.size must be(1) - result.distinct must contain("succeed") + result.toSet must be === Set("succeed") } } - } \ No newline at end of file diff --git a/akka-actor/src/main/scala/akka/pattern/CircuitBreaker.scala b/akka-actor/src/main/scala/akka/pattern/CircuitBreaker.scala index ac8fd1c5ed..f8daae1cbc 100644 --- a/akka-actor/src/main/scala/akka/pattern/CircuitBreaker.scala +++ b/akka-actor/src/main/scala/akka/pattern/CircuitBreaker.scala @@ -135,14 +135,13 @@ class CircuitBreaker(scheduler: Scheduler, maxFailures: Int, callTimeout: Durati * @return The result of the call */ def withSyncCircuitBreaker[T](body: ⇒ T): T = { - Await.result(withCircuitBreaker( - { - try - Promise.successful(body)(CircuitBreaker.syncExecutionContext) - catch { - case NonFatal(t) ⇒ Promise.failed(t)(CircuitBreaker.syncExecutionContext) - } - }), callTimeout) + Await.result(withCircuitBreaker({ + try + Promise.successful(body)(CircuitBreaker.syncExecutionContext) + catch { + case NonFatal(t) ⇒ Promise.failed(t)(CircuitBreaker.syncExecutionContext) + } + }), callTimeout) } /** From f88ca9c9fcf898229172ac92a04b3f7c9ea41bde Mon 
Sep 17 00:00:00 2001 From: Patrik Nordwall Date: Thu, 5 Jul 2012 16:02:14 +0200 Subject: [PATCH 36/39] Remove workaround for the strange gossip merge, see #2303 * Can't reproduce with the actor based Cluster, it was easy to reproduce before. * If it still is a problem it will be detected by the runtime assert. It will show up as: IllegalArgumentException: Nodes not part of cluster have marked the Gossip as seen --- .../src/main/scala/akka/cluster/ClusterDaemon.scala | 9 --------- 1 file changed, 9 deletions(-) diff --git a/akka-cluster/src/main/scala/akka/cluster/ClusterDaemon.scala b/akka-cluster/src/main/scala/akka/cluster/ClusterDaemon.scala index 95fab750ca..bf3e5a6b60 100644 --- a/akka-cluster/src/main/scala/akka/cluster/ClusterDaemon.scala +++ b/akka-cluster/src/main/scala/akka/cluster/ClusterDaemon.scala @@ -165,7 +165,6 @@ private[cluster] final class ClusterDaemon(environment: ClusterEnvironment) exte * INTERNAL API. */ private[cluster] final class ClusterCoreDaemon(environment: ClusterEnvironment) extends Actor with ActorLogging { - // FIXME break up the cluster constructor parameter into something that is easier to test without Cluster import ClusterLeaderAction._ import InternalClusterAction._ import ClusterHeartbeatSender._ @@ -510,14 +509,6 @@ private[cluster] final class ClusterCoreDaemon(environment: ClusterEnvironment) // local gossip is newer localGossip - } else if (!remoteGossip.members.exists(_.address == selfAddress)) { - // FIXME This is a very strange. It can happen when many nodes join at the same time. - // It's not detected as an ordinary version conflict <> - // If we don't handle this situation there will be IllegalArgumentException when marking this as seen - // merge, and new version - val mergedGossip = remoteGossip merge (localGossip :+ Member(selfAddress, Joining)) - mergedGossip :+ vclockNode - } else { // remote gossip is newer remoteGossip From 12254cb42424e5588fa66cf6a2eb1f238f929676 Mon Sep 17 00:00:00 2001 From: Viktor Klang Date: Thu, 5 Jul 2012 18:03:21 +0200 Subject: [PATCH 37/39] Fixes #2321 - Upgrade Netty to 3.5.2.Final --- project/AkkaBuild.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/project/AkkaBuild.scala b/project/AkkaBuild.scala index 8fea416bc8..ea36644120 100644 --- a/project/AkkaBuild.scala +++ b/project/AkkaBuild.scala @@ -524,7 +524,7 @@ object Dependency { object V { val Camel = "2.8.0" val Logback = "1.0.4" - val Netty = "3.5.1.Final" + val Netty = "3.5.2.Final" val OSGi = "4.2.0" val Protobuf = "2.4.1" val ScalaStm = "0.5" From 948287f293eeb527e5b72ed9124a6bb9de0734cf Mon Sep 17 00:00:00 2001 From: Viktor Klang Date: Fri, 6 Jul 2012 11:17:48 +0200 Subject: [PATCH 38/39] Fixes #2302 - removing potential race in ActorSystemSpec --- .../test/scala/akka/actor/ActorSystemSpec.scala | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/akka-actor-tests/src/test/scala/akka/actor/ActorSystemSpec.scala b/akka-actor-tests/src/test/scala/akka/actor/ActorSystemSpec.scala index 1a2d64bb41..e659fbb455 100644 --- a/akka-actor-tests/src/test/scala/akka/actor/ActorSystemSpec.scala +++ b/akka-actor-tests/src/test/scala/akka/actor/ActorSystemSpec.scala @@ -165,7 +165,7 @@ class ActorSystemSpec extends AkkaSpec("""akka.extensions = ["akka.actor.TestExt system.scheduler.scheduleOnce(200 millis) { system.shutdown() } var failing = false var created = Vector.empty[ActorRef] - while (!system.isTerminated && system.uptime < 5) { + while (!system.isTerminated) { try { val t = 
system.actorOf(Props[ActorSystemSpec.Terminater]) failing must not be true // because once failing => always failing (it’s due to shutdown) @@ -173,12 +173,14 @@ class ActorSystemSpec extends AkkaSpec("""akka.extensions = ["akka.actor.TestExt } catch { case _: IllegalStateException ⇒ failing = true } + + if (!failing && system.uptime >= 5) { + println(created.last) + println(system.asInstanceOf[ExtendedActorSystem].printTree) + fail("System didn't terminate within 5 seconds") + } } - if (system.uptime >= 5) { - println(created.last) - println(system.asInstanceOf[ExtendedActorSystem].printTree) - system.uptime must be < 5L - } + created filter (ref ⇒ !ref.isTerminated && !ref.asInstanceOf[ActorRefWithCell].underlying.isInstanceOf[UnstartedCell]) must be(Seq()) } From fcf0d9ad71d17d2b7cb7bff4440b4de39c520cf0 Mon Sep 17 00:00:00 2001 From: Patrik Nordwall Date: Fri, 6 Jul 2012 15:21:11 +0200 Subject: [PATCH 39/39] Remove superfluous double setReceiveTimeout --- .../src/main/scala/akka/cluster/ClusterHeartbeat.scala | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/akka-cluster/src/main/scala/akka/cluster/ClusterHeartbeat.scala b/akka-cluster/src/main/scala/akka/cluster/ClusterHeartbeat.scala index 29c4a8f562..5780f3eda0 100644 --- a/akka-cluster/src/main/scala/akka/cluster/ClusterHeartbeat.scala +++ b/akka-cluster/src/main/scala/akka/cluster/ClusterHeartbeat.scala @@ -113,6 +113,7 @@ private[cluster] final class ClusterHeartbeatSenderWorker( onOpen(log.debug("CircuitBreaker Open for [{}]", toRef)). onClose(log.debug("CircuitBreaker Closed for [{}]", toRef)) + // make sure it will cleanup when not used any more context.setReceiveTimeout(30 seconds) def receive = { @@ -124,9 +125,6 @@ private[cluster] final class ClusterHeartbeatSenderWorker( toRef ! heartbeatMsg if (deadline.isOverdue) log.debug("Sending heartbeat to [{}] took longer than expected", toRef) } catch { case e: CircuitBreakerOpenException ⇒ /* skip sending heartbeat to broken connection */ } - - // make sure it will cleanup when not used any more - context.setReceiveTimeout(30 seconds) } case ReceiveTimeout ⇒ context.stop(self) // cleanup when not used