Nodes not part of cluster have marked the Gossip as seen, see #3031

* The problem may occur when a member with the same hostname:port joins again after being downed.
* Reproduced with StressSpec exerciseJoinRemove, using a fixed port for a node that joins and shuts down several times.
* The real solution for this will be covered by ticket #2788, which adds a uid to the member identifier, but as a first step we need to support this scenario with the current design.
* Use a unique node identifier for the vector clock to avoid mix-up of old and new member instances.
* Support the transition from Down to Joining in Gossip merge.
* Don't gossip to unknown or unreachable members.
2013-02-12 21:45:41 +01:00 · 2013-02-12 21:45:41 +01:00 · b349ad8d87
commit b349ad8d87
parent cab78e5174
4 changed files with 28 additions and 8 deletions
--- a/akka-cluster/src/main/scala/akka/cluster/ClusterDaemon.scala
+++ b/akka-cluster/src/main/scala/akka/cluster/ClusterDaemon.scala
@ -9,6 +9,7 @@ import scala.collection.immutable
 import scala.concurrent.duration._
 import scala.concurrent.forkjoin.ThreadLocalRandom
 import scala.util.control.NonFatal
+import java.util.UUID
 import akka.actor.{ Actor, ActorLogging, ActorRef, Address, Cancellable, Props, PoisonPill, ReceiveTimeout, RootActorPath, Scheduler }
 import akka.actor.OneForOneStrategy
 import akka.actor.Status.Failure
@ -219,7 +220,8 @@ private[cluster] final class ClusterCoreDaemon(publisher: ActorRef) extends Acto
  import cluster.{ selfAddress, scheduler, failureDetector }
  import cluster.settings._

-  val vclockNode = VectorClock.Node(selfAddress.toString)
+  // FIXME the UUID should not be needed when Address contains uid, ticket #2788
+  val vclockNode = VectorClock.Node(selfAddress.toString + "-" + UUID.randomUUID())

  // note that self is not initially member,
  // and the Gossip is not versioned for this 'Node' yet
@ -507,10 +509,10 @@ private[cluster] final class ClusterCoreDaemon(publisher: ActorRef) extends Acto
    val localGossip = latestGossip

    if (remoteGossip.overview.unreachable.exists(_.address == selfAddress)) {
-      // FIXME how should we handle this situation?
-      log.debug("Received gossip with self as unreachable, from [{}]", from)
-
-    } else if (!localGossip.overview.isNonDownUnreachable(from)) {
+      log.debug("Ignoring received gossip with self [{}] as unreachable, from [{}]", selfAddress, from)
+    } else if (localGossip.overview.isNonDownUnreachable(from)) {
+      log.debug("Ignoring received gossip from unreachable [{}] ", from)
+    } else {

      // leader handles merge conflicts, or when they have different views of who is leader
      val handleMerge = localGossip.leader == Some(selfAddress) || localGossip.leader != remoteGossip.leader
@ -830,7 +832,8 @@ private[cluster] final class ClusterCoreDaemon(publisher: ActorRef) extends Acto
    gossipTo(address, GossipEnvelope(selfAddress, latestGossip, conversation = false))

  def gossipTo(address: Address, gossipMsg: GossipEnvelope): Unit =
-    if (address != selfAddress) clusterCore(address) ! gossipMsg
+    if (address != selfAddress && gossipMsg.gossip.members.exists(_.address == address))
+      clusterCore(address) ! gossipMsg

  def publish(newGossip: Gossip): Unit = {
    publisher ! PublishChanges(newGossip)