Handle CoordinatedShutdown exiting-completed when not joined, #26832

* Fixes "assertion failed: Nodes not part of cluster have marked the Gossip as seen"
* The node was trying to mark the Gossip as seen before it had joined, which may
  happen if CoordinatedShutdown runs before the node has joined
This commit is contained in:
Patrik Nordwall 2019-04-30 14:47:41 +02:00
parent 7b59c0c785
commit a77db34f8f

View file

@ -826,33 +826,36 @@ private[cluster] class ClusterCoreDaemon(publisher: ActorRef, joinConfigCompatCh
logInfo("Exiting completed") logInfo("Exiting completed")
// ExitingCompleted sent via CoordinatedShutdown to continue the leaving process. // ExitingCompleted sent via CoordinatedShutdown to continue the leaving process.
exitingTasksInProgress = false exitingTasksInProgress = false
// mark as seen // status Removed also before joining
membershipState = membershipState.seen() if (membershipState.selfMember.status != MemberStatus.Removed) {
assertLatestGossip() // mark as seen
publishMembershipState() membershipState = membershipState.seen()
assertLatestGossip()
publishMembershipState()
// Let others know (best effort) before shutdown. Otherwise they will not see // Let others know (best effort) before shutdown. Otherwise they will not see
// convergence of the Exiting state until they have detected this node as // convergence of the Exiting state until they have detected this node as
// unreachable and the required downing has finished. They will still need to detect // unreachable and the required downing has finished. They will still need to detect
// unreachable, but Exiting unreachable will be removed without downing, i.e. // unreachable, but Exiting unreachable will be removed without downing, i.e.
// normally the leaving of a leader will be graceful without the need // normally the leaving of a leader will be graceful without the need
// for downing. However, if those final gossip messages never arrive it is // for downing. However, if those final gossip messages never arrive it is
// alright to require the downing, because that is probably caused by a // alright to require the downing, because that is probably caused by a
// network failure anyway. // network failure anyway.
gossipRandomN(NumberOfGossipsBeforeShutdownWhenLeaderExits) gossipRandomN(NumberOfGossipsBeforeShutdownWhenLeaderExits)
// send ExitingConfirmed to two potential leaders // send ExitingConfirmed to two potential leaders
val membersExceptSelf = latestGossip.members.filter(_.uniqueAddress != selfUniqueAddress) val membersExceptSelf = latestGossip.members.filter(_.uniqueAddress != selfUniqueAddress)
membershipState.leaderOf(membersExceptSelf) match { membershipState.leaderOf(membersExceptSelf) match {
case Some(node1) => case Some(node1) =>
clusterCore(node1.address) ! ExitingConfirmed(selfUniqueAddress) clusterCore(node1.address) ! ExitingConfirmed(selfUniqueAddress)
membershipState.leaderOf(membersExceptSelf.filterNot(_.uniqueAddress == node1)) match { membershipState.leaderOf(membersExceptSelf.filterNot(_.uniqueAddress == node1)) match {
case Some(node2) => case Some(node2) =>
clusterCore(node2.address) ! ExitingConfirmed(selfUniqueAddress) clusterCore(node2.address) ! ExitingConfirmed(selfUniqueAddress)
case None => // no more potential leader case None => // no more potential leader
} }
case None => // no leader case None => // no leader
}
} }
shutdown() shutdown()