Cluster sharding health check (#29638)

* and update akka-docs/src/main/paradox/typed/cluster-sharding.md
This commit is contained in:
Christopher Batey 2020-09-25 11:55:29 +01:00 committed by GitHub
parent 90dac8dcbb
commit 91398cdcdd
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 262 additions and 0 deletions

View file

@ -245,9 +245,27 @@ akka.cluster.sharding {
# Mostly for the Akka test suite, if off the invalid transition is logged as a warning instead of throwing and
# crashing the shard.
fail-on-invalid-entity-state-transition = off
# Healthcheck that can be used with Akka management health checks: https://doc.akka.io/docs/akka-management/current/healthchecks.html
healthcheck {
# Names of the sharding instances that must have registered with the coordinator for the health check to pass.
# Once initial registration has taken place the health check always returns true, so that a coordinator
# move does not make the health check of every node fail.
# By default no sharding instances are monitored.
names = []
# Timeout for the local shard region to respond. This should be lower than your monitoring system's
# timeout for health checks
timeout = 5s
}
}
# //#sharding-ext-config
# Enable health check by default for when Akka management is on the classpath
akka.management.health-checks.readiness-checks {
sharding = "akka.cluster.sharding.ClusterShardingHealthCheck"
}
akka.cluster {
configuration-compatibility-check {
checkers {

View file

@ -0,0 +1,102 @@
/*
* Copyright (C) 2020 Lightbend Inc. <https://www.lightbend.com>
*/
package akka.cluster.sharding
import akka.actor.ActorRef
import akka.actor.ActorSystem
import akka.annotation.ApiMayChange
import akka.event.Logging
import akka.pattern.ask
import akka.util.Timeout
import akka.annotation.InternalApi
import akka.pattern.AskTimeoutException
import akka.util.ccompat.JavaConverters._
import akka.util.JavaDurationConverters._
import scala.concurrent.Future
import com.typesafe.config.Config
import scala.concurrent.duration.FiniteDuration
/**
 * INTERNAL API
 *
 * Factory that builds [[ClusterShardingHealthCheckSettings]] from a config section
 * providing the `names` (string list) and `timeout` (duration) keys.
 */
@InternalApi
private[akka] object ClusterShardingHealthCheckSettings {
  def apply(config: Config): ClusterShardingHealthCheckSettings = {
    // Duplicate names in the configured list collapse because they are held as a Set.
    val monitoredNames = config.getStringList("names").asScala.toSet
    val askTimeout = config.getDuration("timeout").asScala
    new ClusterShardingHealthCheckSettings(monitoredNames, askTimeout)
  }
}
/**
 * Settings for the cluster sharding health check.
 *
 * @param names   names of the shard regions that are queried by the health check; when empty
 *                the health check passes without querying anything
 * @param timeout how long the health check waits for a shard region to answer before the
 *                check is failed
 */
@ApiMayChange
final class ClusterShardingHealthCheckSettings(val names: Set[String], val timeout: FiniteDuration)
// Companion holding the shared, already completed "healthy" result.
private object ClusterShardingHealthCheck {
// Returned by the check when there is nothing to ask: no names configured, or a previous check already passed.
val Success = Future.successful(true)
}
/**
 * Health check, usable as an Akka Management readiness check, that passes once every shard region
 * named in `settings.names` reports that it has registered with the coordinator.
 *
 * The first successful result is remembered: later invocations pass immediately without asking
 * the shard regions again. If no names are configured the check always passes.
 *
 * INTERNAL API (ctr)
 */
@ApiMayChange
final class ClusterShardingHealthCheck private[akka] (
    system: ActorSystem,
    settings: ClusterShardingHealthCheckSettings,
    shardRegion: String => ActorRef)
    extends (() => Future[Boolean]) {

  private val log = Logging(system, classOf[ClusterShardingHealthCheck])

  /**
   * Creates the check from the `akka.cluster.sharding.healthcheck` config section of the given
   * system, resolving shard regions through the [[ClusterSharding]] extension.
   */
  def this(system: ActorSystem) =
    this(
      system,
      ClusterShardingHealthCheckSettings(system.settings.config.getConfig("akka.cluster.sharding.healthcheck")),
      name => ClusterSharding(system).shardRegion(name))

  private implicit val timeout: Timeout = settings.timeout
  // Implicit definitions carry an explicit type so that resolution does not depend on inference.
  private implicit val ec: scala.concurrent.ExecutionContext = system.dispatchers.internalDispatcher

  // Once the check has passed it always does
  @volatile private var registered = false

  /**
   * Runs the check.
   *
   * @return a future completed with `true` when all configured shard regions are registered with the
   *         coordinator (or nothing is configured / a previous check passed), and with `false` when
   *         at least one is not registered or a region did not answer within the timeout
   */
  override def apply(): Future[Boolean] = {
    if (settings.names.isEmpty || registered) {
      ClusterShardingHealthCheck.Success
    } else {
      Future
        .traverse(settings.names) { name =>
          shardRegion(name) // this can throw if shard region not registered and it'll fail the check
            .ask(ShardRegion.GetShardRegionStatus)
            .mapTo[ShardRegion.ShardRegionStatus]
        }
        .map { allResponses =>
          val allRegistered = allResponses.forall(_.registeredWithCoordinator)
          if (allRegistered) {
            // Remember the success so subsequent checks short-circuit.
            registered = true
          } else if (log.isInfoEnabled) {
            log.info(
              "Not all shard regions have registered with coordinator. Still to register: [{}]",
              allResponses
                .collect {
                  case response if !response.registeredWithCoordinator => response.typeName
                }
                .mkString(","))
          }
          allRegistered
        }
        .recover {
          // Only an ask timeout is turned into a failed check; other failures stay failed futures.
          case _: AskTimeoutException =>
            if (log.isDebugEnabled) {
              log.debug(
                "Shard regions [{}] did not respond in time. Failing health check.",
                settings.names.mkString(","))
            }
            false
        }
    }
  }
}

View file

@ -423,6 +423,24 @@ object ShardRegion {
}
}
/**
 * INTERNAL API
 *
 * Discover if the shard region is registered with the coordinator.
 * Not serializable as only to be sent to the local shard region.
 * Response is [[ShardRegionStatus]].
 */
@InternalApi
private[akka] final object GetShardRegionStatus extends ShardRegionQuery
/**
 * INTERNAL API
 *
 * Status of a ShardRegion. Only for local requests so not serializable.
 * Sent by the shard region as the reply to [[GetShardRegionStatus]].
 *
 * @param typeName                  the type name of the shard region that produced this status
 * @param registeredWithCoordinator `true` when the replying shard region has a coordinator defined
 */
@InternalApi
private[akka] final class ShardRegionStatus(val typeName: String, val registeredWithCoordinator: Boolean)
private case object Retry extends ShardRegionCommand
private case object RegisterRetry extends ShardRegionCommand
@ -822,6 +840,9 @@ private[akka] class ShardRegion(
case GetShardRegionStats =>
replyToRegionStatsQuery(sender())
case GetShardRegionStatus =>
sender() ! new ShardRegionStatus(typeName, coordinator.isDefined)
case _ => unhandled(query)
}

View file

@ -0,0 +1,99 @@
/*
* Copyright (C) 2020 Lightbend Inc. <https://www.lightbend.com>
*/
package akka.cluster.sharding
import akka.testkit.AkkaSpec
import akka.testkit.TestProbe
import akka.testkit.WithLogCapturing
import com.typesafe.config.ConfigFactory
import org.scalatest.concurrent.ScalaFutures
import scala.concurrent.duration._
// Test configuration: DEBUG log level with the silencing test event listener as the only logger.
object ClusterShardingHealthCheckSpec {
  // No stripMargin: the literal contains no `|` margin characters, so the call was a no-op.
  val config = ConfigFactory.parseString("""
akka.loglevel = DEBUG
akka.loggers = ["akka.testkit.SilenceAllTestEventListener"]
""")
}
/**
 * Exercises [[ClusterShardingHealthCheck]] against a test probe standing in for every shard region.
 */
class ClusterShardingHealthCheckSpec
    extends AkkaSpec(ClusterShardingHealthCheckSpec.config)
    with WithLogCapturing
    with ScalaFutures {

  // Builds a health check whose region lookup resolves every name to the given probe.
  private def healthCheckFor(regionProbe: TestProbe, names: Set[String], askTimeout: FiniteDuration) =
    new ClusterShardingHealthCheck(
      system,
      new ClusterShardingHealthCheckSettings(names, askTimeout),
      _ => regionProbe.ref)

  // Expects one status query on the probe and answers it with the given registration state.
  private def answerStatusQuery(regionProbe: TestProbe, typeName: String, isRegistered: Boolean): Unit = {
    regionProbe.expectMsg(ShardRegion.GetShardRegionStatus)
    regionProbe.reply(new ShardRegion.ShardRegionStatus(typeName, isRegistered))
  }

  "Sharding health check" should {

    "pass if no checks configured" in {
      val regionProbe = TestProbe()
      val healthCheck = healthCheckFor(regionProbe, Set.empty, 1.second)
      healthCheck().futureValue shouldEqual true
    }

    "pass if all region return true" in {
      val regionProbe = TestProbe()
      val healthCheck = healthCheckFor(regionProbe, Set("cat"), 1.second)
      val result = healthCheck()
      answerStatusQuery(regionProbe, "cat", isRegistered = true)
      result.futureValue shouldEqual true
    }

    "fail if all region returns false" in {
      val regionProbe = TestProbe()
      val healthCheck = healthCheckFor(regionProbe, Set("cat"), 1.second)
      val result = healthCheck()
      answerStatusQuery(regionProbe, "cat", isRegistered = false)
      result.futureValue shouldEqual false
    }

    "fail if a subset region returns false" in {
      val regionProbe = TestProbe()
      val healthCheck = healthCheckFor(regionProbe, Set("cat", "dog"), 1.second)
      val result = healthCheck()
      answerStatusQuery(regionProbe, "cat", isRegistered = true)
      answerStatusQuery(regionProbe, "dog", isRegistered = false)
      result.futureValue shouldEqual false
    }

    "times out" in {
      val regionProbe = TestProbe()
      val healthCheck = healthCheckFor(regionProbe, Set("cat"), 100.millis)
      val result = healthCheck()
      regionProbe.expectMsg(ShardRegion.GetShardRegionStatus)
      // don't reply
      result.futureValue shouldEqual false
    }

    "always pass after all regions have reported registered" in {
      val regionProbe = TestProbe()
      val healthCheck = healthCheckFor(regionProbe, Set("cat"), 1.second)
      val firstResult = healthCheck()
      answerStatusQuery(regionProbe, "cat", isRegistered = true)
      firstResult.futureValue shouldEqual true

      // The second invocation must be answered without asking the region again.
      val secondResult = healthCheck()
      regionProbe.expectNoMessage()
      secondResult.futureValue shouldEqual true
    }
  }
}

View file

@ -464,6 +464,28 @@ rebalanced to other nodes.
See @ref:[How To Startup when Cluster Size Reached](cluster.md#how-to-startup-when-a-cluster-size-is-reached)
for more information about `min-nr-of-members`.
## Health check
An [Akka Management compatible health check](https://doc.akka.io/docs/akka-management/current/healthchecks.html) is included that returns healthy once the local shard region
has registered with the coordinator. This health check should be used in cases where you don't want to receive production traffic until the local shard region is ready to retrieve locations
for shards. Do not include shard regions that aren't critical and therefore should not block this node from becoming ready.
The health check does not fail after an initial successful check. Once a shard region is registered and is operational it stays available for incoming messages.
Cluster sharding enables the health check automatically. To disable:
```ruby
akka.management.health-checks.readiness-checks {
sharding = ""
}
```
Monitoring of each shard region is off by default. Add them by defining the entity type names (`EntityTypeKey.name`):
```ruby
akka.cluster.sharding.healthcheck.names = ["counter-1", "HelloWorld"]
```
## Inspecting cluster sharding state
Two requests to inspect the cluster state are available: