Skip to content
This repository has been archived by the owner on Mar 27, 2024. It is now read-only.

Commit

Permalink
feat(consul): collect server network lan rtt (#1035)
Browse files Browse the repository at this point in the history
  • Loading branch information
ilyam8 authored Dec 23, 2022
1 parent 0539645 commit b99b2c4
Show file tree
Hide file tree
Showing 6 changed files with 187 additions and 4 deletions.
7 changes: 5 additions & 2 deletions modules/consul/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,10 @@ Consul Agent.
autopilot [health status](https://developer.hashicorp.com/consul/api-docs/operator/autopilot#read-health)).
- `node:read`, `service:read` (for
querying [checks](https://developer.hashicorp.com/consul/api-docs/agent/check#list-checks)).
- `agent:read` (for querying [metrics](https://developer.hashicorp.com/consul/api-docs/agent#view-metrics)
and [configuration](https://developer.hashicorp.com/consul/api-docs/agent#read-configuration)).
- `agent:read` (for
querying [configuration](https://developer.hashicorp.com/consul/api-docs/agent#read-configuration),
[metrics](https://developer.hashicorp.com/consul/api-docs/agent#view-metrics),
and [LAN coordinates](https://developer.hashicorp.com/consul/api-docs/coordinate#read-lan-coordinates-for-all-nodes)).

## Metrics

Expand Down Expand Up @@ -63,6 +65,7 @@ Labels per scope:
| autopilot_server_stable_time | global | stable | seconds | yes | yes | no |
| autopilot_server_serf_status | global | active, failed, left, none | status | yes | yes | no |
| autopilot_server_voter_status | global | voter, not_voter | status | yes | yes | no |
| network_lan_rtt | global | min, max, avg | ms | yes | yes | no |
| raft_commit_time | global | quantile_0.5, quantile_0.9, quantile_0.99 | ms | yes | no | no |
| raft_commits_rate | global | commits | commits/s | yes | no | no |
| raft_leader_last_contact_time | global | quantile_0.5, quantile_0.9, quantile_0.99 | ms | yes | no | no |
Expand Down
19 changes: 19 additions & 0 deletions modules/consul/charts.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ const (
prioAutopilotServerSerfStatus
prioAutopilotServerVoterStatus

prioNetworkLanRTT

prioRPCRequests
prioRPCRequestsExceeded
prioRPCRequestsFailed
Expand Down Expand Up @@ -93,6 +95,8 @@ var (
raftLeadershipTransitionsRateChart.Copy(),
serverLeadershipStatusChart.Copy(),

networkLanRTTChart.Copy(),

clientRPCRequestsRateChart.Copy(),
clientRPCRequestsExceededRateChart.Copy(),
clientRPCRequestsFailedRateChart.Copy(),
Expand Down Expand Up @@ -317,6 +321,21 @@ var (
},
}

// networkLanRTTChart charts the min/max/avg dimensions of the
// "network_lan_rtt" summary written by collectNetworkRTT.
// Div: 1e6 scales the collected values down to the chart's "ms" units
// (the collector multiplies seconds by 1e9, i.e. the raw values are nanoseconds).
networkLanRTTChart = module.Chart{
ID: "network_lan_rtt",
Title: "Network lan RTT",
Units: "ms",
Fam: "network rtt",
Ctx: "consul.network_lan_rtt",
Type: module.Area,
Priority: prioNetworkLanRTT,
Dims: module.Dims{
{ID: "network_lan_rtt_min", Name: "min", Div: 1e6},
{ID: "network_lan_rtt_max", Name: "max", Div: 1e6},
{ID: "network_lan_rtt_avg", Name: "avg", Div: 1e6},
},
}

clientRPCRequestsRateChart = module.Chart{
ID: "client_rpc_requests_rate",
Title: "Client RPC requests",
Expand Down
3 changes: 3 additions & 0 deletions modules/consul/collect.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ func (c *Consul) collect() (map[string]int64, error) {
if err := c.collectAutopilotHealth(mx); err != nil {
return nil, err
}
if err := c.collectNetworkRTT(mx); err != nil {
return nil, err
}
}

if c.isTelemetryPrometheusEnabled() {
Expand Down
75 changes: 75 additions & 0 deletions modules/consul/collect_net_rtt.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
package consul

import (
"math"
"time"

"github.com/netdata/go.d.plugin/pkg/metrics"
)

const (
// urlPathCoordinateNodes is the Consul HTTP API path queried by
// collectNetworkRTT to read the LAN coordinates of all nodes.
// https://developer.hashicorp.com/consul/api-docs/coordinate#read-lan-coordinates-for-all-nodes
urlPathCoordinateNodes = "/v1/coordinate/nodes"
)

// nodeCoordinates is one element of the array decoded from the
// urlPathCoordinateNodes response: a node name and its network coordinate.
type nodeCoordinates struct {
// Node is the node name; removeNodeCoordinates matches on it to locate
// the entry that belongs to the local agent.
Node string
Coord struct {
// Vec holds the coordinate components; calcDistance takes the
// Euclidean distance between two such vectors.
Vec []float64
// Error is decoded from the response but not used by calcDistance.
Error float64
// Adjustment is an offset that calcDistance adds to the estimate
// as long as the adjusted value stays positive.
Adjustment float64
// Height is always added to the distance by calcDistance.
Height float64
}
}

// collectNetworkRTT queries urlPathCoordinateNodes, estimates the round trip
// time from this node (matched by the agent's configured node name) to every
// other node and writes the resulting summary to mx under the
// "network_lan_rtt" prefix.
//
// Nothing is written when this node is absent from the response or when there
// are no other nodes. An error is returned only if the request/decoding fails.
func (c *Consul) collectNetworkRTT(mx map[string]int64) error {
	var all []nodeCoordinates
	if err := c.doOKDecode(urlPathCoordinateNodes, &all); err != nil {
		return err
	}

	others, self, found := removeNodeCoordinates(all, c.cfg.Config.NodeName)
	if !found || len(others) == 0 {
		return nil
	}

	rtt := metrics.NewSummary()
	for i := range others {
		// Observed in seconds; the 1e9 multiplier below stores nanoseconds.
		rtt.Observe(calcDistance(self, others[i]).Seconds())
	}
	rtt.WriteTo(mx, "network_lan_rtt", 1e9, 1)

	return nil
}

// calcDistance estimates the round trip time between nodes a and b from their
// network coordinates, following
// https://developer.hashicorp.com/consul/docs/architecture/coordinates#working-with-coordinates
//
// Coordinate values are interpreted as seconds; the estimate is returned as a
// time.Duration.
//
// Only the dimensions present in both vectors are compared. Coordinates from
// one cluster are expected to have equal dimensionality, but a node reported
// with a shorter (or empty) vector must not crash the collector with an
// index-out-of-range panic.
func calcDistance(a, b nodeCoordinates) time.Duration {
	// Bound the loop by the shorter vector so neither slice is over-indexed.
	dims := len(a.Coord.Vec)
	if n := len(b.Coord.Vec); n < dims {
		dims = n
	}

	sum := 0.0
	for i := 0; i < dims; i++ {
		diff := a.Coord.Vec[i] - b.Coord.Vec[i]
		sum += diff * diff
	}

	rtt := math.Sqrt(sum) + a.Coord.Height + b.Coord.Height

	// The adjustment terms are applied only if the result stays positive;
	// otherwise the unadjusted estimate is kept.
	if adjusted := rtt + a.Coord.Adjustment + b.Coord.Adjustment; adjusted > 0.0 {
		rtt = adjusted
	}

	return time.Duration(rtt * 1e9) // seconds -> nanoseconds
}

// removeNodeCoordinates looks for the first entry whose Node equals node.
//
// When found, it returns the remaining entries (order preserved), the matched
// entry and true. The removal is done in place, so the backing array of coords
// is modified. When there is no match, coords is returned untouched together
// with a zero value and false.
func removeNodeCoordinates(coords []nodeCoordinates, node string) ([]nodeCoordinates, nodeCoordinates, bool) {
	for idx := range coords {
		if coords[idx].Node != node {
			continue
		}
		// Copy the match out before append shifts the tail over it.
		match := coords[idx]
		rest := append(coords[:idx], coords[idx+1:]...)
		return rest, match, true
	}
	return coords, nodeCoordinates{}, false
}
28 changes: 26 additions & 2 deletions modules/consul/consul_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ var (
dataV1132ServerPromMetrics, _ = os.ReadFile("testdata/v1.13.2/server_v1-agent-metrics.txt")
dataV1132ServerPromMetricsWithHostname, _ = os.ReadFile("testdata/v1.13.2/server_v1-agent-metrics_with_hostname.txt")
dataV1132ServerOperatorAutopilotHealth, _ = os.ReadFile("testdata/v1.13.2/server_v1-operator-autopilot-health.json")
dataV1132ServerCoordinateNodes, _ = os.ReadFile("testdata/v1.13.2/server_v1-coordinate-nodes.json")
)

func Test_testDataIsValid(t *testing.T) {
Expand All @@ -37,6 +38,7 @@ func Test_testDataIsValid(t *testing.T) {
"dataV1132ServerPromMetrics": dataV1132ServerPromMetrics,
"dataV1132ServerPromMetricsWithHostname": dataV1132ServerPromMetricsWithHostname,
"dataV1132ServerOperatorAutopilotHealth": dataV1132ServerOperatorAutopilotHealth,
"dataV1132ServerCoordinateNodes": dataV1132ServerCoordinateNodes,
} {
require.NotNilf(t, data, name)
}
Expand Down Expand Up @@ -144,7 +146,7 @@ func TestConsul_Collect(t *testing.T) {
"autopilot_server_sefStatus_failed": 0,
"autopilot_server_sefStatus_left": 0,
"autopilot_server_sefStatus_none": 0,
"autopilot_server_stable_time": 109161,
"autopilot_server_stable_time": 265849,
"autopilot_server_voter_no": 0,
"autopilot_server_voter_yes": 1,
"client_rpc": 6838,
Expand All @@ -171,6 +173,11 @@ func TestConsul_Collect(t *testing.T) {
"kvs_apply_quantile=0.9": 0,
"kvs_apply_quantile=0.99": 0,
"kvs_apply_sum": 0,
"network_lan_rtt_avg": 737592,
"network_lan_rtt_count": 2,
"network_lan_rtt_max": 991168,
"network_lan_rtt_min": 484017,
"network_lan_rtt_sum": 1475185,
"raft_apply": 10681000,
"raft_boltdb_freelistBytes": 11264,
"raft_boltdb_logsPerBatch_count": 12360,
Expand Down Expand Up @@ -238,7 +245,7 @@ func TestConsul_Collect(t *testing.T) {
"autopilot_server_sefStatus_failed": 0,
"autopilot_server_sefStatus_left": 0,
"autopilot_server_sefStatus_none": 0,
"autopilot_server_stable_time": 109180,
"autopilot_server_stable_time": 265825,
"autopilot_server_voter_no": 0,
"autopilot_server_voter_yes": 1,
"client_rpc": 6838,
Expand All @@ -265,6 +272,11 @@ func TestConsul_Collect(t *testing.T) {
"kvs_apply_quantile=0.9": 0,
"kvs_apply_quantile=0.99": 0,
"kvs_apply_sum": 0,
"network_lan_rtt_avg": 737592,
"network_lan_rtt_count": 2,
"network_lan_rtt_max": 991168,
"network_lan_rtt_min": 484017,
"network_lan_rtt_sum": 1475185,
"raft_apply": 10681000,
"raft_boltdb_freelistBytes": 11264,
"raft_boltdb_logsPerBatch_count": 12360,
Expand Down Expand Up @@ -329,6 +341,7 @@ func TestConsul_Collect(t *testing.T) {
"autopilot_server_sefStatus_failed": 0,
"autopilot_server_sefStatus_left": 0,
"autopilot_server_sefStatus_none": 0,
"autopilot_server_stable_time": 265805,
"autopilot_server_voter_no": 0,
"autopilot_server_voter_yes": 1,
"health_check_chk1_critical_status": 0,
Expand All @@ -347,6 +360,11 @@ func TestConsul_Collect(t *testing.T) {
"health_check_mysql_maintenance_status": 0,
"health_check_mysql_passing_status": 0,
"health_check_mysql_warning_status": 0,
"network_lan_rtt_avg": 737592,
"network_lan_rtt_count": 2,
"network_lan_rtt_max": 991168,
"network_lan_rtt_min": 484017,
"network_lan_rtt_sum": 1475185,
},
},
"success on response from Consul v1.13.2 client": {
Expand Down Expand Up @@ -425,6 +443,8 @@ func caseConsulV1132ServerResponse(t *testing.T) (*Consul, func()) {
_, _ = w.Write(dataV1132ServerPromMetrics)
case r.URL.Path == urlPathOperationAutopilotHealth:
_, _ = w.Write(dataV1132ServerOperatorAutopilotHealth)
case r.URL.Path == urlPathCoordinateNodes:
_, _ = w.Write(dataV1132ServerCoordinateNodes)
default:
w.WriteHeader(http.StatusNotFound)
}
Expand All @@ -451,6 +471,8 @@ func caseConsulV1132ServerWithHostnameResponse(t *testing.T) (*Consul, func()) {
_, _ = w.Write(dataV1132ServerPromMetricsWithHostname)
case r.URL.Path == urlPathOperationAutopilotHealth:
_, _ = w.Write(dataV1132ServerOperatorAutopilotHealth)
case r.URL.Path == urlPathCoordinateNodes:
_, _ = w.Write(dataV1132ServerCoordinateNodes)
default:
w.WriteHeader(http.StatusNotFound)
}
Expand All @@ -475,6 +497,8 @@ func caseConsulV1132ServerWithDisabledPrometheus(t *testing.T) (*Consul, func())
_, _ = w.Write(datav1132Checks)
case urlPathOperationAutopilotHealth:
_, _ = w.Write(dataV1132ServerOperatorAutopilotHealth)
case urlPathCoordinateNodes:
_, _ = w.Write(dataV1132ServerCoordinateNodes)
default:
w.WriteHeader(http.StatusNotFound)
}
Expand Down
59 changes: 59 additions & 0 deletions modules/consul/testdata/v1.13.2/server_v1-coordinate-nodes.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
[
{
"Node": "satya-vm",
"Segment": "",
"Coord": {
"Vec": [
0.014829503547751722,
0.0072173849395880596,
0.004329474334739038,
-0.0032798752739064438,
-0.010134170963372591,
-0.008257638503292454,
0.00752142875530981,
0.0017901665053347217
],
"Error": 0.493977389081921,
"Adjustment": 0.00017401717315766792,
"Height": 2.8272088782225915e-05
}
},
{
"Node": "satya-vm2",
"Segment": "",
"Coord": {
"Vec": [
0.01485399579339927,
0.007233318963330601,
0.004314864811042585,
-0.0032764668107421653,
-0.010133938771787391,
-0.008238915750721635,
0.0075168683512753035,
0.001776534386752108
],
"Error": 0.3003366063730667,
"Adjustment": 0.00019935098724887628,
"Height": 4.192904954404545e-05
}
},
{
"Node": "satya-vm3",
"Segment": "",
"Coord": {
"Vec": [
0.014782092899311995,
0.007186516660508205,
0.004357885422476095,
-0.003286526239099157,
-0.010134722455521066,
-0.008294075475167818,
0.007530358624901773,
0.0018166544975743123
],
"Error": 0.12048664650994341,
"Adjustment": 0.00014477073973997567,
"Height": 0.0005656138448826895
}
}
]

0 comments on commit b99b2c4

Please sign in to comment.