Skip to content

Commit

Permalink
Adds telemetry metrics to dotcom (#2039)
Browse files Browse the repository at this point in the history
* trying out metrics

* rearrange config

* change port

* this is the right url

* nebulex and req stats being sent

* vm stats are coming through

* phoenix telemetry

* docs and tests

* credo

* ignore files for coverage

* missing docs

* remove logger from helper

* mix lock file
  • Loading branch information
anthonyshull authored May 9, 2024
1 parent a6f50a0 commit 401e6f9
Show file tree
Hide file tree
Showing 18 changed files with 460 additions and 285 deletions.
4 changes: 4 additions & 0 deletions config/runtime.exs
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,10 @@ config :dotcom, :mbta_api,
{"x-enable-experimental-features", "true"}
]

config :dotcom, :telemetry_metrics_splunk,
token: System.get_env("TELEMETRY_METRICS_SPLUNK_TOKEN"),
url: "https://http-inputs-mbta.splunkcloud.com/services/collector"

config :dotcom, aws_index_prefix: System.get_env("AWS_PLACE_INDEX_PREFIX") || "dotcom-dev"

if config_env() != :test do
Expand Down
5 changes: 5 additions & 0 deletions coveralls.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,28 @@
"skip_files": [
"lib/alerts/supervisor.ex",
"lib/cms/custom_html5_scrubber.ex",
"lib/dotcom/cache/telemetry.ex",
"lib/dotcom/redis/behaviour.ex",
"lib/dotcom/redix/behaviour.ex",
"lib/dotcom/redix/pub_sub/behaviour.ex",
"lib/dotcom/telemetry.ex",
"lib/dotcom_web.ex",
"lib/dotcom_web/live/admin",
"lib/dotcom_web/controllers/bus_stop_change_controller.ex",
"lib/dotcom_web/views/bus_stop_change_view.ex",
"lib/dotcom_web/telemetry.ex",
"lib/feedback/test.ex",
"lib/feedback/fake_date_time.ex",
"lib/feedback/mock_aws.ex",
"lib/green_line/supervisor.ex",
"lib/green_line/cache_supervisor.ex",
"lib/mbta/api/behaviour.ex",
"lib/mix/tasks",
"lib/req/telemetry.ex",
"lib/route_patterns/mock_repo.ex",
"lib/routes/routes.ex",
"lib/routes/mock_repo_api.ex",
"lib/telemetry/helper.ex",
"lib/trip_plan/api/mock_planner.ex",
"lib/trip_plan/geocode/mock_geocode.ex",
"test/algolia/support",
Expand Down
4 changes: 3 additions & 1 deletion lib/dotcom/application.ex
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,10 @@ defmodule Dotcom.Application do
] ++
if Application.get_env(:dotcom, :env) != :test do
[
# We can't run telemetry in the test environment because none of the cache levels are running
{Dotcom.Telemetry, []},
{Dotcom.Cache.Telemetry, []},
{DotcomWeb.Telemetry, []},
{Req.Telemetry, []},
# We don't need to run this cache because we are using the local cache for tests
{Dotcom.Cache.TripPlanFeedback.Cache, []}
]
Expand Down
45 changes: 20 additions & 25 deletions lib/dotcom/cache/telemetry.ex
Original file line number Diff line number Diff line change
@@ -1,16 +1,8 @@
defmodule Dotcom.Cache.Telemetry do
@moduledoc """
This supervisor establishes a connection between the telemetry_poller and our telemetry reporters.
This supervisor establishes a connection between the telemetry_poller and the `TelemetryMetricsSplunk` reporter.
Cache stats are emitted by every level of `Dotcom.Cache.Multilevel`.
We poll for them every minute.
Currently, they are passed to two reporters:
The Statsd reporter will eventually be hooked up to Splunk metrics.
For now, it does no harm to emit them even though nothing is listening.
The custom reporter logs in a format that can be picked up in Splunk logs.
Eventually, this should be removed.
"""

use Supervisor
Expand All @@ -22,24 +14,35 @@ defmodule Dotcom.Cache.Telemetry do
end

def init(_arg) do
telemetry_metrics_splunk_config = Application.get_env(:dotcom, :telemetry_metrics_splunk)

children = [
{:telemetry_poller, measurements: periodic_measurements(), period: 60_000},
{Dotcom.Cache.Telemetry.Reporter, metrics: reporter_metrics()},
{TelemetryMetricsStatsd, metrics: statsd_metrics()}
{
TelemetryMetricsSplunk,
[
metrics: metrics(),
token: telemetry_metrics_splunk_config[:token],
url: telemetry_metrics_splunk_config[:url]
]
},
{
:telemetry_poller,
measurements: measurements(), period: :timer.seconds(60), init_delay: :timer.seconds(5)
}
]

Supervisor.init(children, strategy: :one_for_one)
end

defp reporter_metrics do
defp measurements do
[
Metrics.last_value("dotcom.cache.multilevel.l1.stats.updates"),
Metrics.last_value("dotcom.cache.multilevel.l2.stats.updates"),
Metrics.last_value("dotcom.cache.multilevel.l3.stats.updates")
{Dotcom.Cache.Multilevel.Local, :dispatch_stats, []},
{Dotcom.Cache.Multilevel.Publisher, :dispatch_stats, []},
{Dotcom.Cache.Multilevel.Redis, :dispatch_stats, []}
]
end

defp statsd_metrics do
defp metrics do
[
Metrics.last_value("dotcom.cache.multilevel.l1.stats.hits"),
Metrics.last_value("dotcom.cache.multilevel.l1.stats.misses"),
Expand All @@ -48,12 +51,4 @@ defmodule Dotcom.Cache.Telemetry do
Metrics.last_value("dotcom.cache.multilevel.l3.stats.evictions")
]
end

defp periodic_measurements do
[
{Dotcom.Cache.Multilevel.Local, :dispatch_stats, []},
{Dotcom.Cache.Multilevel.Publisher, :dispatch_stats, []},
{Dotcom.Cache.Multilevel.Redis, :dispatch_stats, []}
]
end
end
126 changes: 0 additions & 126 deletions lib/dotcom/cache/telemetry/reporter.ex

This file was deleted.

46 changes: 46 additions & 0 deletions lib/dotcom/telemetry.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
defmodule Dotcom.Telemetry do
  @moduledoc """
  This supervisor sends VM stats to the `TelemetryMetricsSplunk` reporter.

  No polling is set up here, as these metrics are emitted regularly anyway.
  """

  use Supervisor

  alias Telemetry.Metrics

  @doc """
  Starts the supervisor, registered under this module's name.
  """
  def start_link(arg), do: Supervisor.start_link(__MODULE__, arg, name: __MODULE__)

  @doc """
  Initializes the supervisor with a single `TelemetryMetricsSplunk` child
  using the `:one_for_one` strategy.
  """
  @impl true
  def init(_arg) do
    Supervisor.init([splunk_reporter()], strategy: :one_for_one)
  end

  # Child spec for the reporter. The token and url are read from the
  # `:telemetry_metrics_splunk` application environment of `:dotcom`.
  defp splunk_reporter do
    splunk_config = Application.get_env(:dotcom, :telemetry_metrics_splunk)

    {
      TelemetryMetricsSplunk,
      metrics: vm_metrics(), token: splunk_config[:token], url: splunk_config[:url]
    }
  end

  # The VM metrics handed to the reporter; each one reports its last value.
  defp vm_metrics do
    [
      Metrics.last_value("vm.memory.total", unit: :byte),
      Metrics.last_value("vm.total_run_queue_lengths.total"),
      Metrics.last_value("vm.total_run_queue_lengths.cpu"),
      Metrics.last_value("vm.system_counts.process_count")
    ]
  end
end
74 changes: 74 additions & 0 deletions lib/dotcom_web/stats.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
defmodule DotcomWeb.Stats do
  @moduledoc """
  This Agent attaches to telemetry events emitted by Phoenix and aggregates them.

  The Agent state is a nested map of the form
  `%{method => %{path => %{status => [duration]}}}`. Durations are stored as
  received from the event and are treated as native time units when the
  aggregated stats are dispatched.
  """

  use Agent

  @doc """
  Starts the Agent and attaches to `[:phoenix, :router_dispatch, :stop]` telemetry events.
  """
  @spec start_link(map()) :: Agent.on_start()
  def start_link(initial_value \\ %{}) do
    :telemetry.attach(
      "phoenix-router_dispatch-stop",
      [:phoenix, :router_dispatch, :stop],
      &__MODULE__.handle_event/4,
      nil
    )

    Agent.start_link(fn -> initial_value end, name: __MODULE__)
  end

  @doc """
  Handles telemetry events and aggregates their durations by method, path, and status.

  The duration is read from `measurement[:duration]`; the method and status come
  from `metadata.conn` and the path from `metadata.route`.
  """
  @spec handle_event([atom()], map(), map(), term()) :: :ok
  def handle_event(_name, measurement, metadata, _config) do
    duration = measurement[:duration]

    # `Access.key/2` defaults create the missing intermediate maps (and the
    # leaf list) on first sight of a method/path/status combination.
    keys = [
      Access.key(metadata.conn.method, %{}),
      Access.key(metadata.route, %{}),
      Access.key(metadata.conn.status, [])
    ]

    # Prepending is O(1); the order of durations does not matter because only
    # their sum and count are used when dispatching.
    Agent.update(__MODULE__, fn state ->
      update_in(state, keys, &[duration | &1])
    end)
  end

  @doc """
  Dispatches the aggregated stats as `[:dotcom_web, :request]` telemetry events,
  one per method/path/status combination, with `count` and `avg` measurements.

  The Agent state is read and reset in a single atomic step so that events
  recorded while the stats are being dispatched are kept for the next run
  instead of being discarded.
  """
  @spec dispatch_stats() :: :ok
  def dispatch_stats do
    __MODULE__
    |> Agent.get_and_update(fn state -> {state, %{}} end)
    |> Enum.each(&dispatch_method/1)
  end

  # Walks every path and status recorded for a single HTTP method.
  defp dispatch_method({method, stats}) do
    Enum.each(stats, fn {path, statuses} ->
      Enum.each(statuses, fn {status, durations} ->
        dispatch_stat(method, path, status, durations)
      end)
    end)
  end

  # Nothing was recorded (only possible via a caller-supplied initial state);
  # skip it rather than dividing by zero.
  defp dispatch_stat(_method, _path, _status, []), do: :ok

  # Emits the request count and the average duration in milliseconds.
  defp dispatch_stat(method, path, status, durations) do
    count = length(durations)

    avg =
      durations
      |> Enum.sum()
      |> div(count)
      |> System.convert_time_unit(:native, :millisecond)

    :telemetry.execute([:dotcom_web, :request], %{count: count, avg: avg}, %{
      method: method,
      path: path,
      status: status
    })
  end
end
Loading

0 comments on commit 401e6f9

Please sign in to comment.