Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds telemetry metrics to dotcom #2039

Merged
merged 13 commits into from
May 9, 2024
4 changes: 4 additions & 0 deletions config/runtime.exs
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,10 @@ config :dotcom, :mbta_api,
{"x-enable-experimental-features", "true"}
]

# Settings for the `TelemetryMetricsSplunk` reporter. The telemetry supervisors read
# them at startup via `Application.get_env(:dotcom, :telemetry_metrics_splunk)`.
# The token is taken from the environment and is `nil` when the variable is unset.
config :dotcom, :telemetry_metrics_splunk,
token: System.get_env("TELEMETRY_METRICS_SPLUNK_TOKEN"),
url: "https://http-inputs-mbta.splunkcloud.com/services/collector"

config :dotcom, aws_index_prefix: System.get_env("AWS_PLACE_INDEX_PREFIX") || "dotcom-dev"

if config_env() != :test do
Expand Down
5 changes: 5 additions & 0 deletions coveralls.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,28 @@
"skip_files": [
"lib/alerts/supervisor.ex",
"lib/cms/custom_html5_scrubber.ex",
"lib/dotcom/cache/telemetry.ex",
"lib/dotcom/redis/behaviour.ex",
"lib/dotcom/redix/behaviour.ex",
"lib/dotcom/redix/pub_sub/behaviour.ex",
"lib/dotcom/telemetry.ex",
"lib/dotcom_web.ex",
"lib/dotcom_web/live/admin",
"lib/dotcom_web/controllers/bus_stop_change_controller.ex",
"lib/dotcom_web/views/bus_stop_change_view.ex",
"lib/dotcom_web/telemetry.ex",
"lib/feedback/test.ex",
"lib/feedback/fake_date_time.ex",
"lib/feedback/mock_aws.ex",
"lib/green_line/supervisor.ex",
"lib/green_line/cache_supervisor.ex",
"lib/mbta/api/behaviour.ex",
"lib/mix/tasks",
"lib/req/telemetry.ex",
"lib/route_patterns/mock_repo.ex",
"lib/routes/routes.ex",
"lib/routes/mock_repo_api.ex",
"lib/telemetry/helper.ex",
"lib/trip_plan/api/mock_planner.ex",
"lib/trip_plan/geocode/mock_geocode.ex",
"test/algolia/support",
Expand Down
4 changes: 3 additions & 1 deletion lib/dotcom/application.ex
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,10 @@ defmodule Dotcom.Application do
] ++
if Application.get_env(:dotcom, :env) != :test do
[
# We can't run telemetry in the test environment because none of the levels are running
{Dotcom.Telemetry, []},
{Dotcom.Cache.Telemetry, []},
{DotcomWeb.Telemetry, []},
{Req.Telemetry, []},
# We don't need to run this cache because we are using the local cache for tests
{Dotcom.Cache.TripPlanFeedback.Cache, []}
]
Expand Down
45 changes: 20 additions & 25 deletions lib/dotcom/cache/telemetry.ex
Original file line number Diff line number Diff line change
@@ -1,16 +1,8 @@
defmodule Dotcom.Cache.Telemetry do
@moduledoc """
This supervisor establishes a connection between the telemetry_poller and our telemetry reporters.
This supervisor establishes a connection between the telemetry_poller and the `TelemetryMetricsSplunk` reporter.
Cache stats are emitted by every level of `Dotcom.Cache.Multilevel`.
We poll for them every minute.

Currently, they are passed to two reporters:

The Statsd reporter will eventually be hooked up to Splunk metrics.
For now, it does no harm to emit them even though nothing is listening.

The custom reporter logs in a format that can be picked up in Splunk logs.
Eventually, this should be removed.
"""

use Supervisor
Expand All @@ -22,24 +14,35 @@ defmodule Dotcom.Cache.Telemetry do
end

def init(_arg) do
telemetry_metrics_splunk_config = Application.get_env(:dotcom, :telemetry_metrics_splunk)

children = [
{:telemetry_poller, measurements: periodic_measurements(), period: 60_000},
{Dotcom.Cache.Telemetry.Reporter, metrics: reporter_metrics()},
{TelemetryMetricsStatsd, metrics: statsd_metrics()}
{
TelemetryMetricsSplunk,
[
metrics: metrics(),
token: telemetry_metrics_splunk_config[:token],
url: telemetry_metrics_splunk_config[:url]
]
},
{
:telemetry_poller,
measurements: measurements(), period: :timer.seconds(60), init_delay: :timer.seconds(5)
}
]

Supervisor.init(children, strategy: :one_for_one)
end

defp reporter_metrics do
defp measurements do
[
Metrics.last_value("dotcom.cache.multilevel.l1.stats.updates"),
Metrics.last_value("dotcom.cache.multilevel.l2.stats.updates"),
Metrics.last_value("dotcom.cache.multilevel.l3.stats.updates")
{Dotcom.Cache.Multilevel.Local, :dispatch_stats, []},
{Dotcom.Cache.Multilevel.Publisher, :dispatch_stats, []},
{Dotcom.Cache.Multilevel.Redis, :dispatch_stats, []}
]
end

defp statsd_metrics do
defp metrics do
[
Metrics.last_value("dotcom.cache.multilevel.l1.stats.hits"),
Metrics.last_value("dotcom.cache.multilevel.l1.stats.misses"),
Expand All @@ -48,12 +51,4 @@ defmodule Dotcom.Cache.Telemetry do
Metrics.last_value("dotcom.cache.multilevel.l3.stats.evictions")
]
end

defp periodic_measurements do
[
{Dotcom.Cache.Multilevel.Local, :dispatch_stats, []},
{Dotcom.Cache.Multilevel.Publisher, :dispatch_stats, []},
{Dotcom.Cache.Multilevel.Redis, :dispatch_stats, []}
]
end
end
126 changes: 0 additions & 126 deletions lib/dotcom/cache/telemetry/reporter.ex

This file was deleted.

46 changes: 46 additions & 0 deletions lib/dotcom/telemetry.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
defmodule Dotcom.Telemetry do
  @moduledoc """
  This Supervisor sends VM stats to the `TelemetryMetricsSplunk` reporter.
  No polling occurs as these metrics are emitted regularly anyway.
  """

  use Supervisor

  alias Telemetry.Metrics

  @doc """
  Starts the supervisor and registers it under the module name.
  """
  @spec start_link(term()) :: Supervisor.on_start()
  def start_link(arg) do
    Supervisor.start_link(__MODULE__, arg, name: __MODULE__)
  end

  @doc """
  Initializes the supervisor with a single `TelemetryMetricsSplunk` child.

  The reporter's `:token` and `:url` are read at startup from the
  `:telemetry_metrics_splunk` key of the `:dotcom` application environment.
  """
  @impl true
  def init(_arg) do
    telemetry_metrics_splunk_config = Application.get_env(:dotcom, :telemetry_metrics_splunk)

    children = [
      {
        TelemetryMetricsSplunk,
        [
          metrics: metrics(),
          token: telemetry_metrics_splunk_config[:token],
          url: telemetry_metrics_splunk_config[:url]
        ]
      }
    ]

    Supervisor.init(children, strategy: :one_for_one)
  end

  # The VM metrics handed to the reporter; each is tracked as its last observed value.
  defp metrics do
    [
      Metrics.last_value("vm.memory.total", unit: :byte),
      Metrics.last_value("vm.total_run_queue_lengths.total"),
      Metrics.last_value("vm.total_run_queue_lengths.cpu"),
      Metrics.last_value("vm.system_counts.process_count")
    ]
  end
end
74 changes: 74 additions & 0 deletions lib/dotcom_web/stats.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
defmodule DotcomWeb.Stats do
  @moduledoc """
  This Agent attaches to telemetry events emitted by Phoenix and aggregates them.

  The Agent state is a nested map of `method => route => status => [durations]`.
  Durations are kept in no particular order; only their count and sum are used.
  """

  use Agent

  @doc """
  Starts the Agent and attaches to `[:phoenix, :router_dispatch, :stop]` telemetry events.

  The result of `:telemetry.attach/4` is deliberately ignored so that a restart of the
  Agent does not fail when the handler is still attached from a previous start.
  """
  @spec start_link(map()) :: Agent.on_start()
  def start_link(initial_value \\ %{}) do
    :telemetry.attach(
      "phoenix-router_dispatch-stop",
      [:phoenix, :router_dispatch, :stop],
      &__MODULE__.handle_event/4,
      nil
    )

    Agent.start_link(fn -> initial_value end, name: __MODULE__)
  end

  @doc """
  Handles telemetry events and aggregates their durations by method, route, and status.

  Reads `metadata.conn.method`, `metadata.route`, `metadata.conn.status`, and
  `measurement[:duration]`. Always returns `:ok`.
  """
  @spec handle_event([atom()], map(), map(), term()) :: :ok
  def handle_event(_name, measurement, metadata, _config) do
    method = metadata.conn.method
    path = metadata.route
    status = metadata.conn.status
    duration = measurement[:duration]

    # This handler runs inside the process that served the request, so the state change
    # is cast rather than called: the request never waits on the Agent, and an
    # unavailable Agent cannot make the handler fail (telemetry detaches failing handlers).
    Agent.cast(__MODULE__, fn state ->
      # `Access.key/2` creates the missing levels on the fly. Prepending keeps the
      # update O(1); the order of durations is irrelevant to the aggregates.
      update_in(
        state,
        [Access.key(method, %{}), Access.key(path, %{}), Access.key(status, [])],
        &[duration | &1]
      )
    end)
  end

  @doc """
  Emits one `[:dotcom_web, :request]` telemetry event per aggregated method, route, and status.

  Each event carries `count` and `avg` measurements (`avg` converted from native time
  units to milliseconds) and `method`, `path`, and `status` metadata.

  Resets the Agent state after dispatching the stats. Always returns `:ok`.
  """
  @spec dispatch_stats() :: :ok
  def dispatch_stats do
    # Read and reset in a single Agent operation so that events recorded between the
    # read and the reset cannot be lost.
    __MODULE__
    |> Agent.get_and_update(&{&1, %{}})
    |> Enum.each(&dispatch_method/1)
  end

  # Walks every route and status recorded for one HTTP method.
  defp dispatch_method({method, stats}) do
    Enum.each(stats, fn {path, statuses} ->
      Enum.each(statuses, fn {status, durations} ->
        dispatch_stat(method, path, status, durations)
      end)
    end)
  end

  # Nothing to report (and nothing to divide by) when no durations were recorded,
  # which can only happen through a caller-supplied initial state.
  defp dispatch_stat(_method, _path, _status, []), do: :ok

  # Emits the request count and the average duration in milliseconds for one bucket.
  defp dispatch_stat(method, path, status, durations) do
    count = length(durations)

    avg =
      durations
      |> Enum.sum()
      |> div(count)
      |> System.convert_time_unit(:native, :millisecond)

    :telemetry.execute([:dotcom_web, :request], %{count: count, avg: avg}, %{
      method: method,
      path: path,
      status: status
    })
  end
end
Loading
Loading