Commit

AI-4795 Coverage for ray_vllm* metrics (#19358)
* Add custom scraper

* Add scraper class

* Add debug logs

* Remove excessive debug logs

* Add metric prefix variable

* Add ray_vllm: metric map

* Add unit tests for ray_vllm: metrics

* Remove custom scraper class

* Add ray test fixture

* Update changelog

* Linting

* Linting

* Updated changelog to use added

* Update vllm/changelog.d/19358.added

Co-authored-by: Kyle Neale <kyle.neale@datadoghq.com>

* Update changelog message

* Update changelog message

* Update changelog message

* Add comment for metric map

* Edit comment

* Move comment to metrics.py for ray_vllm namespaced metrics

---------

Co-authored-by: Kyle Neale <kyle.neale@datadoghq.com>
UTXOnly and Kyle-Neale authored Jan 13, 2025
1 parent a189c7a commit f082dc1
Showing 8 changed files with 278 additions and 15 deletions.
1 change: 1 addition & 0 deletions vllm/changelog.d/19358.added
@@ -0,0 +1 @@
+Add coverage for vLLM metrics prefixed with `ray_vllm`
7 changes: 5 additions & 2 deletions vllm/datadog_checks/vllm/check.py
@@ -3,7 +3,7 @@
 # Licensed under a 3-clause BSD style license (see LICENSE)
 from datadog_checks.base import AgentCheck, OpenMetricsBaseCheckV2  # noqa: F401
 
-from .metrics import METRIC_MAP, RENAME_LABELS_MAP
+from .metrics import METRIC_MAP, RAY_METRIC_MAP, RENAME_LABELS_MAP
 
 
 class vLLMCheck(OpenMetricsBaseCheckV2):
@@ -14,7 +14,10 @@ class vLLMCheck(OpenMetricsBaseCheckV2):
 
     def get_default_config(self):
         return {
-            'metrics': [METRIC_MAP],
+            'metrics': [
+                METRIC_MAP,
+                RAY_METRIC_MAP,
+            ],
             "rename_labels": RENAME_LABELS_MAP,
         }
 
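A minimal sketch of the effect (not part of the diff; it assumes a `datadog-checks-dev` environment, and the endpoint value is illustrative):

from datadog_checks.vllm.check import vLLMCheck
from datadog_checks.vllm.metrics import METRIC_MAP, RAY_METRIC_MAP

instance = {'openmetrics_endpoint': 'http://localhost:8000/metrics'}  # illustrative endpoint
check = vLLMCheck('vllm', {}, [instance])

# The OpenMetrics V2 scraper consults each map in turn, so samples named with
# either the vllm: or the ray_vllm: prefix are remapped to Datadog names.
assert check.get_default_config()['metrics'] == [METRIC_MAP, RAY_METRIC_MAP]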
23 changes: 23 additions & 0 deletions vllm/datadog_checks/vllm/metrics.py
@@ -34,6 +34,29 @@
     'vllm:avg_generation_throughput_toks_per_s': 'avg.generation_throughput.toks_per_s',
 }
 
+# RAY_METRIC_MAP: metrics exposed with the `ray_vllm:` prefix when vLLM is run as a library
+RAY_METRIC_MAP = {
+    'ray_vllm:cache_config_info': 'cache_config_info',
+    'ray_vllm:num_requests_running': 'num_requests.running',
+    'ray_vllm:num_requests_waiting': 'num_requests.waiting',
+    'ray_vllm:num_requests_swapped': 'num_requests.swapped',
+    'ray_vllm:gpu_cache_usage_perc': 'gpu_cache_usage_perc',
+    'ray_vllm:cpu_cache_usage_perc': 'cpu_cache_usage_perc',
+    'ray_vllm:num_preemptions': 'num_preemptions',
+    'ray_vllm:prompt_tokens': 'prompt_tokens',
+    'ray_vllm:generation_tokens': 'generation_tokens',
+    'ray_vllm:time_to_first_token_seconds': 'time_to_first_token.seconds',
+    'ray_vllm:time_per_output_token_seconds': 'time_per_output_token.seconds',
+    'ray_vllm:e2e_request_latency_seconds': 'e2e_request_latency.seconds',
+    'ray_vllm:request_prompt_tokens': 'request.prompt_tokens',
+    'ray_vllm:request_generation_tokens': 'request.generation_tokens',
+    'ray_vllm:request_params_best_of': 'request.params.best_of',
+    'ray_vllm:request_params_n': 'request.params.n',
+    'ray_vllm:request_success': 'request.success',
+    'ray_vllm:avg_prompt_throughput_toks_per_s': 'avg.prompt.throughput.toks_per_s',
+    'ray_vllm:avg_generation_throughput_toks_per_s': 'avg.generation_throughput.toks_per_s',
+}
+
 RENAME_LABELS_MAP = {
     'version': 'python_version',
 }
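Both maps are meant to target the same Datadog metric names, so dashboards do not care which prefix a deployment emits. A quick parity sketch (pure dictionary work, runnable wherever the package is installed; the mirroring assumption is mine, inferred from the diff):

from datadog_checks.vllm.metrics import METRIC_MAP, RAY_METRIC_MAP

# Flag any ray_vllm: key whose mapped name differs from its vllm: twin.
for ray_key, mapped in RAY_METRIC_MAP.items():
    plain_key = ray_key.replace('ray_vllm:', 'vllm:', 1)
    if METRIC_MAP.get(plain_key) != mapped:
        print(f'mismatch: {ray_key} -> {mapped!r} vs {METRIC_MAP.get(plain_key)!r}')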
5 changes: 5 additions & 0 deletions vllm/tests/common.py
@@ -19,6 +19,11 @@ def get_fixture_path(filename):
     "tags": ['test:test'],
 }
 
+MOCKED_INSTANCE_RAY = {
+    "openmetrics_endpoint": f"http://{HOST}:{PORT}/metrics_prefix",
+    "tags": ['test:test'],
+}
+
 MOCKED_VERSION_ENDPOINT = f"http://{HOST}:{PORT}/version"
 
 COMPOSE_FILE = os.path.join(HERE, 'docker', 'docker-compose.yaml')
12 changes: 9 additions & 3 deletions vllm/tests/conftest.py
@@ -8,23 +8,29 @@
 from datadog_checks.dev import docker_run
 from datadog_checks.dev.conditions import CheckDockerLogs, CheckEndpoints
 
-from .common import COMPOSE_FILE, MOCKED_INSTANCE, MOCKED_VERSION_ENDPOINT
+from .common import COMPOSE_FILE, MOCKED_INSTANCE, MOCKED_INSTANCE_RAY, MOCKED_VERSION_ENDPOINT
 
 
 @pytest.fixture(scope='session')
 def dd_environment():
     compose_file = COMPOSE_FILE
     conditions = [
         CheckDockerLogs(identifier='caddy', patterns=['server running']),
-        CheckEndpoints(MOCKED_INSTANCE["openmetrics_endpoint"]),
+        CheckEndpoints(MOCKED_INSTANCE['openmetrics_endpoint']),
         CheckEndpoints(MOCKED_VERSION_ENDPOINT),
+        CheckEndpoints(MOCKED_INSTANCE_RAY['openmetrics_endpoint']),
     ]
     with docker_run(compose_file, conditions=conditions):
         yield {
-            'instances': [MOCKED_INSTANCE],
+            'instances': [MOCKED_INSTANCE, MOCKED_INSTANCE_RAY],  # include both the vllm: and ray_vllm: instances
         }
 
 
 @pytest.fixture
 def instance():
     return copy.deepcopy(MOCKED_INSTANCE)
+
+
+@pytest.fixture
+def ray_instance():
+    return copy.deepcopy(MOCKED_INSTANCE_RAY)
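The new fixture mirrors the existing `instance` fixture. A hypothetical usage sketch (not part of the diff), showing what a test receives:

def test_ray_instance_shape(ray_instance):
    # Each test gets a fresh deep copy of MOCKED_INSTANCE_RAY, so mutations
    # cannot leak between tests.
    assert ray_instance['openmetrics_endpoint'].endswith('/metrics_prefix')
    assert ray_instance['tags'] == ['test:test']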
1 change: 1 addition & 0 deletions vllm/tests/docker/docker-compose.yaml
@@ -8,3 +8,4 @@ services:
       - ./Caddyfile:/etc/caddy/Caddyfile
       - ../fixtures/vllm_metrics.txt:/usr/share/caddy/metrics
       - ../fixtures/vllm_version.json:/usr/share/caddy/version
+      - ../fixtures/ray_vllm_metrics.txt:/usr/share/caddy/metrics_prefix
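The new mount serves the `ray_vllm` fixture at `/metrics_prefix`, the path that `MOCKED_INSTANCE_RAY`'s `openmetrics_endpoint` targets (assuming the Caddyfile's file server is rooted at `/usr/share/caddy`, as the existing `/metrics` and `/version` mounts imply).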
197 changes: 197 additions & 0 deletions vllm/tests/fixtures/ray_vllm_metrics.txt

Large diffs are not rendered by default.

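The 197-line fixture is Prometheus/OpenMetrics exposition text. An illustrative excerpt (representative shape only; values and HELP strings are not the actual fixture contents):

# HELP ray_vllm:num_requests_running Number of requests currently running on GPU.
# TYPE ray_vllm:num_requests_running gauge
ray_vllm:num_requests_running{model_name="example-model"} 2.0
# HELP ray_vllm:gpu_cache_usage_perc GPU KV-cache usage. 1 means 100 percent usage.
# TYPE ray_vllm:gpu_cache_usage_perc gauge
ray_vllm:gpu_cache_usage_perc{model_name="example-model"} 0.27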
47 changes: 37 additions & 10 deletions vllm/tests/test_unit.py
@@ -17,13 +17,37 @@
 def test_check_vllm(dd_run_check, aggregator, datadog_agent, instance):
     check = vLLMCheck("vLLM", {}, [instance])
     check.check_id = "test:123"
-    with mock.patch(
-        'requests.get',
-        side_effect=[
-            MockResponse(file_path=get_fixture_path("vllm_metrics.txt")),
-            MockResponse(file_path=get_fixture_path("vllm_version.json")),
-        ],
-    ):
+
+    mock_responses = [
+        MockResponse(file_path=get_fixture_path("vllm_metrics.txt")),
+        MockResponse(file_path=get_fixture_path("vllm_version.json")),
+    ]
+
+    with mock.patch('requests.get', side_effect=mock_responses):
         dd_run_check(check)
 
+    for metric in METRICS_MOCK:
+        aggregator.assert_metric(metric)
+        aggregator.assert_metric_has_tag(metric, "test:test")
+
+    aggregator.assert_all_metrics_covered()
+    aggregator.assert_metrics_using_metadata(get_metadata_metrics())
+    aggregator.assert_service_check("vllm.openmetrics.health", ServiceCheck.OK)
+
+    version_metadata = _get_version_metadata("0.4.3")
+    datadog_agent.assert_metadata("test:123", version_metadata)
+
+
+def test_check_vllm_w_ray_prefix(dd_run_check, aggregator, datadog_agent, ray_instance):
+    check = vLLMCheck("vLLM", {}, [ray_instance])
+    check.check_id = "test:123"
+
+    mock_responses = [
+        MockResponse(file_path=get_fixture_path("ray_vllm_metrics.txt")),
+        MockResponse(file_path=get_fixture_path("vllm_version.json")),
+    ]
+
+    with mock.patch('requests.get', side_effect=mock_responses):
+        dd_run_check(check)
+
     for metric in METRICS_MOCK:
@@ -34,16 +58,19 @@ def test_check_vllm(dd_run_check, aggregator, datadog_agent, instance):
     aggregator.assert_metrics_using_metadata(get_metadata_metrics())
     aggregator.assert_service_check("vllm.openmetrics.health", ServiceCheck.OK)
 
-    raw_version = "0.4.3"
+    version_metadata = _get_version_metadata("0.4.3")
+    datadog_agent.assert_metadata("test:123", version_metadata)
+
+
+def _get_version_metadata(raw_version):
     major, minor, patch = raw_version.split(".")
-    version_metadata = {
+    return {
         "version.scheme": "semver",
         "version.major": major,
         "version.minor": minor,
         "version.patch": patch,
         "version.raw": raw_version,
     }
-    datadog_agent.assert_metadata("test:123", version_metadata)
 
 
 def test_emits_critical_openemtrics_service_check_when_service_is_down(
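One detail worth noting (my inference from the mock setup, consistent with the original test): the `side_effect` list order matters, since the check scrapes the metrics endpoint before the version endpoint, so the mocked responses must be supplied in that order.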
