-
Notifications
You must be signed in to change notification settings - Fork 20
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: Report histogram metrics to Triton metrics server #56
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,7 +24,7 @@ | |
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
|
||
from typing import Dict, Union | ||
from typing import Dict, List, Union | ||
|
||
import triton_python_backend_utils as pb_utils | ||
from vllm.engine.metrics import StatLoggerBase as VllmStatLoggerBase | ||
|
@@ -46,6 +46,16 @@ def __init__(self, labels): | |
description="Number of generation tokens processed.", | ||
kind=pb_utils.MetricFamily.COUNTER, | ||
) | ||
self.histogram_time_to_first_token_family = pb_utils.MetricFamily( | ||
name="vllm:time_to_first_token_seconds", | ||
description="Histogram of time to first token in seconds.", | ||
kind=pb_utils.MetricFamily.HISTOGRAM, | ||
) | ||
self.histogram_time_per_output_token_family = pb_utils.MetricFamily( | ||
name="vllm:time_per_output_token_seconds", | ||
description="Histogram of time per output token in seconds.", | ||
kind=pb_utils.MetricFamily.HISTOGRAM, | ||
) | ||
|
||
# Initialize metrics | ||
# Iteration stats | ||
|
@@ -55,6 +65,51 @@ def __init__(self, labels): | |
self.counter_generation_tokens = self.counter_generation_tokens_family.Metric( | ||
labels=labels | ||
) | ||
# Use the same bucket boundaries from vLLM sample metrics. | ||
# https://github.com/vllm-project/vllm/blob/21313e09e3f9448817016290da20d0db1adf3664/vllm/engine/metrics.py#L81-L96 | ||
self.histogram_time_to_first_token = ( | ||
self.histogram_time_to_first_token_family.Metric( | ||
labels=labels, | ||
buckets=[ | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The buckets here are just an example taken from the vLLM repo's metrics.py. I think we want to let users define the bucket intervals. That would also be good for the unit tests, since the observed data are pretty small when the prompts are simple. What is the best practice for allowing customizable buckets? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @oandreeva-nv Explanation to comment. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think, if we ship There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Updated |
||
0.001, | ||
0.005, | ||
0.01, | ||
0.02, | ||
0.04, | ||
0.06, | ||
0.08, | ||
0.1, | ||
0.25, | ||
0.5, | ||
0.75, | ||
1.0, | ||
2.5, | ||
5.0, | ||
7.5, | ||
10.0, | ||
oandreeva-nv marked this conversation as resolved.
Show resolved
Hide resolved
|
||
], | ||
) | ||
) | ||
self.histogram_time_per_output_token = ( | ||
self.histogram_time_per_output_token_family.Metric( | ||
labels=labels, | ||
buckets=[ | ||
0.01, | ||
0.025, | ||
0.05, | ||
0.075, | ||
0.1, | ||
0.15, | ||
0.2, | ||
0.3, | ||
0.4, | ||
0.5, | ||
0.75, | ||
1.0, | ||
2.5, | ||
], | ||
) | ||
) | ||
|
||
|
||
class VllmStatLogger(VllmStatLoggerBase): | ||
|
@@ -82,6 +137,19 @@ def _log_counter(self, counter, data: Union[int, float]) -> None: | |
if data != 0: | ||
counter.increment(data) | ||
|
||
def _log_histogram(self, histogram, data: Union[List[int], List[float]]) -> None: | ||
"""Convenience function for logging list to histogram. | ||
Args: | ||
histogram: A histogram metric instance. | ||
data: A list of int or float data to observe into the histogram metric. | ||
Returns: | ||
None | ||
""" | ||
for datum in data: | ||
histogram.observe(datum) | ||
|
||
def log(self, stats: VllmStats) -> None: | ||
"""Report stats to Triton metrics server. | ||
|
@@ -97,3 +165,10 @@ def log(self, stats: VllmStats) -> None: | |
self._log_counter( | ||
self.metrics.counter_generation_tokens, stats.num_generation_tokens_iter | ||
) | ||
self._log_histogram( | ||
self.metrics.histogram_time_to_first_token, stats.time_to_first_tokens_iter | ||
) | ||
self._log_histogram( | ||
self.metrics.histogram_time_per_output_token, | ||
stats.time_per_output_tokens_iter, | ||
) |
Check notice
Code scanning / CodeQL
Imprecise assert Note