From 4666afe7020bb5a9017e95ce4da26142378f6a68 Mon Sep 17 00:00:00 2001
From: Scott Todd
Date: Mon, 2 Dec 2024 10:26:23 -0800
Subject: [PATCH 1/3] [shortfin] Upgrade package build dockerfile to
 manylinux_2_28. (#627)

Progress on https://github.com/nod-ai/shark-ai/issues/130.

The manylinux2014 image includes gcc 10.2.1 by default, while
manylinux_2_28 includes gcc 12.2.1. At one point we had warnings/errors
building on the newer gcc version, but that is no longer the case.

With the new Rust dependency coming from
https://github.com/nod-ai/shark-ai/pull/610, we will likely want to revive
https://github.com/nod-ai/base-docker-images/blob/main/dockerfiles/manylinux_x86_64.Dockerfile,
add more dependencies there, then switch from the upstream `quay.io/...`
image to that `ghcr.io/nod-ai/...` image.

Tested locally with
`OUTPUT_DIR="/tmp/wheelhouse" sudo -E ./build_tools/build_linux_package.sh`.

If the nightly package build fails for some reason, we can easily revert
this.
---
 shortfin/build_tools/build_linux_package.sh | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/shortfin/build_tools/build_linux_package.sh b/shortfin/build_tools/build_linux_package.sh
index 9f7388119..afaa1e9fb 100755
--- a/shortfin/build_tools/build_linux_package.sh
+++ b/shortfin/build_tools/build_linux_package.sh
@@ -36,11 +36,8 @@ REPO_ROOT="$(cd "$THIS_DIR"/../../ && pwd)"
 SCRIPT_NAME="$(basename $0)"
 ARCH="$(uname -m)"
 
-# TODO(#130): Update to manylinux_2_28, upstream or a fork
-# * upstream uses a version of gcc that has build warnings/errors
-# * https://github.com/nod-ai/base-docker-images is a bit out of date but can include a recent clang
-# MANYLINUX_DOCKER_IMAGE="${MANYLINUX_DOCKER_IMAGE:-quay.io/pypa/manylinux_2_28_${ARCH}:latest}"
-MANYLINUX_DOCKER_IMAGE="${MANYLINUX_DOCKER_IMAGE:-quay.io/pypa/manylinux2014_${ARCH}:latest}"
+# Note: we can switch to https://github.com/nod-ai/base-docker-images as needed for extra deps.
+MANYLINUX_DOCKER_IMAGE="${MANYLINUX_DOCKER_IMAGE:-quay.io/pypa/manylinux_2_28_${ARCH}:latest}"
 PYTHON_VERSIONS="${OVERRIDE_PYTHON_VERSIONS:-cp311-cp311 cp312-cp312 cp313-cp313}"
 OUTPUT_DIR="${OUTPUT_DIR:-${THIS_DIR}/wheelhouse}"
 

From b89814e29f935988f409ff58c81bfd4deb88b43b Mon Sep 17 00:00:00 2001
From: Daniel Garvey <34486624+dan-garvey@users.noreply.github.com>
Date: Mon, 2 Dec 2024 13:02:21 -0600
Subject: [PATCH 2/3] [sharktank] Fix flaky quantizer test (#631)

Forcibly assigns the dtype to avoid a test flake in which tensor
creation selects bfloat16 for unknown reasons.
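
One plausible mechanism (an assumption, not confirmed here): if another
test changes torch's global default dtype, an unpinned float literal
inherits it, and `torch.testing.assert_close` then fails on the dtype
check before values are even compared. A minimal sketch of that failure
mode:

    import torch

    torch.set_default_dtype(torch.bfloat16)   # e.g. leaked from another test
    expected = torch.tensor([0.2, 0.4, 0.8])  # picks up bfloat16, not float32
    actual = torch.tensor([0.2, 0.4, 0.8], dtype=torch.float32)
    torch.testing.assert_close(actual, expected)  # raises: dtypes do not match

Pinning `dtype=torch.float32` on the expected tensor, as this patch does,
makes the comparison independent of any ambient default.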
Signed-off-by: dan --- sharktank/tests/types/quantizers_test.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sharktank/tests/types/quantizers_test.py b/sharktank/tests/types/quantizers_test.py index b712da06a..a5fbc2b95 100644 --- a/sharktank/tests/types/quantizers_test.py +++ b/sharktank/tests/types/quantizers_test.py @@ -72,7 +72,9 @@ def testPerAxisRoundtrip(self): ) ssq = self._roundtrip(ssq, "_ssq") self.assertEqual(ssq.axis, 1) - torch.testing.assert_close(ssq.scale, torch.tensor([0.2, 0.4, 0.8])) + torch.testing.assert_close( + ssq.scale, torch.tensor([0.2, 0.4, 0.8], dtype=torch.float32) + ) torch.testing.assert_close(ssq.reciprocal_scale, torch.tensor([5.0, 2.5, 1.25])) self.assertIs(ssq.dtype, torch.float16) From 8cd3f850f0a7864d9e004483fcf4bbea08d3e21a Mon Sep 17 00:00:00 2001 From: "Xida Ren (Cedar)" Date: Mon, 2 Dec 2024 14:14:27 -0500 Subject: [PATCH 3/3] Implement PageAllocation as a handle into a PagedAttentionCache, allowing publishing and releasing an allocation via handle rather than cache (#608) Deinitialization looks wonky for now. Will test extensively to get deinit right once I merge #600 Closes #607 --- .../shortfin_apps/llm/components/__init__.py | 0 .../llm/components/kvcache/__init__.py | 0 .../kvcache/base_attention_cache.py | 86 +++++++--- .../shortfin_apps/llm/components/messages.py | 37 ++-- .../shortfin_apps/llm/components/service.py | 59 ++++--- .../kvcache/base_attention_cache_test.py | 159 ++++++++++++++++++ 6 files changed, 266 insertions(+), 75 deletions(-) create mode 100644 shortfin/python/shortfin_apps/llm/components/__init__.py create mode 100644 shortfin/python/shortfin_apps/llm/components/kvcache/__init__.py create mode 100644 shortfin/tests/apps/llm/components/kvcache/base_attention_cache_test.py diff --git a/shortfin/python/shortfin_apps/llm/components/__init__.py b/shortfin/python/shortfin_apps/llm/components/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/shortfin/python/shortfin_apps/llm/components/kvcache/__init__.py b/shortfin/python/shortfin_apps/llm/components/kvcache/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/shortfin/python/shortfin_apps/llm/components/kvcache/base_attention_cache.py b/shortfin/python/shortfin_apps/llm/components/kvcache/base_attention_cache.py index 0007000bc..73134903c 100644 --- a/shortfin/python/shortfin_apps/llm/components/kvcache/base_attention_cache.py +++ b/shortfin/python/shortfin_apps/llm/components/kvcache/base_attention_cache.py @@ -8,9 +8,66 @@ Base class for kv caches. 
""" -from typing import List +from typing import List, Iterable, Protocol from .page_pool import PageInfo import math +from abc import ABC, abstractmethod +from .page_pool import PagePool + +# logging +import logging + +logger = logging.getLogger(__name__) + +# exception for when cache allocation failed +class CacheAllocationFailure(Exception): + pass + + +class PageAllocation(ABC): + """Abstract base class for page allocations in the cache.""" + + @property + @abstractmethod + def pages(self) -> List[PageInfo]: + """Returns the list of pages that were allocated.""" + pass + + @abstractmethod + def publish_pages(self, up_to_page_index) -> None: + """Makes pages[0:up_to_page_index] available to other requests.""" + pass + + @abstractmethod + def release_pages(self) -> None: + """Releases the allocation's reference to pages.""" + pass + + +class BasePageAttentionCacheAllocation(PageAllocation): + """Represents a page allocation in the cache.""" + + def __init__(self, pages: Iterable[PageInfo], cache: "BasePagedAttentionCache"): + self._pages = tuple(pages) + self._cache = cache + self._is_released = False + + @property + def pages(self) -> List[PageInfo]: + return list(self._pages) + + def publish_pages(self, up_to_page_index) -> None: + pass + + def release_pages(self) -> None: + if self._is_released: + logger.warning("Releasing already-released allocation") + return + self._cache.page_pool.release_pages(self._pages) + self._is_released = True + + def __rerp__(self) -> str: + return f"BasePageAttentionCacheAllocation(pages={self._pages}, cache={self._cache})" class BasePagedAttentionCache: @@ -33,13 +90,13 @@ class BasePagedAttentionCache: - Reference counting prevents eviction of in-use pages """ - def __init__(self, page_pool, tokens_per_page): + def __init__(self, page_pool: PagePool, tokens_per_page: int): self.page_pool = page_pool self.tokens_per_page = tokens_per_page def acquire_pages_for_tokens( self, tokens: List[int], extra_token_slots: int = 1 - ) -> tuple[list[PageInfo], int]: + ) -> PageAllocation: """ Given a list of tokens, return a list of pages and a start position to continue generation from. @@ -57,24 +114,7 @@ def acquire_pages_for_tokens( pages_needed = math.ceil(token_count / self.tokens_per_page) pages = self.page_pool.acquire_free_pages(pages_needed) - n_cached_tokens = 0 - - return pages, n_cached_tokens - - def publish_pages(self, tokens, pages) -> None: - """ - Given a list of tokens and pages containing KV corresponding to these tokens, make these pages available to other requests. - - Associates the tokens with the pages, and mark them as done writing. - - It is assumed that hereafter, the calling request will not modify these pages, at least not the positions [0:len(tokens)]. - """ - - pass # the base implementation doesn't cache unfinished requests. + if pages is None: + raise CacheAllocationFailure() - def release_pages(self, tokens, pages): - """ - Decrement reference count for these pages. When reference count is zero, they will be elegible for eviction. 
- """ - # in the base implementation, the pages can be owned by 1 request max, so they can be instantly release - self.page_pool.release_pages(pages) + return BasePageAttentionCacheAllocation(pages, cache=self) diff --git a/shortfin/python/shortfin_apps/llm/components/messages.py b/shortfin/python/shortfin_apps/llm/components/messages.py index c3e6fe34b..c03900782 100644 --- a/shortfin/python/shortfin_apps/llm/components/messages.py +++ b/shortfin/python/shortfin_apps/llm/components/messages.py @@ -9,7 +9,7 @@ import shortfin as sf import shortfin.array as sfnp -from .kvcache.base_attention_cache import BasePagedAttentionCache +from .kvcache.base_attention_cache import BasePagedAttentionCache, PageAllocation from .kvcache.page_pool import PageInfo @@ -43,7 +43,7 @@ def __init__(self, phase: InferencePhase, input_token_ids: list[int]): # Cache pages that have been locked for this request. self._cache: BasePagedAttentionCache | None = None - self.locked_pages: list[PageInfo] | None = None + self.allocation: PageAllocation | None = None def reset(self, phase: InferencePhase): """Resets all per request state in preparation for an subsequent execution.""" @@ -52,35 +52,22 @@ def reset(self, phase: InferencePhase): self.return_all_logits = False self.return_host_array = True self.result_logits = None + self.allocation.release_pages() + self.allocation = None def cache_page_indices(self, max_len: int) -> list[int]: - if not self.locked_pages: + if not self.allocation: return [] - indices = [p.index for p in self.locked_pages] - if len(indices) > max_len: - return indices[0:max_len] + indices = [p.index for p in self.allocation.pages[:max_len]] return indices + def publish_allocated_pages(self, up_to_page_index: int): + assert self.allocation + self.allocation.publish_pages(up_to_page_index) + def free_cache_pages(self): - cache = self._cache - if cache: - pages = self.locked_pages - self._cache = None - self.locked_pages = None - cache.release_pages(self.input_token_ids, pages) - - def lock_initial_cache_pages( - self, cache: BasePagedAttentionCache, pages: list[PageInfo] - ): - assert not self._cache - self._cache = cache - self.locked_pages = pages - - def lock_new_cache_pages( - self, cache: BasePagedAttentionCache, pages: list[PageInfo] - ): - assert self._cache is cache - self.locked_pages.extend(pages) + if self.allocation: + self.allocation.release_pages() class StrobeMessage(sf.Message): diff --git a/shortfin/python/shortfin_apps/llm/components/service.py b/shortfin/python/shortfin_apps/llm/components/service.py index 8d3cc1424..2f942aec7 100644 --- a/shortfin/python/shortfin_apps/llm/components/service.py +++ b/shortfin/python/shortfin_apps/llm/components/service.py @@ -11,8 +11,12 @@ import shortfin as sf import shortfin.array as sfnp -from .kvcache.base_attention_cache import BasePagedAttentionCache -from .kvcache.page_pool import PagePoolConfig, PagePool +from .kvcache.base_attention_cache import ( + BasePagedAttentionCache, + CacheAllocationFailure, + PageAllocation, +) +from .kvcache.page_pool import PagePoolConfig, PagePool, PageInfo from .config_struct import ModelParams from .manager import SystemManager from .messages import InferenceExecRequest, InferencePhase, StrobeMessage @@ -229,16 +233,17 @@ def board_prefills(self, cache: BasePagedAttentionCache): len(prefill_request.input_token_ids) / self.page_seq_stride ) # allocate kv cache pages - pages, cache_hit_prefix_length = cache.acquire_pages_for_tokens( - prefill_request.input_token_ids, - extra_token_slots=0, # prefill needs 
no extra kvcache slots to write to
-            )
-            if pages is None:
+            try:
+                allocation = cache.acquire_pages_for_tokens(
+                    prefill_request.input_token_ids,
+                    extra_token_slots=0,  # prefill needs no extra kvcache slots to write to
+                )
+            except CacheAllocationFailure:
                 logger.debug("Cannot fulfill request for %d pages", needed_pages)
                 continue
-            else:
-                logger.debug("Allocated %d cache pages to request", len(pages))
-                prefill_request.lock_initial_cache_pages(cache, pages)
+            logger.debug(f"Successfully acquired allocation: {allocation}")
+            prefill_request.free_cache_pages()
+            prefill_request.allocation = allocation
 
             # Can flight this request.
             exec_process.exec_requests.append(prefill_request)
@@ -266,26 +271,20 @@ def board_decodes(self, cache: BasePagedAttentionCache):
                 if len(exec_process.exec_requests) >= self.ideal_batch_size:
                     break
                 incoming_token_count = len(decode_request.input_token_ids)
-                needed_pages = math.ceil(
-                    (decode_request.start_position + incoming_token_count)
-                    / self.page_seq_stride
-                )
-                if needed_pages > len(decode_request.locked_pages):
-                    # allocate kv cache pages
-                    pages, cache_hit_prefix_length = cache.acquire_pages_for_tokens(
+
+                try:
+                    allocation = cache.acquire_pages_for_tokens(
                         decode_request.input_token_ids,
                         extra_token_slots=1,  # need 1 extra slot to write result.
                     )
-                    if pages is None:
-                        logger.debug(
-                            "Cannot fulfill decode request for %d pages", needed_pages
-                        )
-                        continue
-                    else:
-                        logger.debug(
-                            "Allocated %d cache pages to decode request", len(pages)
-                        )
-                        decode_request.lock_new_cache_pages(cache, pages)
+                except CacheAllocationFailure:
+                    logger.debug(
+                        "Cannot fulfill request for %d tokens",
+                        len(decode_request.input_token_ids),
+                    )
+                    continue
+
+                decode_request.free_cache_pages()
+                decode_request.allocation = allocation
 
                 # Can flight this request.
                 exec_process.exec_requests.append(decode_request)
@@ -438,6 +437,12 @@ async def run(self):
             # Invoke. Logits are of shape [bs, bsl, d].
             (logits,) = await fn(*args, fiber=self.fiber)
 
+            # publish cache pages
+            for r in self.exec_requests:
+                total_tokens = r.start_position + len(r.input_token_ids)
+                number_of_complete_pages = total_tokens // seq_stride
+                r.publish_allocated_pages(number_of_complete_pages)
+
             # Return results.
for i in range(req_count): req = self.exec_requests[i] diff --git a/shortfin/tests/apps/llm/components/kvcache/base_attention_cache_test.py b/shortfin/tests/apps/llm/components/kvcache/base_attention_cache_test.py new file mode 100644 index 000000000..113da6912 --- /dev/null +++ b/shortfin/tests/apps/llm/components/kvcache/base_attention_cache_test.py @@ -0,0 +1,159 @@ +import pytest +import threading +import queue +import random +import time +from collections import defaultdict +from unittest.mock import Mock +from dataclasses import dataclass +from typing import List, Optional, Set + +from shortfin_apps.llm.components.kvcache.base_attention_cache import ( + BasePagedAttentionCache, + BasePageAttentionCacheAllocation, + CacheAllocationFailure, +) +from shortfin_apps.llm.components.kvcache.page_pool import PagePool, PageInfo + +TEST_PAGE_SIZE = 16 +TEST_POOL_CAPACITY = 10 + + +class MockPagePool(PagePool): + def __init__(self, total_pages: int): + self._queue = queue.Queue() + for i in range(total_pages): + page = PageInfo(index=i, pool=self, token_offset=0, token_count=0) + self._queue.put(page) + + def acquire_free_pages(self, count: int) -> List[PageInfo]: + try: + return [self._queue.get_nowait() for _ in range(count)] + except queue.Empty: + return None + + def release_pages(self, pages): + for page in pages: + self._queue.put(page) + + +@pytest.fixture +def page_pool(): + return MockPagePool(total_pages=TEST_POOL_CAPACITY) + + +@pytest.fixture +def cache(page_pool): + return BasePagedAttentionCache(page_pool=page_pool, tokens_per_page=TEST_PAGE_SIZE) + + +# fmt: off +@pytest.mark.parametrize( + "tokens,expected_pages,case_name", + [ # Tokens Pages Case Name + ([], 0, "empty_token_list"), + (list(range(TEST_PAGE_SIZE // 2)), 1, "partial_page"), + (list(range(TEST_PAGE_SIZE)), 1, "exact_page"), + (list(range(TEST_PAGE_SIZE + 1)), 2, "just_over_one_page"), + (list(range(TEST_PAGE_SIZE * 2)), 2, "multiple_exact_pages"), + (list(range(TEST_PAGE_SIZE * 2 + 1)), 3, "multiple_pages_with_remainder"), + (list(range(TEST_PAGE_SIZE * 3)), 3, "three_exact_pages"), + (list(range(1)), 1, "single_token"), + (list(range(TEST_PAGE_SIZE - 1)), 1, "almost_full_page"), + ], +) +# fmt: on +def test_allocation_sizes(cache, tokens, expected_pages, case_name): + allocation = cache.acquire_pages_for_tokens(tokens) + pages = allocation.pages + assert len(pages) == expected_pages, f"Failed for case: {case_name}" + allocation.release_pages() + + +# fmt: off +@pytest.mark.parametrize( + "num_workers,pages_per_worker,expect_failure,case_name", + [ # Workers Pages Failure Case name + (2, 1, False, "basic_concurrent"), # Basic concurrent access + (5, 1, False, "high_concurrency"), # Higher concurrency, single page + (3, 2, False, "multi_page"), # Multiple pages per worker + (2, 3, False, "more_pages"), # More pages than workers, within capacity + (TEST_POOL_CAPACITY, 1, False, "max_capacity"), # Max capacity single pages + (TEST_POOL_CAPACITY // 2, 2, False, "max_capacity_multi"), # Max capacity multiple pages + (4, 3, True , "exceeds_total"), # 12 pages needed, exceeds capacity + (TEST_POOL_CAPACITY + 1, 1, True , "exceeds_workers"), # More workers than capacity + (TEST_POOL_CAPACITY // 2, 3, True , "exceeds_with_multi"), # Exceeds capacity with multiple pages + ], +) +# fmt: on +def test_concurrent_page_allocation( + cache, + num_workers, + pages_per_worker, + expect_failure, + case_name, +): + allocated_pages = defaultdict(set) + errors = [] + allocations = [] + + def worker(worker_id: int): + try: + tokens = 
list(range(TEST_PAGE_SIZE * pages_per_worker))
+            allocation = cache.acquire_pages_for_tokens(tokens)
+            allocations.append(allocation)
+            allocated_pages[worker_id] = {page.index for page in allocation.pages}
+            time.sleep(random.uniform(0.001, 0.01))
+        except CacheAllocationFailure as e:
+            errors.append(e)
+        except Exception as e:
+            pytest.fail(f"Unexpected error: {e}")
+
+    threads = [threading.Thread(target=worker, args=(i,)) for i in range(num_workers)]
+
+    for t in threads:
+        t.start()
+    for t in threads:
+        t.join()
+
+    if expect_failure:
+        assert len(errors) > 0, "Expected at least one CacheAllocationFailure"
+    else:
+        assert not errors, f"Workers encountered errors: {errors}"
+        for worker_id, pages in allocated_pages.items():
+            assert (
+                len(pages) == pages_per_worker
+            ), f"Worker {worker_id} got {len(pages)} pages, expected {pages_per_worker}"
+
+        all_pages = set()
+        for pages in allocated_pages.values():
+            assert not (
+                pages & all_pages
+            ), f"Found duplicate page allocation: {pages & all_pages}"
+            all_pages.update(pages)
+
+    for allocation in allocations:
+        allocation.release_pages()
+
+
+@pytest.mark.parametrize(
+    "total_pages_needed",
+    [
+        TEST_POOL_CAPACITY + 1,  # Just over capacity
+        TEST_POOL_CAPACITY * 2,  # Double capacity
+    ],
+)
+def test_allocation_failure_when_exhausted(cache, total_pages_needed):
+    successful_allocations = []
+
+    try:
+        tokens = list(range(TEST_PAGE_SIZE * total_pages_needed))
+        allocation = cache.acquire_pages_for_tokens(tokens)
+        successful_allocations.append(allocation)
+    except CacheAllocationFailure:
+        pass
+    else:
+        pytest.fail("Expected CacheAllocationFailure was not raised")
+    finally:
+        for alloc in successful_allocations:
+            alloc.release_pages()
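
A minimal usage sketch of the handle-based API that PATCH 3/3 introduces,
using only names from the diff above (`pool` and `tokens` are assumed to be
constructed elsewhere; PagePool setup is elided):

    from shortfin_apps.llm.components.kvcache.base_attention_cache import (
        BasePagedAttentionCache,
        CacheAllocationFailure,
    )

    cache = BasePagedAttentionCache(page_pool=pool, tokens_per_page=16)
    try:
        allocation = cache.acquire_pages_for_tokens(tokens, extra_token_slots=1)
    except CacheAllocationFailure:
        raise  # caller backs off and retries boarding later
    indices = [p.index for p in allocation.pages]
    allocation.publish_pages(up_to_page_index=2)  # pages[0:2] visible to others
    allocation.release_pages()  # double release only logs a warning

The request now holds one allocation handle instead of a (cache, pages) pair,
so publish/release cannot be applied to pages the request does not own.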