From 29b5b8ddd9272e5eb97778ba0a6a8b38b46326c7 Mon Sep 17 00:00:00 2001
From: Richard Edgar
Date: Wed, 20 Mar 2024 17:44:46 -0400
Subject: [PATCH] Add Llama to model test matrix (#703)

Working to get Llama models into the test matrix.

---------

Co-authored-by: Harsha-Nori
---
 .github/workflows/unit_tests.yml |  5 ++++-
 tests/conftest.py                | 15 +++++++++++----
 tests/library/test_gen.py        | 12 +++---------
 tests/utils.py                   | 14 +++++++++++++-
 4 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index 2c7f0c063..14018c7e1 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -14,7 +14,7 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest, windows-latest, macos-latest]
         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
-        model: ["gpt2cpu", "phi2cpu"]
+        model: ["gpt2cpu", "phi2cpu", "hfllama7b"]
     runs-on: ${{ matrix.os }}
     steps:
@@ -30,6 +30,9 @@ jobs:
           pip install pytest
           pip install -e .[test]
           if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+      - name: Install model-specific dependencies
+        run: |
+          pip install llama-cpp-python
       - name: Run tests (except server)
         run: |
           pytest --cov=guidance --cov-report=xml --cov-report=term-missing --selected_model ${{ matrix.model }} -m "not server" ./tests/
diff --git a/tests/conftest.py b/tests/conftest.py
index 56c7beee9..ef949db35 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -9,6 +9,10 @@
     "phi2cpu": dict(
         name="transformers:microsoft/phi-2", kwargs={"trust_remote_code": True}
     ),
+    "hfllama7b": dict(
+        name="huggingface_hubllama:TheBloke/Llama-2-7B-GGUF:llama-2-7b.Q5_K_M.gguf",
+        kwargs={"verbose": True},
+    ),
     "gpt2gpu": dict(name="transformers:gpt2", kwargs={"device_map": "cuda:0"}),
     "phi2gpu": dict(
         name="transformers:microsoft/phi-2",
@@ -28,7 +32,12 @@ def pytest_addoption(parser):
 
 
 @pytest.fixture(scope="session")
-def selected_model(pytestconfig) -> models.Model:
+def selected_model_name(pytestconfig) -> str:
+    return pytestconfig.getoption("selected_model")
+
+
+@pytest.fixture(scope="session")
+def selected_model(selected_model_name: str) -> models.Model:
     """Get a concrete model for tests
 
     This fixture is for tests which are supposed
@@ -41,9 +50,7 @@ def selected_model(pytestconfig) -> models.Model:
     controlled by the '--selected_model' command
     line argument to pytest.
""" - model_key = pytestconfig.getoption("selected_model") - - model_info = AVAILABLE_MODELS[model_key] + model_info = AVAILABLE_MODELS[selected_model_name] model = get_model(model_info["name"], **(model_info["kwargs"])) return model diff --git a/tests/library/test_gen.py b/tests/library/test_gen.py index 09032d216..fdd719291 100644 --- a/tests/library/test_gen.py +++ b/tests/library/test_gen.py @@ -52,10 +52,6 @@ def test_unicode(selected_model): lm + f'Step {i}:' + gen('steps', list_append=True, stop=['\nStep', '\n\n', '\nAnswer'], temperature=0.7, max_tokens=20) + '\n' def test_unicode2(selected_model): - # Does not work with Phi2 - model_type = type(selected_model.engine.model_obj).__name__ - if model_type == "PhiForCausalLM": - pytest.xfail("See https://github.com/guidance-ai/guidance/issues/681") lm = selected_model prompt = 'Janet’s ducks lay 16 eggs per day' lm += prompt + gen(max_tokens=10) @@ -163,11 +159,9 @@ def test_various_regexes(selected_model: models.Model, prompt: str, pattern: str # note we can't just test any regex pattern like this, we need them to have finished in less than 40 tokens assert re.match(pattern, lm2["test"], re.DOTALL) is not None -def test_long_prompt(selected_model): - # Does not work with Phi2 - model_type = type(selected_model.engine.model_obj).__name__ - if model_type == "PhiForCausalLM": - pytest.xfail("See https://github.com/guidance-ai/guidance/issues/681") +def test_long_prompt(selected_model, selected_model_name): + if selected_model_name == "hfllama7b": + pytest.xfail("Insufficient context window in model") lm = selected_model prompt = '''Question: Legoland has 5 kangaroos for each koala. If Legoland has 180 kangaroos, how many koalas and kangaroos are there altogether? Let's think step by step, and then write the answer: diff --git a/tests/utils.py b/tests/utils.py index cde04e1c5..c01349647 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,8 +1,9 @@ -import json import os from typing import Any +from huggingface_hub import hf_hub_download + import pytest import guidance @@ -18,6 +19,11 @@ def get_model(model_name, caching=False, **kwargs): return get_transformers_model(model_name[13:], caching, **kwargs) elif model_name.startswith("llama_cpp:"): return get_llama_cpp_model(model_name[10:], caching, **kwargs) + elif model_name.startswith("huggingface_hubllama"): + name_parts = model_name.split(":") + return get_llama_hugging_face_model( + repo_id=name_parts[1], filename=name_parts[2], **kwargs + ) def get_openai_model(model_name, caching=False, **kwargs): @@ -35,6 +41,12 @@ def get_openai_model(model_name, caching=False, **kwargs): return lm +def get_llama_hugging_face_model(repo_id: str, filename: str, **kwargs): + downloaded_file = hf_hub_download(repo_id=repo_id, filename=filename) + lm = guidance.models.LlamaCpp(downloaded_file, **kwargs) + return lm + + transformers_model_cache = {}