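Refactor: normalize formatting, import BaseClassifier/Score from core, and feed the target response back into the red-team loop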

hupe1980 · Apr 4, 2024 · 43823f0
1 parent 45c6f35
commit 43823f0
Showing 21 changed files with 133 additions and 100 deletions.
diff --git a/aisploit/classifier/huggingface/hub_prompt_injection_identifier.py b/aisploit/classifier/huggingface/hub_prompt_injection_identifier.py
@@ -1,4 +1,5 @@
 from ...core import BaseClassifier
 
+
 class HubPromptIjectionIdentifier(BaseClassifier):
     pass
diff --git a/aisploit/classifier/huggingface/pipeline_prompt_injection_identifier.py b/aisploit/classifier/huggingface/pipeline_prompt_injection_identifier.py
@@ -6,6 +6,7 @@
 
 from ...core import BaseClassifier, Score
 
+
 class PipelinePromptInjectionIdentifier(BaseClassifier):
     def __init__(
         self,

diff --git a/aisploit/converter/__init__.py b/aisploit/converter/__init__.py
@@ -1,9 +1,13 @@
 from .base64 import Base64Converter
-from .keyboard_typo import KeyboardTypoConverter, KEYBOARD_NEIGHBORS_QWERTY, KEYBOARD_NEIGHBORS_QWERTZ
+from .keyboard_typo import (
+    KeyboardTypoConverter,
+    KEYBOARD_NEIGHBORS_QWERTY,
+    KEYBOARD_NEIGHBORS_QWERTZ,
+)
 
 __all__ = [
     "Base64Converter",
     "KeyboardTypoConverter",
     "KEYBOARD_NEIGHBORS_QWERTY",
-    "KEYBOARD_NEIGHBORS_QWERTZ"
-]
+    "KEYBOARD_NEIGHBORS_QWERTZ",
+]
diff --git a/aisploit/converter/base64.py b/aisploit/converter/base64.py
@@ -2,6 +2,10 @@
 
 from ..core import BaseConverter
 
+
 class Base64Converter(BaseConverter):
     def convert(self, prompts: list[str]) -> list[str]:
-        return [base64.b64encode(prompt.encode("utf-8")).decode("utf-8") for prompt in prompts]
+        return [
+            base64.b64encode(prompt.encode("utf-8")).decode("utf-8")
+            for prompt in prompts
+        ]
diff --git a/aisploit/converter/keyboard_typo.py b/aisploit/converter/keyboard_typo.py
@@ -3,70 +3,67 @@
 from ..core import BaseConverter
 
 KEYBOARD_NEIGHBORS_QWERTZ = {
-    'q': ['w', 'a', 's'],
-    'w': ['q', 'e', 's', 'd'],
-    'e': ['w', 'r', 'd', 'f'],
-    'r': ['e', 't', 'f', 'g'],
-    't': ['r', 'z', 'g', 'h'],
-    'z': ['t', 'u', 'h', 'j'],
-    'u': ['z', 'i', 'j', 'k'],
-    'i': ['u', 'o', 'k', 'l'],
-    'o': ['i', 'p', 'l'],
-
-    'a': ['q', 's', 'y'],
-    's': ['a', 'w', 'e', 'd', 'x', 'y'],
-    'd': ['s', 'e', 'r', 'f', 'c', 'x'],
-    'f': ['d', 'r', 't', 'g', 'v', 'c'],
-    'g': ['f', 't', 'z', 'h', 'b', 'v'],
-    'h': ['g', 'z', 'u', 'j', 'n', 'b'],
-    'j': ['h', 'u', 'i', 'k', 'm', 'n'],
-    'k': ['j', 'i', 'o', 'l', 'm'],
-    'l': ['k', 'o', 'p'],
-
-    'y': ['a', 's', 'x'],
-    'x': ['z', 's', 'd', 'c'],
-    'c': ['x', 'd', 'f', 'v'],
-    'v': ['c', 'f', 'g', 'b'],
-    'b': ['v', 'g', 'h', 'n'],
-    'n': ['b', 'h', 'j', 'm'],
-    'm': ['n', 'j', 'k'],
+    "q": ["w", "a", "s"],
+    "w": ["q", "e", "s", "d"],
+    "e": ["w", "r", "d", "f"],
+    "r": ["e", "t", "f", "g"],
+    "t": ["r", "z", "g", "h"],
+    "z": ["t", "u", "h", "j"],
+    "u": ["z", "i", "j", "k"],
+    "i": ["u", "o", "k", "l"],
+    "o": ["i", "p", "l"],
+    "a": ["q", "s", "y"],
+    "s": ["a", "w", "e", "d", "x", "y"],
+    "d": ["s", "e", "r", "f", "c", "x"],
+    "f": ["d", "r", "t", "g", "v", "c"],
+    "g": ["f", "t", "z", "h", "b", "v"],
+    "h": ["g", "z", "u", "j", "n", "b"],
+    "j": ["h", "u", "i", "k", "m", "n"],
+    "k": ["j", "i", "o", "l", "m"],
+    "l": ["k", "o", "p"],
+    "y": ["a", "s", "x"],
+    "x": ["z", "s", "d", "c"],
+    "c": ["x", "d", "f", "v"],
+    "v": ["c", "f", "g", "b"],
+    "b": ["v", "g", "h", "n"],
+    "n": ["b", "h", "j", "m"],
+    "m": ["n", "j", "k"],
 }
 
 KEYBOARD_NEIGHBORS_QWERTY = {
-    'q': ['w', 'a', 's'],
-    'w': ['q', 'a', 's', 'd', 'e'],
-    'e': ['w', 's', 'd', 'f', 'r'],
-    'r': ['e', 'd', 'f', 'g', 't'],
-    't': ['r', 'f', 'g', 'h', 'y'],
-    'y': ['t', 'g', 'h', 'j', 'u'],
-    'u': ['y', 'h', 'j', 'k', 'i'],
-    'i': ['u', 'j', 'k', 'l', 'o'],
-    'o': ['i', 'k', 'l', 'p'],
-    'p': ['o', 'l'],
-
-    'a': ['q', 'w', 's', 'z'],
-    's': ['q', 'w', 'e', 'a', 'd', 'z', 'x'],
-    'd': ['w', 'e', 'r', 's', 'f', 'x', 'c'],
-    'f': ['e', 'r', 't', 'd', 'g', 'c', 'v'],
-    'g': ['r', 't', 'y', 'f', 'h', 'v', 'b'],
-    'h': ['t', 'y', 'u', 'g', 'j', 'b', 'n'],
-    'j': ['y', 'u', 'i', 'h', 'k', 'n', 'm'],
-    'k': ['u', 'i', 'o', 'j', 'l', 'm'],
-    'l': ['i', 'o', 'p', 'k'],
-
-    'z': ['a', 's', 'x'],
-    'x': ['z', 's', 'd', 'c'],
-    'c': ['x', 'd', 'f', 'v'],
-    'v': ['c', 'f', 'g', 'b'],
-    'b': ['v', 'g', 'h', 'n'],
-    'n': ['b', 'h', 'j', 'm'],
-    'm': ['n', 'j', 'k'],
+    "q": ["w", "a", "s"],
+    "w": ["q", "a", "s", "d", "e"],
+    "e": ["w", "s", "d", "f", "r"],
+    "r": ["e", "d", "f", "g", "t"],
+    "t": ["r", "f", "g", "h", "y"],
+    "y": ["t", "g", "h", "j", "u"],
+    "u": ["y", "h", "j", "k", "i"],
+    "i": ["u", "j", "k", "l", "o"],
+    "o": ["i", "k", "l", "p"],
+    "p": ["o", "l"],
+    "a": ["q", "w", "s", "z"],
+    "s": ["q", "w", "e", "a", "d", "z", "x"],
+    "d": ["w", "e", "r", "s", "f", "x", "c"],
+    "f": ["e", "r", "t", "d", "g", "c", "v"],
+    "g": ["r", "t", "y", "f", "h", "v", "b"],
+    "h": ["t", "y", "u", "g", "j", "b", "n"],
+    "j": ["y", "u", "i", "h", "k", "n", "m"],
+    "k": ["u", "i", "o", "j", "l", "m"],
+    "l": ["i", "o", "p", "k"],
+    "z": ["a", "s", "x"],
+    "x": ["z", "s", "d", "c"],
+    "c": ["x", "d", "f", "v"],
+    "v": ["c", "f", "g", "b"],
+    "b": ["v", "g", "h", "n"],
+    "n": ["b", "h", "j", "m"],
+    "m": ["n", "j", "k"],
 }
 
+
 class KeyboardTypoConverter(BaseConverter):
     def __init__(
-        self, 
-        *, 
+        self,
+        *,
         keyboard_neighbors=KEYBOARD_NEIGHBORS_QWERTY,
         typo_probability=0.1,
         random_state=None,
@@ -75,13 +72,16 @@ def __init__(
         self._typo_probability = typo_probability
         if random_state is not None:
             random.seed(random_state)
-     
+
     def convert(self, prompts: list[str]) -> list[str]:
         typoPrompts = []
         for prompt in prompts:
             typoPrompt = ""
             for char in prompt:
-                if random.random() < self._typo_probability and char.lower() in self._keyboard_neighbors:
+                if (
+                    random.random() < self._typo_probability
+                    and char.lower() in self._keyboard_neighbors
+                ):
                     # Replace the character with a random neighboring key
                     neighbor_keys = self._keyboard_neighbors[char.lower()]
                     typo_char = random.choice(neighbor_keys)
@@ -92,5 +92,5 @@ def convert(self, prompts: list[str]) -> list[str]:
                 typoPrompt += char
 
             typoPrompts.append(typoPrompt)
-            
-        return typoPrompts
+
+        return typoPrompts
diff --git a/aisploit/core/callbacks.py b/aisploit/core/callbacks.py
@@ -8,11 +8,13 @@ def on_redteam_attempt(self, attempt: int, prompt: str):
     def on_redteam_attempt_response(self, attempt: int, response: str):
         pass
 
+
 Callbacks = List[BaseCallbackHandler]
 
+
 class CallbackManager:
     def __init__(
-        self, 
+        self,
         *,
         id: str,
         callbacks: List[BaseCallbackHandler] = [],
@@ -26,4 +28,4 @@ def on_redteam_attempt(self, attempt: int, prompt: str):
 
     def on_redteam_attempt_response(self, attempt: int, response: str):
         for cb in self._callbacks:
-            cb.on_redteam_attempt_response(attempt, response)
+            cb.on_redteam_attempt_response(attempt, response)
diff --git a/aisploit/core/classifier.py b/aisploit/core/classifier.py
@@ -2,15 +2,17 @@
 from typing import Literal
 from dataclasses import dataclass
 
+
 @dataclass
 class Score:
     score_type: Literal["int", "float", "str", "bool"]
     score_value: int | float | str | bool
     score_description: str = ""
     score_explanation: str = ""
 
+
 class BaseClassifier(ABC):
     @abstractmethod
     def score_text(self, text: str) -> Score:
         """Score the text and return a Score object."""
-        raise NotImplementedError("score_text method not implemented")
+        raise NotImplementedError("score_text method not implemented")
diff --git a/aisploit/core/converter.py b/aisploit/core/converter.py
@@ -1,6 +1,7 @@
 from abc import ABC, abstractmethod
 
+
 class BaseConverter(ABC):
     @abstractmethod
     def convert(self, prompts: list[str]) -> list[str]:
-            pass
+        pass
diff --git a/aisploit/core/job.py b/aisploit/core/job.py
@@ -1,5 +1,6 @@
 from abc import ABC
 
+
 class BaseJob(ABC):
     def __init__(self, *, verbose=False) -> None:
         self.verbose = verbose
diff --git a/aisploit/core/model.py b/aisploit/core/model.py
@@ -5,14 +5,17 @@
 from langchain_core.runnables import Runnable
 from langchain_core.embeddings import Embeddings
 
+
 class BaseLLM(Runnable[LanguageModelInput, str]):
     pass
 
+
 class BaseChatModel(Runnable[LanguageModelInput, BaseMessage]):
     @abstractmethod
     def supports_functions(self) -> bool:
         pass
 
+
 BaseModel = Union[BaseLLM, BaseChatModel]
 
-BaseEmbeddings = Embeddings
+BaseEmbeddings = Embeddings
diff --git a/aisploit/core/target.py b/aisploit/core/target.py
@@ -1,6 +1,7 @@
 from abc import ABC, abstractmethod
 
+
 class BaseTarget(ABC):
     @abstractmethod
     def send_prompt(self, prompt: str) -> str:
-        pass
+        pass
diff --git a/aisploit/dataset/__init__.py b/aisploit/dataset/__init__.py
@@ -2,4 +2,4 @@
 
 __all__ = [
     "Dataset",
-]
+]
diff --git a/aisploit/dataset/dataset.py b/aisploit/dataset/dataset.py
@@ -1,3 +1,3 @@
-class Dataset():
+class Dataset:
     def __init__(self) -> None:
-        pass
+        pass
diff --git a/aisploit/demo/gandalf.py b/aisploit/demo/gandalf.py
@@ -7,8 +7,7 @@
 from langchain_core.callbacks.manager import CallbackManagerForLLMRun
 from langchain_core.messages import SystemMessage, HumanMessage
 
-from aisploit.core import BaseChatModel
-from aisploit.classifier import BaseClassifier, Score
+from aisploit.core import BaseChatModel, BaseClassifier, Score
 
 GANDALF_API_ENDPOINT = "https://gandalf.lakera.ai/api"
 

diff --git a/aisploit/embedding/__init__.py b/aisploit/embedding/__init__.py
@@ -2,6 +2,6 @@
 from .openai import OpenAIEmbeddings
 
 __all__ = [
-    "OllamaEmbeddings", 
+    "OllamaEmbeddings",
     "OpenAIEmbeddings",
 ]
diff --git a/aisploit/llm/__init__.py b/aisploit/llm/__init__.py
@@ -2,6 +2,6 @@
 from .chat_openai import ChatOpenAI
 
 __all__ = [
-    "ChatOllama", 
+    "ChatOllama",
     "ChatOpenAI",
 ]
diff --git a/aisploit/llm/chat_openai.py b/aisploit/llm/chat_openai.py
@@ -4,6 +4,7 @@
 
 from ..core import BaseChatModel
 
+
 class ChatOpenAI(LangchainChatOpenAI, BaseChatModel):
     """
     Wrapper class for interacting with the OpenAI API for chat-based models.

diff --git a/aisploit/redteam/job.py b/aisploit/redteam/job.py
@@ -1,12 +1,18 @@
 from typing import List
 
 from .bot import RedTeamingBot
-from ..core import BaseJob, BaseTarget, BaseCallbackHandler, CallbackManager
-from ..classifier import BaseClassifier
+from ..core import (
+    BaseClassifier,
+    BaseJob,
+    BaseTarget,
+    BaseCallbackHandler,
+    CallbackManager,
+)
+
 
 class RedTeamJob(BaseJob):
     def __init__(
-        self, 
+        self,
         *,
         bot: RedTeamingBot,
         target: BaseTarget,
@@ -16,7 +22,7 @@ def __init__(
         verbose=False,
     ) -> None:
         super().__init__(verbose=verbose)
-        
+
         self._bot = bot
         self._target = target
         self._classifier = classifier
@@ -29,24 +35,24 @@ def __init__(
     @property
     def conversation_id(self):
         return self._bot.conversation_id
-    
+
     def execute(self, max_attempt=5, clear_history=True):
         if clear_history:
             self._bot.clear_history()
 
         current_prompt = self._initial_prompt
-        
-        for attempt in range(1, max_attempt+1):
+
+        for attempt in range(1, max_attempt + 1):
             current_prompt = self._bot.invoke(current_prompt)
-            
+
             self._callback_manager.on_redteam_attempt(attempt, current_prompt)
-            
+
             response = self._target.send_prompt(current_prompt)
             score = self._classifier.score_text(text=response)
 
             self._callback_manager.on_redteam_attempt_response(attempt, response)
-
+
+            current_prompt = response
+
             if score.score_value:
                 return score
-
-