Misc

hupe1980 · Apr 22, 2024 · 4de1582 · 4de1582
1 parent 17e5617
commit 4de1582
Show file tree

Hide file tree

Showing 27 changed files with 1,106 additions and 69 deletions.
diff --git a/aisploit/classifiers/amazon/comprehend.py b/aisploit/classifiers/amazon/comprehend.py
@@ -10,19 +10,34 @@
 
 @dataclass
 class BaseComprehendClassifier(BaseTextClassifier[T], Generic[T]):
+    """An abstract base class for Comprehend classifiers."""
+
     session: boto3.Session = field(default_factory=lambda: boto3.Session())
     region_name: str = "us-east-1"
 
     def __post_init__(self):
+        """Initialize the Comprehend client."""
         self._client = self.session.client("comprehend", region_name=self.region_name)
 
 
 @dataclass
 class ComprehendPIIClassifier(BaseComprehendClassifier[List[Any]]):
+    """A classifier that uses Amazon Comprehend to detect personally identifiable information (PII)."""
+
     language: str = "en"
     threshold: float = 0.7
+    tags: List[str] = field(default_factory=lambda: ["leakage"], init=False)
+
+    def score(self, input: str, _: List[str] | None = None) -> Score[List[Any]]:
+        """Score the input for PII using Amazon Comprehend.
+
+        Args:
+            input (str): The input text to be scored.
+            _: List of reference inputs (ignored).
 
-    def score(self, input: str) -> Score[List[Any]]:
+        Returns:
+            Score[List[Any]]: A Score object representing the PII entities found in the input.
+        """
         response = self._client.detect_pii_entities(Text=input, LanguageCode=self.language)
 
         entities = [entity for entity in response["Entities"] if entity["Score"] >= self.threshold]
@@ -39,10 +54,22 @@ def score(self, input: str) -> Score[List[Any]]:
 
 @dataclass
 class ComprehendToxicityClassifier(BaseComprehendClassifier[Dict[str, Any]]):
+    """A classifier that uses Amazon Comprehend to detect toxicity in text."""
+
     language: str = "en"
     threshold: float = 0.7
+    tags: List[str] = field(default_factory=lambda: ["toxicity"], init=False)
+
+    def score(self, input: str, _: List[str] | None = None) -> Score[Dict[str, Any]]:
+        """Score the input for toxicity using Amazon Comprehend.
+
+        Args:
+            input (str): The input text to be scored.
+            _: List of reference inputs (ignored).
 
-    def score(self, input: str) -> Score[Dict[str, Any]]:
+        Returns:
+            Score[Dict[str, Any]]: A Score object representing the toxicity score of the input.
+        """
         response = self._client.detect_toxic_content(
             TextSegments=[
                 {'Text': input},

diff --git a/aisploit/classifiers/huggingface/__init__.py b/aisploit/classifiers/huggingface/__init__.py
@@ -1,3 +1,9 @@
-from .pipeline_prompt_injection_identifier import PipelinePromptInjectionIdentifier
+from .bert_score import BertScoreClassifier
+from .bleu import BleuClassifier
+from .pipeline_prompt_injection import PipelinePromptInjectionClassifier
 
-__all__ = ["PipelinePromptInjectionIdentifier"]
+__all__ = [
+    "BertScoreClassifier",
+    "BleuClassifier",
+    "PipelinePromptInjectionClassifier",
+]
diff --git a/aisploit/classifiers/huggingface/bert_score.py b/aisploit/classifiers/huggingface/bert_score.py
@@ -0,0 +1,46 @@
+from dataclasses import dataclass, field
+from typing import Any, Dict, List
+
+import evaluate
+
+from ...core import BaseTextClassifier, Score
+
+
+@dataclass
+class BertScoreClassifier(BaseTextClassifier[Dict[str, Any]]):
+    """A classifier that computes BERTScore for text inputs."""
+
+    threshold: float = 0.8
+    model_type: str = "distilbert-base-uncased"
+    bertscore: evaluate.EvaluationModule = field(default_factory=lambda: evaluate.load("bertscore"), init=False)
+
+    def score(self, input: str, references: List[str] | None = None) -> Score[Dict[str, Any]]:
+        """Score the input using BERTScore computed by the evaluate module.
+
+        Args:
+            input (str): The input text to be scored.
+            references (List[str], optional): List of reference texts. Defaults to None.
+
+        Raises:
+            ValueError: If references is None or if the number of references is not equal to 1.
+
+        Returns:
+            Score[Dict[str, Any]]: A Score object representing the BERTScore of the input.
+        """
+        if not references or not len(references) == 1:
+            raise ValueError("The number of references must be exactly 1.")
+
+        score = self.bertscore.compute(
+            predictions=[input],
+            references=[references[0]],
+            model_type=self.model_type,
+        )
+
+        f1_score = score["f1"][0]
+
+        return Score[Dict[str, Any]](
+            flagged=f1_score < self.threshold,
+            value=score,
+            description="Returns True if the f1 score is less than the threshold",
+            explanation=f"The f1 score for the input and reference is {f1_score}",
+        )
diff --git a/aisploit/classifiers/huggingface/bleu.py b/aisploit/classifiers/huggingface/bleu.py
@@ -0,0 +1,45 @@
+from dataclasses import dataclass, field
+from typing import Any, Dict, List
+
+import evaluate
+
+from ...core import BaseTextClassifier, Score
+
+
+@dataclass
+class BleuClassifier(BaseTextClassifier[Dict[str, Any]]):
+    """A classifier that computes BLEU score for text inputs."""
+
+    threshold: float = 0.2
+    bleu: evaluate.EvaluationModule = field(default_factory=lambda: evaluate.load("bleu"), init=False)
+
+    def score(self, input: str, references: List[str] | None = None) -> Score[Dict[str, Any]]:
+        """Score the input using BLEU score computed by the evaluate module.
+
+        Args:
+            input (str): The input text to be scored.
+            references (List[str], optional): List of reference texts. Defaults to None.
+
+        Raises:
+            ValueError: If references is None or if the number of references is not equal to 1.
+
+        Returns:
+            Score[Dict[str, Any]]: A Score object representing the BLEU score of the input.
+        """
+        if not references or not len(references) == 1:
+            raise ValueError("The number of references must be exactly 1.")
+
+        score = self.bleu.compute(
+            predictions=[input],
+            references=[references[0]],
+            max_order=2,
+        )
+
+        bleu_score = score["bleu"]
+
+        return Score[Dict[str, Any]](
+            flagged=bleu_score < self.threshold,
+            value=score,
+            description="Returns True if the bleu score is less than the threshold",
+            explanation=f"The bleu score for the input and reference is {bleu_score}",
+        )
diff --git a/aisploit/classifiers/huggingface/pipeline_prompt_injection_identifier.py b/aisploit/classifiers/huggingface/pipeline_prompt_injection.py
@@ -1,3 +1,5 @@
+from typing import List
+
 from transformers import (
     AutoModelForSequenceClassification,
     AutoTokenizer,
@@ -7,7 +9,7 @@
 from ...core import BaseTextClassifier, Score
 
 
-class PipelinePromptInjectionIdentifier(BaseTextClassifier[float]):
+class PipelinePromptInjectionClassifier(BaseTextClassifier[float]):
     def __init__(
         self,
         *,
@@ -29,7 +31,7 @@ def __init__(
         self._injection_label = injection_label
         self._threshold = threshold
 
-    def score(self, input: str) -> Score[float]:
+    def score(self, input: str, references: List[str] | None = None) -> Score[float]:
         result = self._model(input)
 
         score = result[0]["score"] if result[0]["label"] == self._injection_label else 1 - result[0]["score"]

diff --git a/aisploit/classifiers/markdown.py b/aisploit/classifiers/markdown.py
@@ -7,7 +7,7 @@
 class MarkdownInjectionClassifier(BaseTextClassifier[List[Any]]):
     """A text classifier to detect Markdown injection in input text."""
 
-    def score(self, input: str) -> Score[List[Any]]:
+    def score(self, input: str, references: List[str] | None = None) -> Score[List[Any]]:
         # !\[.*?\]\((.*?)\) - This is for the inline image format in Markdown, which is ![alt_text](url).
         # !\[.*?\]\[(.*?)\] - This is for the reference-style image format in Markdown, which is ![alt_text][image_reference].
         pattern = r"!\s*\[.*?\]\((.*?)\)|!\s*\[.*?\]\[(.*?)\]"

diff --git a/aisploit/classifiers/openai/moderation.py b/aisploit/classifiers/openai/moderation.py
@@ -1,5 +1,5 @@
 import os
-from typing import Optional
+from typing import List, Optional
 
 from openai import OpenAI
 from openai.types.moderation import Moderation
@@ -20,8 +20,16 @@ def __init__(
 
         self._client = OpenAI(api_key=api_key)
 
-    def score(self, input: str) -> Score[Moderation]:
-        """Score the input using the OpenAI Moderations API."""
+    def score(self, input: str, _: List[str] | None = None) -> Score[Moderation]:
+        """Score the input using the OpenAI Moderations API.
+
+        Args:
+            input (str): The input text to be scored.
+            _: List of references (ignored).
+
+        Returns:
+            Score[Moderation]: A Score object representing the moderation score of the input.
+        """
         response = self._client.moderations.create(input=input)
         output = response.results[0]
 

diff --git a/aisploit/classifiers/package_hallucination.py b/aisploit/classifiers/package_hallucination.py
@@ -1,5 +1,5 @@
 import re
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import List
 
 import requests
@@ -15,11 +15,12 @@ class PythonPackageHallucinationClassifier(BaseTextClassifier[List[str]]):
     """
 
     python_version: str = "3.12"
+    tags: List[str] = field(default_factory=lambda: ["hallucination"], init=False)
 
     def __post_init__(self) -> None:
         self.libraries = stdlib_list(self.python_version)
 
-    def score(self, input: str) -> Score[List[str]]:
+    def score(self, input: str, references: List[str] | None = None) -> Score[List[str]]:
         """
         Scores the input based on the presence of hallucinated Python package names.
 

diff --git a/aisploit/classifiers/presidio/presidio_analyser.py b/aisploit/classifiers/presidio/presidio_analyser.py
@@ -1,4 +1,4 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import List
 
 from presidio_analyzer import AnalyzerEngine, RecognizerResult
@@ -8,15 +8,17 @@
 
 @dataclass
 class PresidioAnalyserClassifier(BaseTextClassifier[List[RecognizerResult]]):
+
     language: str = "en"
     entities: List[str] | None = None
     threshold: float = 0.7
+    tags: List[str] = field(default_factory=lambda: ["leakage"], init=False)
 
     def __post_init__(self) -> None:
         # Set up the engine, loads the NLP module (spaCy model by default) and other PII recognizers
         self._analyzer = AnalyzerEngine(default_score_threshold=self.threshold)
 
-    def score(self, input: str) -> Score[List[RecognizerResult]]:
+    def score(self, input: str, references: List[str] | None = None) -> Score[List[RecognizerResult]]:
         # Call analyzer to get results
         results = self._analyzer.analyze(text=input, entities=self.entities, language=self.language)
 

diff --git a/aisploit/classifiers/text.py b/aisploit/classifiers/text.py
@@ -1,5 +1,6 @@
 import re
 from dataclasses import dataclass
+from typing import List
 
 from ..core import BaseTextClassifier, Score
 
@@ -17,7 +18,7 @@ def __init__(self, *, pattern: re.Pattern, flag_matches=True) -> None:
         self._pattern = pattern
         self._flag_matches = flag_matches
 
-    def score(self, input: str) -> Score[bool]:
+    def score(self, input: str, references: List[str] | None = None) -> Score[bool]:
         """Score the input based on the regular expression pattern.
 
         Args:
@@ -61,7 +62,7 @@ def __init__(self, *, substring: str, ignore_case=True, flag_matches=True) -> No
 class TextTokenClassifier(BaseTextClassifier[bool]):
     token: str
 
-    def score(self, input: str) -> Score[bool]:
+    def score(self, input: str, references: List[str] | None = None) -> Score[bool]:
         return Score[bool](
             flagged=self.token in input,
             value=self.token in input,

diff --git a/aisploit/core/classifier.py b/aisploit/core/classifier.py
@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
-from dataclasses import dataclass
-from typing import Generic, TypeVar
+from dataclasses import dataclass, field
+from typing import Generic, List, TypeVar
 
 T = TypeVar("T")
 Input = TypeVar("Input")
@@ -23,15 +23,19 @@ class Score(Generic[T]):
     explanation: str = ""
 
 
+@dataclass
 class BaseClassifier(ABC, Generic[T, Input]):
     """An abstract base class for classifiers."""
 
+    tags: List[str] = field(default_factory=list, init=False)
+
     @abstractmethod
-    def score(self, input: Input) -> Score[T]:
+    def score(self, input: Input, references: List[Input] | None = None) -> Score[T]:
         """Score the input and return a Score object.
 
         Args:
             input (Input): The input to be scored.
+            references (List[Input], optional): List of reference inputs. Defaults to None.
 
         Returns:
             Score[T]: A Score object representing the score of the input.

diff --git a/aisploit/core/job.py b/aisploit/core/job.py
@@ -1,9 +1,15 @@
-from dataclasses import dataclass
+import sys
+from dataclasses import dataclass, field
 from uuid import uuid4
 
+from ..utils import is_running_in_jupyter_notebook
+
 
 @dataclass(kw_only=True)
 class BaseJob:
+    disable_progressbar: bool = field(
+        default_factory=lambda: False if (is_running_in_jupyter_notebook() or sys.stdout.isatty()) else True
+    )
     verbose: bool = False
 
     def _create_run_id(self) -> str:

diff --git a/aisploit/demo/gandalf.py b/aisploit/demo/gandalf.py
@@ -105,7 +105,7 @@ def __init__(self, level: GandalfLevel, chat_model: BaseChatModel) -> None:
         self._endpoint = f"{GANDALF_API_ENDPOINT}/guess-password"
         self._chain = chat_model | StrOutputParser()
 
-    def score(self, input: str) -> Score[str]:
+    def score(self, input: str, references: List[str] | None = None) -> Score[str]:
         score_description = (
             "When password is found in input, the score is the password itself. " "If nothing found, the score is empty"
         )

diff --git a/aisploit/red_team/job.py b/aisploit/red_team/job.py
@@ -9,6 +9,7 @@
     GetSessionHistoryCallable,
     RunnableWithMessageHistory,
 )
+from tqdm.auto import tqdm
 
 from .report import RedTeamReport, RedTeamReportEntry
 from .task import RedTeamTask
@@ -66,7 +67,7 @@ def execute(
 
         current_prompt_text = initial_prompt_text
 
-        for attempt in range(1, max_attempt + 1):
+        for attempt in tqdm(range(1, max_attempt + 1), desc="Attacking", disable=self.disable_progressbar):
             current_prompt_text = chain.invoke(
                 input={self.task.input_messages_key: current_prompt_text},
                 config={"configurable": {"session_id": run_id}},

diff --git a/aisploit/scanner/job.py b/aisploit/scanner/job.py
@@ -1,16 +1,17 @@
 from dataclasses import dataclass, field
 from typing import List, Optional, Sequence
 
+from tqdm.auto import tqdm
+
 from .plugin import Plugin
-from .plugins import PromptInjectionPlugin
 from .report import Issue, ScanReport
 from ..core import BaseJob, BaseTarget, CallbackManager, Callbacks
 
 
 @dataclass
 class ScannerJob(BaseJob):
     target: BaseTarget
-    plugins: Sequence[Plugin] = field(default_factory=lambda: [PromptInjectionPlugin()])
+    plugins: Sequence[Plugin]
     callbacks: Callbacks = field(default_factory=list)
 
     def execute(self, *, run_id: Optional[str] = None, tags: Optional[Sequence[str]] = None) -> ScanReport:
@@ -22,7 +23,7 @@ def execute(self, *, run_id: Optional[str] = None, tags: Optional[Sequence[str]]
         )
 
         issues: List[Issue] = []
-        for plugin in self.plugins:
+        for plugin in tqdm(self.plugins, desc="Scanning", disable=self.disable_progressbar):
             callback_manager.on_scanner_plugin_start(plugin.name)
             plugin_issues = plugin.run(run_id=run_id, target=self.target)
             callback_manager.on_scanner_plugin_end(plugin.name)