Add presidio classifier

hupe1980 · Apr 21, 2024 · 0902622 · 0902622
1 parent 6d258f0
commit 0902622
Show file tree

Hide file tree

Showing 5 changed files with 674 additions and 73 deletions.
diff --git a/aisploit/classifiers/presidio/__init__.py b/aisploit/classifiers/presidio/__init__.py
@@ -0,0 +1,5 @@
+from .presidio_analyser import PresidioAnalyserClassifier
+
+__all__ = [
+    "PresidioAnalyserClassifier",
+]
diff --git a/aisploit/classifiers/presidio/presidio_analyser.py b/aisploit/classifiers/presidio/presidio_analyser.py
@@ -0,0 +1,30 @@
+from dataclasses import dataclass
+from typing import List
+
+from presidio_analyzer import AnalyzerEngine, RecognizerResult
+
+from ...core import BaseTextClassifier, Score
+
+
+@dataclass
+class PresidioAnalyserClassifier(BaseTextClassifier[List[RecognizerResult]]):  # Text classifier that flags input whenever Presidio's AnalyzerEngine returns at least one result.
+    language: str = "en"  # forwarded as `language` to AnalyzerEngine.analyze(); NOTE(review): engine is built with defaults, confirm non-"en" values are supported
+    entities: List[str] | None = None  # forwarded as `entities` to analyze(); presumably None means "all supported entities" - confirm against Presidio docs
+    threshold: float = 0.7  # forwarded as `default_score_threshold` when the engine is constructed
+
+    def __post_init__(self) -> None:
+        # Build the engine once per instance; per the original note this loads the NLP module (spaCy model by default) and the other PII recognizers.
+        self._analyzer = AnalyzerEngine(default_score_threshold=self.threshold)
+
+    def score(self, input: str) -> Score[List[RecognizerResult]]:
+        # Run the analyzer over `input` using this instance's `entities` and `language` settings.
+        results = self._analyzer.analyze(text=input, entities=self.entities, language=self.language)
+
+        return Score[List[RecognizerResult]](
+            flagged=len(results) > 0,  # flagged as soon as the analyzer reports any result
+            value=results,  # the analyzer's result list is returned unmodified
+            description="Return True if entities are found in the input",
+            explanation=(
+                f"Found {len(results)} entities in input" if len(results) > 0 else "Did not find entities in input"
+            ),
+        )
diff --git a/examples/classifier.ipynb b/examples/classifier.ipynb
@@ -26,12 +26,41 @@
    "source": [
     "import textwrap\n",
     "from dotenv import load_dotenv\n",
+    "from aisploit.classifiers.presidio import PresidioAnalyserClassifier\n",
     "from aisploit.classifiers.huggingface import PipelinePromptInjectionIdentifier\n",
     "from aisploit.classifiers.openai import ModerationClassifier\n",
     "\n",
     "load_dotenv()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Presidio"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Score(flagged=True, value=[type: PERSON, start: 11, end: 19, score: 0.85, type: PHONE_NUMBER, start: 43, end: 55, score: 0.75], description='Return True if entities are found in the input', explanation='Found 2 entities in input')"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "classifier = PresidioAnalyserClassifier()\n",
+    "classifier.score(\"My name is John Doo and my phone number is 212-555-5555\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -81,7 +110,7 @@
     {
      "data": {
       "text/plain": [
-       "Score(flagged=True, value=Moderation(categories=Categories(harassment=True, harassment_threatening=True, hate=False, hate_threatening=False, self_harm=False, self_harm_instructions=False, self_harm_intent=False, sexual=False, sexual_minors=False, violence=True, violence_graphic=False, self-harm=False, sexual/minors=False, hate/threatening=False, violence/graphic=False, self-harm/intent=False, self-harm/instructions=False, harassment/threatening=True), category_scores=CategoryScores(harassment=0.45708343386650085, harassment_threatening=0.35233721137046814, hate=0.0006805699085816741, hate_threatening=4.209355392958969e-05, self_harm=4.824657480639871e-06, self_harm_instructions=3.298543660434916e-08, self_harm_intent=2.0117977328482084e-06, sexual=4.9561123887542635e-05, sexual_minors=1.9911001913897053e-07, violence=0.9988710284233093, violence_graphic=1.047616660798667e-05, self-harm=4.824657480639871e-06, sexual/minors=1.9911001913897053e-07, hate/threatening=4.209355392958969e-05, violence/graphic=1.047616660798667e-05, self-harm/intent=2.0117977328482084e-06, self-harm/instructions=3.298543660434916e-08, harassment/threatening=0.35233721137046814), flagged=True), description='Moderation score for the given input', explanation='Details about the moderation score')"
+       "Score(flagged=True, value=Moderation(categories=Categories(harassment=True, harassment_threatening=True, hate=False, hate_threatening=False, self_harm=False, self_harm_instructions=False, self_harm_intent=False, sexual=False, sexual_minors=False, violence=True, violence_graphic=False, self-harm=False, sexual/minors=False, hate/threatening=False, violence/graphic=False, self-harm/intent=False, self-harm/instructions=False, harassment/threatening=True), category_scores=CategoryScores(harassment=0.4573294222354889, harassment_threatening=0.35159170627593994, hate=0.0006792626227252185, hate_threatening=4.232471837894991e-05, self_harm=4.82136874779826e-06, self_harm_instructions=3.341407150969644e-08, self_harm_intent=2.0083894014533143e-06, sexual=4.86759927298408e-05, sexual_minors=1.9414277119267354e-07, violence=0.9988717436790466, violence_graphic=1.050253467838047e-05, self-harm=4.82136874779826e-06, sexual/minors=1.9414277119267354e-07, hate/threatening=4.232471837894991e-05, violence/graphic=1.050253467838047e-05, self-harm/intent=2.0083894014533143e-06, self-harm/instructions=3.341407150969644e-08, harassment/threatening=0.35159170627593994), flagged=True), description='Moderation score for the given input', explanation='Details about the moderation score')"
       ]
      },
      "execution_count": 3,