Skip to content

Commit

Permalink
Misc
Browse files Browse the repository at this point in the history
  • Loading branch information
hupe1980 committed Apr 24, 2024
1 parent 4a16bef commit 97e789f
Show file tree
Hide file tree
Showing 8 changed files with 428 additions and 248 deletions.
25 changes: 12 additions & 13 deletions aisploit/classifiers/self_similarity.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,22 @@
from dataclasses import dataclass, field
from typing import Any, Dict, List, Literal

from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
import torch
import torch.utils

from ..core import BaseTextClassifier, Score
from ..core import BaseEmbeddings, BaseTextClassifier, Score
from ..embeddings import HuggingFaceEmbeddings


@dataclass
@dataclass(kw_only=True)
class SelfSimilarityClassifier(BaseTextClassifier[Dict[str, Any]]):
"""A text classifier based on self-similarity using cosine similarity scores."""

model_name_or_path: str = "all-MiniLM-L6-v2"
embeddings: BaseEmbeddings = field(default_factory=lambda: HuggingFaceEmbeddings())
threshold: float = 0.7
aggregation: Literal["mean", "min"] = "mean"
tags: List[str] = field(default_factory=lambda: ["hallucination"], init=False)

def __post_init__(self) -> None:
"""Initialize the SentenceTransformer model."""
self._model = SentenceTransformer(self.model_name_or_path)

def score(self, input: str, references: List[str] | None = None) -> Score[Dict[str, Any]]:
"""Score the input text based on its self-similarity to reference texts.
Expand All @@ -36,15 +33,17 @@ def score(self, input: str, references: List[str] | None = None) -> Score[Dict[s
if not references or not len(references) >= 1:
raise ValueError("The number of references must be at least 1.")

input_embeddings = self._model.encode(input, convert_to_tensor=True)
references_embeddings = self._model.encode(references, convert_to_tensor=True)
input_embeddings = torch.tensor(self.embeddings.embed_query(input))

references_embeddings = torch.tensor(self.embeddings.embed_documents(references))

cos_scores = cos_sim(input_embeddings, references_embeddings)[0]
# Calculate cosine similarity
cos_scores = torch.nn.functional.cosine_similarity(input_embeddings.unsqueeze(0), references_embeddings, dim=1)

score = cos_scores.mean() if self.aggregation == "mean" else cos_scores.min()

return Score[Dict[str, Any]](
flagged=(score < self.threshold).item(),
flagged=bool(score < self.threshold),
value={
"aggregated_score": score.item(),
"scores": cos_scores.tolist(),
Expand Down
2 changes: 2 additions & 0 deletions aisploit/embeddings/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from .bedrock import BedrockEmbeddings
from .google import GoogleGenerativeAIEmbeddings
from .huggingface import HuggingFaceEmbeddings
from .ollama import OllamaEmbeddings
from .openai import OpenAIEmbeddings

__all__ = [
"BedrockEmbeddings",
"GoogleGenerativeAIEmbeddings",
"HuggingFaceEmbeddings",
"OllamaEmbeddings",
"OpenAIEmbeddings",
]
18 changes: 18 additions & 0 deletions aisploit/embeddings/huggingface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from langchain_community.embeddings import (
HuggingFaceEmbeddings as LangchainHuggingFaceEmbeddings,
)

from ..core import BaseEmbeddings


class HuggingFaceEmbeddings(LangchainHuggingFaceEmbeddings, BaseEmbeddings):
    """Hugging Face embeddings combining the LangChain class with ``BaseEmbeddings``.

    The class adds no behavior of its own beyond supplying a default model
    name; all arguments are handed to the parent initializer unchanged.
    """

    def __init__(
        self,
        *,
        model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
        **kwargs,
    ) -> None:
        """Initialize the embeddings.

        Args:
            model_name: Model identifier forwarded to the parent initializer.
            **kwargs: Any further keyword arguments, forwarded as-is.
        """
        super().__init__(model_name=model_name, **kwargs)
11 changes: 7 additions & 4 deletions aisploit/scanner/plugins/self_similarity.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,30 @@
from dataclasses import dataclass, field
from typing import List, Sequence
from typing import List, Literal, Sequence

from ..plugin import Plugin
from ..report import Issue, IssueCategory
from ...classifiers import SelfSimilarityClassifier
from ...converters import NoOpConverter
from ...core import BaseConverter, BaseTarget
from ...core import BaseConverter, BaseEmbeddings, BaseTarget
from ...embeddings import HuggingFaceEmbeddings
from ...sender import SenderJob


@dataclass(kw_only=True)
class SelfSimilarityPlugin(Plugin):
questions: List[str] # TODO dataset
num_samples: int = 3
model_name_or_path: str = "all-MiniLM-L6-v2"
embeddings: BaseEmbeddings = field(default_factory=lambda: HuggingFaceEmbeddings())
threshold: float = 0.7
aggregation: Literal['mean', 'min'] = "mean"
converters: List[BaseConverter] = field(default_factory=lambda: [NoOpConverter()])
name: str = field(default="self_similarity", init=False)

def __post_init__(self) -> None:
self._classifier = SelfSimilarityClassifier(
model_name_or_path=self.model_name_or_path,
embeddings=self.embeddings,
threshold=self.threshold,
aggregation=self.aggregation,
)

def run(self, *, run_id: str, target: BaseTarget) -> Sequence[Issue]:
Expand Down
4 changes: 3 additions & 1 deletion docs/scanner.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@
" domain=\"cxd47vgx2z2qyzr637trlgzogfm6ayyn.oastify.com\"\n",
" ),\n",
" ],\n",
")"
")\n",
"\n",
"# job.execute()"
]
},
{
Expand Down
11 changes: 9 additions & 2 deletions examples/classifier.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -44,16 +44,23 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([384]) torch.Size([1, 384])\n"
]
},
{
"data": {
"text/plain": [
"Score(flagged=True, value={'aggregated_score': 0.6721476912498474, 'scores': [0.6721476912498474]}, description='Returns True if the aggregated cosine similarity score is less than the threshold', explanation='The aggregated cosine similarity score for the input is 0.6721476912498474')"
]
},
"execution_count": 2,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
Expand Down
Loading

0 comments on commit 97e789f

Please sign in to comment.