forked from lachlanpage/Markov-Chain-Sentence-Generator
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsimilarity_check.py
97 lines (70 loc) · 3.66 KB
/
similarity_check.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import difflib
from typing import Tuple
from log_config import configure_logger
# Module-level logger, built by the project's configure_logger helper and named after this module.
logger = configure_logger(__name__)
# Use a sliding-window approach together with sequence matching to check
# whether the generated output is similar to any part of the training corpus
# (the entire input text). The matching is done with Python's difflib.SequenceMatcher.
def get_corpus_string(corpus_as_string):
    """
    Hand the corpus string back exactly as it was received.

    This is a pass-through: nothing is copied, stripped or re-encoded. It exists
    so that a module which has already loaded the training corpus into a string
    can expose it through a function call.

    Args:
        corpus_as_string (str): The training corpus, already loaded as a string.

    Returns:
        str: The very same object that was passed in.
    """
    return corpus_as_string
def check_similarity(
    input_text: str,
    output_text: str,
    window_size: int,
    similarity_threshold: float,
) -> Tuple[float, float, bool, list]:
    """
    Measure how closely generated text resembles any stretch of the training corpus.

    Both texts are split on whitespace. A window of ``window_size`` words slides
    over the corpus one word at a time, and each window is compared with the
    whole output using ``difflib.SequenceMatcher.ratio()``.

    The window is clamped to the shorter of the two texts, so a short output or
    a short corpus is still compared once as a whole. If no comparison is
    possible (either text has no words, or ``window_size`` is below 1) the
    neutral result ``(0.0, 0.0, False, [])`` is returned.

    :param input_text: The training corpus the output was generated from.
    :param output_text: The generated text to check.
    :param window_size: Number of words per sliding window over the corpus.
    :param similarity_threshold: Ratio in [0, 1] at or above which a window
        (and the overall average) counts as too similar.
    :return: Tuple of the highest window score, the mean of all window scores,
        a flag that is True when that mean is at or above the threshold, and
        the list of corpus phrases whose own score reached the threshold.
    :rtype: Tuple[float, float, bool, list]
    """
    input_words = input_text.split()
    output_words = output_text.split()

    # Clamp the window so that at least one full window fits in the corpus and
    # the window is never longer than the output it is compared against.
    window = min(window_size, len(output_words), len(input_words))
    if window < 1:
        # Nothing to compare. Two empty sequences would otherwise score 1.0 and
        # an empty output would be reported as a perfect copy.
        return 0.0, 0.0, False, []

    # The output stays the first sequence and each window becomes the second.
    # ratio() may depend on argument order, so the orientation is kept as is
    # to leave the individual scores unchanged.
    matcher = difflib.SequenceMatcher(None, output_words)

    highest_similarity_score = 0.0
    total_similarity_score = 0.0
    overly_similar_phrases = []

    window_count = len(input_words) - window + 1
    for start in range(window_count):
        current_window = input_words[start:start + window]
        matcher.set_seq2(current_window)
        similarity_score = matcher.ratio()

        highest_similarity_score = max(highest_similarity_score, similarity_score)
        # Accumulate this window's own score, not the running maximum, so the
        # average does not depend on where in the corpus the best match occurs.
        total_similarity_score += similarity_score

        if similarity_score >= similarity_threshold:
            overly_similar_phrases.append(' '.join(current_window))

    average_similarity_score = total_similarity_score / window_count
    # The overall verdict is based on the average, not on the single best window.
    too_similar = average_similarity_score >= similarity_threshold
    return highest_similarity_score, average_similarity_score, too_similar, overly_similar_phrases