-
Notifications
You must be signed in to change notification settings - Fork 1
/
plagiarism-detection.py
56 lines (50 loc) · 3.04 KB
/
plagiarism-detection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import argparse
import re
import json
from ntuple import GeneralWordListGenerator, NTupleAlgorithm
def main(synonymsFile, file1, file2, n):
    """Compute a plagiarism confidence value for two input files.

    The instructions were vague about what to do when the input files contain
    multiple lines; this implementation assumes the confidence value should
    reflect the entire file, so each file is read as one long string and
    multiple lines are treated the same as one line.

    Args:
        synonymsFile: The synonyms source handed to
            `GeneralWordListGenerator.read_file_to_tuple_list` (the
            command-line entry point passes an open text file).
        file1: Readable file object holding the first input text.
        file2: Readable file object holding the second input text.
        n: The tuple size for the N-tuple comparison algorithm.

    Returns:
        The value returned by `NTupleAlgorithm.compare` for the two
        generalized word lists, or -1 when `n` is greater than the number of
        words in the first file or when the two word lists differ in length.
    """
    # Read the contents of the synonyms file into a list of tuples, where each
    # tuple contains words that are synonyms of each other. The parameter is
    # used (not the module-level `args`) so this function also works when it
    # is imported and called from other code.
    synonyms = GeneralWordListGenerator.read_file_to_tuple_list(synonymsFile)

    # Construct the word list generator, which takes a string of text and
    # returns a generalized form of it in which each word with a synonym from
    # the synonyms file is represented as a tuple of its synonyms. See
    # `GeneralWordListGenerator.generate_word_list` for more details.
    word_list_generator = GeneralWordListGenerator(synonyms)

    # Line structure is ignored: each file is read as one long string.
    file1_string = file1.read()
    file2_string = file2.read()

    # Generate the generalized word lists for the two files.
    file1_general_word_list = word_list_generator.generate_word_list(file1_string)
    file2_general_word_list = word_list_generator.generate_word_list(file2_string)

    # N must not exceed the number of words available (N equal to the word
    # count is allowed). Only the first list is checked here; the equal-length
    # requirement below covers the second one.
    if n > len(file1_general_word_list):
        return -1

    # Initialize the N-tuple comparison algorithm with the tuple size N.
    n_tuple_alg = NTupleAlgorithm(n)

    # The comparison assumes both word lists have the same length; anything
    # else is reported with the -1 sentinel.
    if len(file1_general_word_list) != len(file2_general_word_list):
        return -1
    return n_tuple_alg.compare(file1_general_word_list, file2_general_word_list)
if __name__ == '__main__':
    # Command-line entry point: parse the three input files and the tuple
    # size, run the comparison, and print the confidence as a percentage.
    parser = argparse.ArgumentParser(
        description='Detect plagiarism in two input files '
                    'using an N-tuple comparison algorithm fed by a list of synonyms.')
    parser.add_argument('synonymsFile', metavar='SYNONYMS', type=argparse.FileType('r'),
                        help='A path to a file containing a list of synonyms')
    parser.add_argument('file1', metavar='FILE_1', type=argparse.FileType('r'),
                        help='A path to the first input file')
    parser.add_argument('file2', metavar='FILE_2', type=argparse.FileType('r'),
                        help='A path to the second input file')
    parser.add_argument('-n', metavar='N', type=int, default=3,
                        help='The tuple size for the N-tuple comparison algorithm')
    # Keep the module-level name `args`; it is the parsed namespace for the run.
    args = parser.parse_args()

    # argparse.FileType opens the files during parsing; close them as soon as
    # the comparison has consumed them.
    with args.synonymsFile, args.file1, args.file2:
        confidence = main(args.synonymsFile, args.file1, args.file2, args.n)

    # main() signals unusable input with -1; report that as an error instead
    # of printing a meaningless "-100%".
    if confidence == -1:
        parser.exit(1, 'error: N is larger than the number of words in FILE_1, '
                       'or the two input files do not contain the same number of words\n')

    print("{0}%".format(confidence * 100))