n-gram.py
import re
import os
import numpy

os.system('cls')

textFile = []
# Reads all files within the ./corpus directory and its subfolders
for path, dirs, files in os.walk('./corpus'):
    for file in files:
        with open(os.path.join(path, file), 'r') as read_f:
            textFile.append(read_f.read().lower())


# Reverses the order of words in a string ("a b c" -> "c b a")
def reverseWords(string):
    if string:
        words = string.split(' ')
        rev = ' '.join(reversed(words))
        return rev
    return ''


# Counts how many sentences in the corpus begin with the given word
def findCountAtStartOfSentence(word, corpus):
    count = 0
    expression = r"^" + re.escape(word) + r"\b"
    for sentence in corpus:
        if re.match(expression, sentence):
            count += 1
    return count
# Splits a file into sentences
def tokenize(file):
    sentences = re.split(r"[.?!]\s+", file)
    return sentences


# Splits a sentence into words
def splitToWords(sentence):
    temp = sentence.split()
    return temp


# Build the corpus: a flat list of sentences and a flat list of words
sentences = []
for files in textFile:
    temp = tokenize(files)
    for sentence in temp:
        sentences.append(sentence)

words = []
for sentence in sentences:
    for x in splitToWords(sentence):
        words.append(x)

totalWordsCount = len(words)
uniqueWords = set(words)
totalUniqueWordsCount = len(uniqueWords)
# Builds the list of n-gram strings for a sentence. For n > 1 the first
# entry is the first word alone and each later entry is the n words ending
# at that position, joined with spaces.
def getNGramString(string, ngramNumber):
    string = string.lower()
    string = splitToWords(string)
    if ngramNumber == 1:
        return string
    else:
        totalGrams = [string[0]]
        for index, word in enumerate(string):
            i = index
            toFindProb = ''
            if index > 0 and ngramNumber > 1:
                # Walk backwards and collect up to ngramNumber words
                for j in range(ngramNumber):
                    if i >= 0:
                        toFindProb = toFindProb + string[i]
                        toFindProb += ' '
                        i -= 1
                if toFindProb:
                    realWord = toFindProb.strip()
                    # The words were collected in reverse order, so flip them back
                    totalGrams.append(reverseWords(realWord))
        return totalGrams
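# Example (illustrative, not part of the original script): with ngramNumber=2,
# getNGramString("the batman was a hit", 2) yields
# ['the', 'the batman', 'batman was', 'was a', 'a hit'].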
# Counts whole-word (or whole-phrase) occurrences of `word` in the corpus
def findCount(word, corpus):
    count = 0
    for sentence in corpus:
        expression = r"\b" + re.escape(word) + r"\b"
        temp = re.findall(expression, sentence)
        count += len(temp)
    return count
# Returns the probability of a sentence using the bi-gram model:
# P(w1 w2 ... wn) ~= P(w1) * product of C(w[i-1] w[i]) / C(w[i-1])
def SentenceProb(sentence, corpus, totalWordsCount):
    ngramString = getNGramString(sentence, 2)
    sentence = sentence.lower().split()
    totalProbability = []
    probOfFirstWord = findCountAtStartOfSentence(
        sentence[0], corpus)/totalWordsCount
    totalProbability.append(probOfFirstWord)
    for i in range(1, len(sentence)):
        temp = findCount(ngramString[i], corpus)
        if temp:
            # Divide the bigram count by the count of the preceding word
            result = temp/findCount(sentence[i-1], corpus)
            totalProbability.append(result)
        else:
            totalProbability.append(0)
    result = numpy.prod(totalProbability)
    return result
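# Note (illustrative): any bigram that never occurs in the corpus contributes
# a factor of 0 here, so a call such as
#     SentenceProb("The batman was a hit", sentences, totalWordsCount)
# can return 0.0 outright; the smoothed variant below avoids this.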
# Returns the add-one (Laplace) smoothed probability of a sentence using the
# bi-gram model; `uniqueWords` is the vocabulary size V
def SmoothSentenceProb(sentence, corpus, totalWordsCount, uniqueWords):
    ngramString = getNGramString(sentence, 2)
    sentence = sentence.lower().split()
    totalProbability = []
    probOfFirstWord = (findCountAtStartOfSentence(
        sentence[0], corpus)+1)/(totalWordsCount+uniqueWords)
    totalProbability.append(probOfFirstWord)
    for i in range(1, len(sentence)):
        # (C(w[i-1] w[i]) + 1) / (C(w[i-1]) + V)
        temp = findCount(ngramString[i], corpus)+1
        result = temp/(findCount(sentence[i-1], corpus)+uniqueWords)
        totalProbability.append(result)
    result = numpy.prod(totalProbability)
    return result
# Calculates the perplexity of a sentence: PP = (1/P)^(1/N),
# where N is the number of words the probability was computed over
def perplexity(probability, totalWordsCount):
    perplexity = 1/probability
    perplexity = pow(perplexity, 1/totalWordsCount)
    return perplexity


probability = SmoothSentenceProb("The batman was a hit",
                                 sentences, totalWordsCount, totalUniqueWordsCount)
print("The probability is: ", probability)