forked from patrickdw123/ParanoiDF
-
Notifications
You must be signed in to change notification settings - Fork 0
/
redact.py
executable file
·409 lines (330 loc) · 12.7 KB
/
redact.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
# ParanoiDF. A combination of several PDF analysis/manipulation tools to
# produce one of the most technically useful PDF analysis tools.
#
# Idea proposed by Julio Hernandez-Castro, University of Kent, UK.
# By Patrick Wragg
# University of Kent
# 21/07/2014
#
# With thanks to:
# Julio Hernandez-Castro, my supervisor.
# Jose Miguel Esparza for writing PeePDF (the basis of this tool).
# Didier Stevens for his "make-PDF" tools.
# Blake Hartstein for Jsunpack-n.
# Yusuke Shinyama for Pdf2txt.py (PDFMiner)
# Nacho Barrientos Arias for Pdfcrack.
# Kovid Goyal for Calibre (DRM removal).
# Jay Berkenbilt for QPDF.
#
# Copyright (C) 2014-2018 Patrick Wragg
#
# This file is part of ParanoiDF.
#
# ParanoiDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# ParanoiDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ParanoiDF. If not, see <http://www.gnu.org/licenses/>.
#
# This was written by Patrick Wragg.
'''
This script calculates which words of a certain font/size will fit inside
a redaction box. It requires the user to search through the objects of the
PDF first using the Interactive Console to find the details about the font
size, font and size of redaction box. Tutorial.pdf has a guide on this.
If the user wishes, he/she can use the implemented grammar parser (Stanford
Parser) to obtain a parsing score and the script will sort the list for them.
'''
import sys
import re
import difflib
import os
import json
import PIL.Image
import PIL.ImageDraw
import PIL.ImageFont
import operator
def main(wordType, letterCase):
    '''
    Interactively find the dictionary words that fit inside a redaction box.

    Prompts for the redaction box, the font and the font size, renders every
    word of the selected dictionary with check_word() and collects the words
    that fit. The matches are written to <script dir>/results.txt, optionally
    ordered by the score the Stanford parser gives a user supplied sentence.

    wordType   -- dictionary selector handed to dict_file_return(); the
                  grammar check is only offered when is_word(wordType) is true.
    letterCase -- letter case selector handed to dict_file_return().
    '''
    dirPath = os.path.dirname(os.path.abspath(sys.argv[0]))
    redactAreaX, redactAreaY = get_redaction_box() #Get redaction box size.
    fontName = get_font(dirPath) #Get font.
    fontSize = get_font_size() #Get font size.

    #Read the dictionary, one candidate per line. Blank lines are skipped:
    #an empty word draws no pixels, so it can never be measured.
    dictList = []
    with open(dict_file_return(wordType, letterCase, dirPath)) as dictFile:
        for line in dictFile:
            word = remove_newline(line)
            if word.strip():
                dictList.append(word)

    #Draw each word, measure it and keep the ones that fit.
    lengthOfDict = len(dictList)
    successList = []
    print('Parsing words now...')
    for count, word in enumerate(dictList, 1):
        successWord = check_word(word, redactAreaX,
                                 redactAreaY, fontName,
                                 fontSize)
        if not successWord == 0:
            successList.append(successWord)
        #Print progress percentage. Flushed because '\r' never ends the line.
        sys.stdout.write('\r{0:.0f}%'.format((float(count) / lengthOfDict) * 100))
        sys.stdout.flush()
    numberOfSuccesses = len(successList)

    if not successList:
        print('')
        print('')
        print('No matches. Are you sure you got the coordinates, font and font size correct?')
        return

    print('')
    total_matches(numberOfSuccesses, lengthOfDict)

    #If word cannot be grammar checked (country or name Etc.) print results and quit.
    if not is_word(wordType):
        output_matches(dirPath, successList)
        results_message(dirPath)
        return

    print('')
    if not grammar_parse_query(): #No grammar parse.
        total_matches(numberOfSuccesses, lengthOfDict)
        output_matches(dirPath, successList)
        results_message(dirPath)
        return

    #Start grammar parse.
    print('')
    sentance = raw_input('Enter sentance where $word is word: ')
    #Ask again until the placeholder is present, otherwise no sentence can be built.
    while '$word' not in sentance:
        print('$word not found, try again. Example: \'The $word crossed the road.\'')
        sentance = raw_input('Enter sentance where $word is word: ')

    sentancePath = dirPath + '/tempSentance.txt' #Sentance file for Stanford to parse.
    scorePath = dirPath + '/tempScore.txt'
    with open(sentancePath, 'w') as sentanceFile:
        for word in successList:
            sentanceFile.write(build_sentance(sentance, word))

    scoreString = stanford_parser(dirPath) #Stanford parser execution.
    scoreList = find_score(scoreString) #Find score in scoreString.
    wordScoreDict = add_score_to_dict(scoreList, successList) #Put score and word in dict.
    scoreList = sort_dict(wordScoreDict) #Sort dictonary.
    total_matches(numberOfSuccesses, lengthOfDict) #Print total matches

    #Write the first N entries of the sorted list. A value of N that is never
    #reached (zero, negative or larger than the list) writes every entry.
    numberOfResults = number_of_results_return()
    with open(dirPath + '/results.txt', 'w+') as resultsFile:
        for counter, score in enumerate(scoreList, 1):
            resultsFile.write(' '.join(map(str, score)))
            resultsFile.write('\n')
            if counter == numberOfResults:
                break
    results_message(dirPath)

    #No need for these now.
    os.remove(scorePath)
    os.remove(sentancePath)
    #End grammar parse.
def grammar_parse_query():
    '''
    Ask the user whether a grammar check should be run.

    Keeps prompting until the answer is 'y' or 'n' (case-insensitive) and
    returns True for 'y' and False for 'n'.
    '''
    answers = {'y': True, 'n': False}
    while True:
        reply = raw_input('Would you like to do a grammar check? y/n: ').lower()
        if reply in answers:
            return answers[reply]
def number_of_results_return():
    '''
    Ask the user how many results they want and return the answer as an int.

    The answer is read with raw_input() so it is treated as plain text; the
    Python 2 input() built-in would evaluate it as an expression, and a
    non-numeric answer would then escape the ValueError handler below.
    Prompts again until the text parses as an integer.
    '''
    while True:
        try:
            return int(raw_input('Number of results? '))
        except ValueError:
            print('Invalid number, try again.')
def remove_newline(word):
    '''
    Return word without its trailing line terminator.

    Only '\\n' and '\\r' characters at the end are removed, so a final line
    that has no newline keeps its last letter.
    '''
    return word.rstrip('\r\n')
def sort_dict(wordScoreDict):
    '''
    Return the (word, score) pairs of wordScoreDict as a list sorted by score.

    Scores are compared as numbers in ascending order. They usually arrive as
    strings, and comparing those as text would put '100.0' before '8.25'.
    A score that cannot be read as a number is placed at the end.
    '''
    def numeric_score(item):
        try:
            return float(item[1])
        except (TypeError, ValueError):
            return float('inf')

    return sorted(wordScoreDict.items(), key=numeric_score)
def add_score_to_dict(scoreList, successList):
    '''
    Pair every score with the word at the same position.

    Returns a dictionary mapping successList[i] to scoreList[i] for each
    index of scoreList. Words beyond the last score are left out; more
    scores than words raises IndexError.
    '''
    scoredWords = {}
    for position, score in enumerate(scoreList):
        scoredWords[successList[position]] = score
    return scoredWords
def find_score(scoreString):
    '''
    Extract the scores from the text written by the Stanford parser.

    Every occurrence of 'with score -<number>' contributes one entry. As
    before, the entry is the number without its leading minus sign, kept as
    a string. The decimal point is matched literally.
    '''
    pattern = r'\bwith score -([0-9]+(?:\.[0-9]+)?)\b'
    return re.findall(pattern, scoreString)
def stanford_parser(dirPath):
    '''
    Run the Stanford parser on tempSentance.txt and return its output.

    Executes <dirPath>/stanfordParser/lexparser.sh with
    <dirPath>/tempSentance.txt as its only argument, stores the standard
    output in <dirPath>/tempScore.txt and returns the content of that file.
    The command is passed as an argument list, so spaces or shell
    metacharacters in dirPath cannot break it.
    '''
    import subprocess #Local import: only this function needs it.

    parserScript = dirPath + '/stanfordParser/' + './' + 'lexparser.sh'
    sentancePath = dirPath + '/tempSentance.txt'
    scorePath = dirPath + '/tempScore.txt'

    with open(scorePath, 'w') as scoreFile:
        try:
            subprocess.call([parserScript, sentancePath], stdout=scoreFile)
        except OSError as error:
            #Script missing or not executable: report it and leave the score file empty.
            sys.stderr.write('Could not run ' + parserScript + ': ' + str(error) + '\n')

    with open(scorePath, 'r') as scoreFile:
        return scoreFile.read()
def is_word(wordType):
    '''
    Return True when wordType contains 'w', False otherwise.

    The caller only offers the grammar check when this is True.
    '''
    return 'w' in wordType
def dict_file_return(wordType, letterCase, dirPath):
    '''
    Return the path of the dictionary file for a word type and letter case.

    wordType   -- 'f' (first.txt), 's' (surname.txt), 'c' (country.txt) or
                  'w' (words.txt).
    letterCase -- 'u' selects upperCase, 'c' selects capitalised, anything
                  else selects lowerCase.
    dirPath    -- directory that contains the redactDict folder.

    Raises ValueError for an unknown wordType.
    '''
    fileNames = {
        'f': 'first.txt',
        's': 'surname.txt',
        'c': 'country.txt',
        'w': 'words.txt',
    }
    caseFolders = {'u': 'upperCase', 'c': 'capitalised'}

    if wordType not in fileNames:
        raise ValueError('Unknown word type: ' + repr(wordType))

    caseFolder = caseFolders.get(letterCase, 'lowerCase')
    return dirPath + '/redactDict/' + caseFolder + '/' + fileNames[wordType]
def check_dict_files(filepath):
    '''
    Report whether the dictionary file at filepath can be opened.

    Returns True when it opens; otherwise prints a hint and returns False.
    '''
    try:
        open(filepath).close()
    except IOError:
        print('Dictionary file not found, please check source repository.')
        return False
    return True
def build_sentance(sentance, word):
    '''
    Return sentance with its first '$word' replaced by word, followed by '. .'.

    Raises ValueError when sentance has no '$word' placeholder. Nothing here
    can change the sentence, so looping until it appears would never end.
    '''
    placeholder = '$word'
    if placeholder not in sentance:
        raise ValueError('$word not found, try again. Example: \'The $word crossed the road.\'')
    return sentance.replace(placeholder, word, 1) + '. .'
def check_word(word, redactAreaX,
               redactAreaY, fontName,
               fontSize):
    '''
    Draw word on a blank canvas and test whether it fits the redaction box.

    The word is drawn in red at the top-left corner of a canvas slightly
    larger than the box. The canvas is converted to palette mode and every
    pixel with a palette index from 10 to 16 counts as part of the word
    (NOTE(review): which colours those indices are depends on PIL's palette
    conversion - confirm). The extent of those pixels is compared with the box.

    word        -- text to measure.
    redactAreaX -- width of the redaction box in pixels.
    redactAreaY -- height of the redaction box in pixels.
    fontName    -- path of the TrueType font file.
    fontSize    -- font size handed to PIL.

    Returns word when it fits, otherwise 0 (also when nothing was drawn).
    '''
    canvasSize = (redactAreaX + 10, redactAreaY + 5)
    img = PIL.Image.new('RGB', canvasSize)
    drawer = PIL.ImageDraw.Draw(img)
    font = PIL.ImageFont.truetype(fontName, fontSize)
    drawer.text((0, 0), word, fill=(255, 0, 0), font=font)
    img = img.convert('P')

    #Collect the coordinates of every pixel that belongs to the word.
    width, height = img.size
    inkX = []
    inkY = []
    for row in range(height):
        for column in range(width):
            if 10 <= img.getpixel((column, row)) <= 16:
                inkX.append(column)
                inkY.append(row)

    #Nothing was drawn (empty or blank word): there is no extent to measure.
    if not inkX:
        return 0

    #X and Y maximum length for word.
    xLength = max(inkX) - min(inkX)
    yLength = max(inkY) - min(inkY)

    #Maximum/minimum threshold for word.
    #The Y tolerance is big because a word might be all low letters such as "rear".
    fitsX = redactAreaX - 3 <= xLength <= redactAreaX
    fitsY = redactAreaY - 10 <= yLength <= redactAreaY

    if fitsX and fitsY:
        #Success.
        return word
    #No fit.
    return 0
def output_matches(dirPath, successList):
    '''
    Write every word of successList to <dirPath>/results.txt, one per line.

    An existing results file is overwritten. The file is closed even when a
    write fails.
    '''
    with open(dirPath + '/results.txt', 'w') as resultsFile:
        for word in successList:
            resultsFile.write(word)
            resultsFile.write('\n')
def total_matches(total, lengthOfDict):
    '''
    Print how many of the lengthOfDict dictionary words matched.

    Output format: 'Total matches: <total>/<lengthOfDict> words.'
    '''
    pieces = ['Total matches: ', str(total), '/', str(lengthOfDict), ' words.']
    print(''.join(pieces))
def results_message(dirPath):
    '''
    Print the location of the results file inside dirPath.
    '''
    message = ''.join(['Results written to ', dirPath, '/results.txt.'])
    print(message)
def get_font(dirPath):
    '''
    Ask for a font name until the matching font file can be opened.

    The name is entered without extension and is looked up as
    <dirPath>/fonts/<name>.ttf. Returns that path.
    '''
    while True:
        try:
            typedName = raw_input('Enter the font (Ext. not needed): ')
            fontPath = dirPath + '/fonts/' + typedName + '.ttf'
            #Opening the file is only an existence check.
            open(fontPath).close()
        except IOError:
            print('Font file not found, please try again!')
        else:
            return fontPath
def get_redaction_box():
    '''
    Ask for the four BBox numbers and return the box size as (width, height).

    The numbers are entered one at a time in the order they appear in the
    object. Each is rounded to a whole number; the width is the distance
    between the 1st and 3rd value, the height the distance between the 2nd
    and 4th. Distances are absolute, so the two corners may be entered in
    either order. Prompts again when a value is not a number.
    '''
    while True:
        try:
            print('Enter 4 coordinates of BBox as shown in object: [n, n, n, n]')
            firstX = raw_input('')
            firstY = raw_input(firstX + ', ')
            secondX = raw_input(firstX + ', ' + firstY + ', ')
            secondY = raw_input(firstX + ', ' + firstY + ', ' + secondX + ', ')
            print('Coordinates: ' + '[' + firstX + ', ' + firstY + ', ' + secondX + ', ' + secondY + ']')
            redactAreaX = abs(int(round(float(secondX)) - round(float(firstX))))
            redactAreaY = abs(int(round(float(secondY)) - round(float(firstY))))
            #Same text the comma separated print statement produced.
            print('X =  %d , Y =  %d' % (redactAreaX, redactAreaY))
            return redactAreaX, redactAreaY
        except ValueError:
            print('Not a valid number, please try again.')
def get_font_size():
    '''
    Ask for the font size until a usable value is entered and return it.

    The answer is read as a number and rounded to the nearest integer.
    Sizes from 2 to 50 are accepted; anything else prompts again.
    '''
    while True:
        try:
            fontSize = int(round(float(raw_input('Enter the font size: '))))
        except ValueError:
            print('Not a valid number, please try again.')
            continue
        if fontSize > 1 and fontSize < 51: #Font ok.
            return fontSize
        #Too small or too big: say which range is accepted.
        print('Font size must be between 2 and 50, try again.')