-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdfsearch.py
55 lines (47 loc) · 1.65 KB
/
pdfsearch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import sys
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
import os
def parse(path, keyword):
    """Return the sorted page numbers of a PDF whose text contains ``keyword``.

    Pages are numbered from 1 in the order ``doc.get_pages()`` yields them.
    Only ``LTTextBoxHorizontal`` layout items are searched, and the match is
    a plain (case-sensitive) substring test against each box's text.

    Args:
        path: Path of the PDF file; it is opened in binary mode.
        keyword: Substring to look for in each text box.

    Returns:
        A sorted list of distinct 1-based page numbers on which at least one
        horizontal text box contains ``keyword``; empty when nothing matches.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.
    """
    with open(path, 'rb') as fp:
        # Link the parser and the document to each other, then initialize.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        res_man = PDFResourceManager()
        device = PDFPageAggregator(res_man, laparams=LAParams())
        interpreter = PDFPageInterpreter(res_man, device)

        # A set records each matching page once, however many boxes match.
        hits = set()
        for page_no, page in enumerate(doc.get_pages(), start=1):
            interpreter.process_page(page)
            layout = device.get_result()
            # any() stops at the first matching box on the page.
            if any(
                isinstance(item, LTTextBoxHorizontal)
                and keyword in item.get_text()
                for item in layout
            ):
                hits.add(page_no)
        return sorted(hits)
def file_name(file_dir):
    """Walk ``file_dir`` and return every ``os.walk`` entry as a list.

    Each element is the ``(dirpath, dirnames, filenames)`` tuple produced by
    ``os.walk``. The first element, when present, describes ``file_dir``
    itself, so ``result[0][2]`` holds the names of the non-directory files
    located directly inside ``file_dir``. The list is empty when ``os.walk``
    yields nothing (for example, when the directory does not exist).
    """
    return list(os.walk(file_dir))
if __name__ == '__main__':
    # Script entry point: search every file directly inside ./556 for the
    # keyword and write one "<file name> <page number>" line per matching
    # page to ./search.log.
    search_dir = './556'
    search_keyword = 'KEAP1'

    walked = file_name(search_dir)
    # os.walk yields nothing for a missing or unreadable directory, so
    # indexing [0] unconditionally would raise IndexError; treat that case
    # as "no files to search".
    names = walked[0][2] if walked else []

    with open('./search.log', 'w') as f:
        for name in names:
            # parse() returns the sorted matching page numbers (possibly empty).
            for page_no in parse(search_dir + '/' + name, search_keyword):
                f.write(name + ' ' + str(page_no) + '\n')