-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdf_to_txt.txt
66 lines (52 loc) · 1.98 KB
/
pdf_to_txt.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import io
import requests
import openpyxl
import os,sys
import pandas as pd
import numpy as np
import csv
import PyPDF2
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
# Show the directory the script was started from.
print(os.getcwd())
# Move into the project folder so the relative paths used later in the script
# resolve there.  The path is a raw string because it contains backslashes
# ("\I", "\G") that are not valid escape sequences in a normal string literal
# and trigger a SyntaxWarning on recent Python versions.
os.chdir(r"S:\IFI Applications\Green_Bong_Makerfest")
# Show the working directory again to confirm the change took effect.
print(os.getcwd())
def convert_pdf_to_txt(path):
    '''Extract the text of a PDF file using pdfminer.

    :path the file path of the PDF; the file is opened in binary mode
    :return the text collected by the converter, as one string
    '''
    resource_manager = PDFResourceManager()
    layout_params = LAParams()
    # All three resources are released when the block exits: the in-memory
    # text buffer, the pdfminer converter writing into it, and the PDF file.
    with io.StringIO() as text_buffer, \
            TextConverter(resource_manager, text_buffer, codec='utf-8',
                          laparams=layout_params) as device, \
            open(path, 'rb') as pdf_file:
        interpreter = PDFPageInterpreter(resource_manager, device)
        # An empty page set, maxpages=0 and an empty password are passed, i.e.
        # no explicit page selection, page limit or password is requested.
        pages = PDFPage.get_pages(pdf_file,
                                  set(),
                                  maxpages=0,
                                  password="",
                                  caching=True,
                                  check_extractable=True)
        for page in pages:
            interpreter.process_page(page)
        # Read the buffer while it is still open; it is closed on exit.
        return text_buffer.getvalue()
if __name__ == "__main__":
    # Convert every "*.pdf" entry of the current working directory into a
    # UTF-8 text file placed next to it.
    for pdf_name in os.listdir("."):
        if not pdf_name.endswith(".pdf"):
            continue
        try:
            text = convert_pdf_to_txt(pdf_name)
            # The output name is the full PDF name with ".txt" appended
            # (e.g. "report.pdf" -> "report.pdf.txt"), as before.
            with open(pdf_name + ".txt", "w", encoding="utf-8") as out_file:
                out_file.write(text)
        except Exception as exc:
            # Best effort: one unreadable PDF must not stop the batch, but the
            # failure is reported instead of being silently discarded.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            print("Skipping %s: %s" % (pdf_name, exc), file=sys.stderr)