forked from nainiayoub/pdf-text-data-extractor
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
48 lines (39 loc) · 1.95 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import streamlit as st
from functions import convert_pdf_to_txt_pages, convert_pdf_to_txt_file, save_pages, displayPDF
# Streamlit page: upload a PDF, preview it, and download its text either as a
# single .txt file or as a ZIP archive holding one text file per page.

# Page header rendered as Markdown (title, social badges, short description).
st.markdown("""
## :outbox_tray: Text data extractor: PDF to Text
[![Twitter](https://img.shields.io/twitter/url?label=Twitter&style=social&url=https%3A%2F%2Ftwitter.com%2Fnainia_ayoub)](https://www.twitter.com/nainia_ayoub)
[![Linkedin](https://img.shields.io/twitter/url?label=Linkedin&logo=linkedin&style=social&url=https%3A%2F%2Fwww.linkedin.com%2Fin%2Fayoub-nainia%2F%3Flocale%3Den_US)](https://www.linkedin.com/in/ayoub-nainia/?locale=en_US)
[![GitHub](https://img.shields.io/twitter/url?label=Github&logo=GitHub&style=social&url=https%3A%2F%2Fgithub.com%2Fnainiayoub)](https://github.com/nainiayoub)
Before extracting information from a document, we have to extract text data first.
Hence, this PDF text data extractor was created.
""")

# Sidebar: app title and the choice of output format.
with st.sidebar:
    st.title("PDF to Text")
    textOutput = st.selectbox(
        "How do you want your output data?",
        ('One text file (.txt)', 'Text file per page (ZIP)'),
    )

# NOTE(review): the uploader is placed in the main page area here; confirm it
# was not meant to live inside the sidebar block above.
pdf_file = st.file_uploader("Load your PDF file", type="pdf")

if pdf_file:
    # Display the uploaded document inside a collapsible section.
    with st.expander("Display document"):
        displayPDF(pdf_file)

    # Convert the PDF to text in the format selected in the sidebar.
    if textOutput == 'One text file (.txt)':
        text_data_f, nbPages = convert_pdf_to_txt_file(pdf_file)
        st.info(f"{nbPages} pages in total.")
        # Give the download an explicit name, mirroring the ZIP branch below.
        st.download_button(
            "Download txt file",
            text_data_f,
            file_name="pdf_to_txt.txt",
        )
    else:
        text_data, nbPages = convert_pdf_to_txt_pages(pdf_file)
        st.info(f"{nbPages} pages in total.")
        zipPath = save_pages(text_data)
        # Offer the archive produced by save_pages for download.
        with open(zipPath, "rb") as fp:
            st.download_button(
                label="Download ZIP (txt)",
                data=fp,
                file_name="pdf_to_txt.zip",
                mime="application/zip",
            )