Skip to content
This repository has been archived by the owner on Nov 20, 2022. It is now read-only.

Commit

Permalink
parse two weeks of ipp menu
Browse files: browse the repository at this point in the history
  • Loading branch information
srehwald committed Nov 27, 2017
1 parent cbbd0de commit add3b31
Showing 1 changed file with 21 additions and 18 deletions.
39 changes: 21 additions & 18 deletions src/menu_parser.py
Original file line number | Diff line number | Diff line change
@@ -213,28 +213,31 @@ def parse(self, location):
tree = html.fromstring(page.content)
# get url of current pdf menu
xpath_query = tree.xpath("//a[contains(text(), 'KW-')]/@href")
pdf_url = xpath_query[0] if len(xpath_query) >= 1 else None

if pdf_url is None:
if len(xpath_query) < 1:
return None

# Example PDF-name: KW-48_27.11-01.12.10.2017-3.pdf
pdf_name = pdf_url.split("/")[-1]
year = int(pdf_name.replace(".pdf","").split(".")[-1].split("-")[0])
week_number = int(pdf_name.split("_")[0].replace("KW-","").lstrip("0"))
menus = {}
# consider first two pdfs found (i.e. run for current and next week)
for pdf_url in xpath_query[:2]:
# Example PDF-name: KW-48_27.11-01.12.10.2017-3.pdf
pdf_name = pdf_url.split("/")[-1]
year = int(pdf_name.replace(".pdf","").split(".")[-1].split("-")[0])
week_number = int(pdf_name.split("_")[0].replace("KW-","").lstrip("0"))

with tempfile.NamedTemporaryFile() as temp_pdf:
# download pdf
response = requests.get(pdf_url)
temp_pdf.write(response.content)
with tempfile.NamedTemporaryFile() as temp_txt:
# convert pdf to text by calling pdftotext; only convert first page to txt (-l 1)
call(["pdftotext", "-l", "1", "-layout", temp_pdf.name, temp_txt.name])
with open(temp_txt.name, 'r') as myfile:
# read generated text file
data = myfile.read()
menus.update(self.get_menus(data, year, week_number))

with tempfile.NamedTemporaryFile() as temp_pdf:
# download pdf
response = requests.get(pdf_url)
temp_pdf.write(response.content)
with tempfile.NamedTemporaryFile() as temp_txt:
# convert pdf to text by calling pdftotext; only convert first page to txt (-l 1)
call(["pdftotext", "-l", "1", "-layout", temp_pdf.name, temp_txt.name])
with open(temp_txt.name, 'r') as myfile:
# read generated text file
data = myfile.read()
menus = self.get_menus(data, year, week_number)
return menus
return menus

def get_menus(self, text, year, week_number):
menus = {}
Expand Down

0 comments on commit add3b31

Please sign in to comment.