-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparser_paylah.py
109 lines (88 loc) · 3.19 KB
/
parser_paylah.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import csv
import json
import os
from pathlib import Path
from re import compile
from bs4 import BeautifulSoup
def parse_paylah_html(html_str: str) -> "dict | None":
    """Extract the transaction fields from a PayLah alert email body.

    Args:
        html_str: HTML markup of the email body.

    Returns:
        A dict with the keys ``txn_type``, ``txn_id``, ``txn_time``,
        ``txn_amount``, ``txn_from`` and ``txn_to`` (in that order), or
        ``None`` when the table around the "Date & Time:" label cannot be
        found or does not yield exactly eight text cells.
    """
    soup = BeautifulSoup(html_str, "html.parser")

    # Transaction reference: everything after the first colon, "NA" if absent.
    # maxsplit=1 keeps a reference intact even if it contains a colon itself.
    ref_node = soup.find(string=compile("Transaction Ref:.*"))
    if ref_node:
        txn_id = ref_node.string.strip().split(":", 1)[1].strip()
    else:
        txn_id = "NA"

    # The details table is located through its "Date & Time:" label.
    label = soup.find(string="Date & Time:")
    table_body = label.find_parent("tbody") if label is not None else None
    cells = table_body.find_all("td") if table_body is not None else []

    # Tag.string is None for cells without a single text child; skip those
    # explicitly instead of hiding every possible error behind a bare except.
    texts = [cell.string.strip() for cell in cells if cell.string is not None]

    # Expected layout: four (label, value) pairs -> eight text cells.
    if len(texts) != 8:
        return None

    _, when, _, amount, _, sender, _, recipient = texts

    return {
        "txn_type": "PayLah",
        "txn_id": txn_id,
        # Keeps characters 6..11 only; the original comment says this strips
        # the date away (assumes a fixed-width date prefix - TODO confirm).
        "txn_time": when[6:12].strip(),
        # Drops the first three characters (the 'SGD' prefix per the
        # original comment).
        "txn_amount": amount[3:],
        # NOTE(review): "0920" presumably identifies the owner's own account;
        # such senders are collapsed to "me".
        "txn_from": "me" if "0920" in sender else sender,
        "txn_to": recipient,
    }
def main(output_dir="output"):
    """Build the master PayLah transaction files from saved email JSON files.

    Reads every ``*.json`` file in ``<output_dir>/paylah``, parses those whose
    subject is "Transaction Alerts" with ``parse_paylah_html``, and writes the
    parsed transactions, sorted by ``txn_date``, to ``master_paylah.json`` and
    ``master_paylah.csv`` inside ``output_dir``.

    Args:
        output_dir: Directory that contains the ``paylah`` input folder and
            receives the two output files.

    Raises:
        KeyError: If a parsed transaction email has no ``date`` field.
    """
    output_dir = Path(output_dir)
    paylah_dir = output_dir / "paylah"
    # Sorted so the result does not depend on the directory listing order.
    paylah_files = sorted(
        paylah_dir / name
        for name in os.listdir(paylah_dir)
        if name.endswith(".json")
    )

    all_data_dicts = []
    for paylah_file in paylah_files:
        with open(paylah_file, "r", encoding="utf-8") as f:
            email_data = json.load(f)

        # A missing subject or body is treated like any other
        # non-transaction email instead of raising.
        data_dict = None
        paylah_html = email_data.get("body")
        if email_data.get("subject") == "Transaction Alerts" and paylah_html:
            data_dict = parse_paylah_html(paylah_html)

        if not data_dict:
            print(f"Skipping '{paylah_file}' (likely not a transaction email)")
            continue

        data_dict["txn_date"] = email_data["date"]
        all_data_dicts.append(data_dict)

    # NOTE(review): dates are compared as strings, so this is chronological
    # only if the stored format sorts lexicographically - confirm.
    all_data_dicts.sort(key=lambda x: x["txn_date"])

    out_json = output_dir / "master_paylah.json"
    with out_json.open("w", encoding="utf-8") as f:
        json.dump(all_data_dicts, f, indent=4)
    print(f"Saved {len(all_data_dicts)} transactions to master_paylah.json")

    fieldnames = [
        "txn_type",
        "txn_id",
        "txn_date",
        "txn_time",
        "txn_amount",
        "txn_from",
        "txn_to",
    ]
    out_csv = output_dir / "master_paylah.csv"
    # newline="" is required by the csv module; without it, platforms that
    # translate "\n" emit an extra blank line after every row.
    with out_csv.open("w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_data_dicts)
    print(f"Saved {len(all_data_dicts)} transactions to master_paylah.csv")
# Script entry point: run main() with its default output directory when this
# file is executed directly rather than imported.
if __name__ == "__main__":
    main()