-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
62 lines (46 loc) · 1.68 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from hash_class import MyHashTable
import os
import re
# Reference slides: https://docs.google.com/presentation/d/14N_E4uZL6XqT4bejTkRYDq7sps-hBxN8VQTGXJCpGxg/edit#slide=id.p
# 1. Implement a hash function.
# - def hashcode(arg: string) -> int
# 2. Implement a hash table with chaining
# - insert(key, value)
# - lookup(key) -> value
# - delete(key)
# - __len__() -> int
# - resize(int)
# - items() -> list or generator of [key, value] pairs
# 3. Use hash table to count word frequencies in a book.
# djb2 hash function implementation.
# Absolute path of the book to analyse, resolved relative to this file.
# abspath() is applied first because __file__ may be a relative path when the
# script is run directly on Python < 3.9, and process_data() rejects any path
# that is not absolute.
book_path = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), './books/miller-girl.txt'
)
def process_data(file, encoding="utf-8"):
    """Read a text file and return its contents normalised for word counting.

    Every character that is neither a word character nor whitespace is
    removed, and the remaining text is lower-cased.

    Args:
        file: Absolute path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The cleaned, lower-cased text as a single string.

    Raises:
        FileNotFoundError: If ``file`` is not an existing regular file.
        ValueError: If ``file`` exists but is not an absolute path.
    """
    if not os.path.isfile(file):
        raise FileNotFoundError("File not found")
    if not os.path.isabs(file):
        raise ValueError(f"{file} is not an absolute path.")

    with open(file, 'r', encoding=encoding) as f:
        data = f.read()

    # Strip punctuation. \w also matches digits and "_", so those are kept.
    data = re.sub(r'[^\w\s]', '', data)
    return data.lower()
def frequency_counter(table_items_fn):
    """Print a table of the ten entries with the highest frequency.

    Args:
        table_items_fn: Zero-argument callable returning an iterable of
            (word, frequency) pairs.
    """
    row_template = "{:<15} {:<10}"

    # sorted() is stable, so entries with equal frequency keep the order in
    # which the callable produced them.
    ranked = sorted(
        table_items_fn(), key=lambda entry: entry[1], reverse=True
    )

    print("Top 10 words: \t\n")
    print(row_template.format('Word', 'Frequency'))
    print('-' * 25)
    for word, frequency in ranked[:10]:
        print(row_template.format(word, frequency))
def main():
    """Count word occurrences in the book and print the most frequent ones.

    The cleaned book text is split on whitespace and each word's count is
    kept in a MyHashTable; the table's items are then handed to
    frequency_counter for display.
    """
    table = MyHashTable(30000)

    for token in process_data(book_path).split():
        try:
            seen_so_far = table.get_value(token)
            table.insert(token, seen_so_far + 1)
        except KeyError:
            # First occurrence: presumably get_value raises KeyError for a
            # missing key (verify against MyHashTable), so start the count.
            table.insert(token, 1)

    frequency_counter(table.items)
# Run the word-frequency report only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()