import csv
import heapq
import os
from dataclasses import dataclass
from urllib.parse import quote
from urllib.request import urlopen
import utils

DEFAULT_ORIGIN_URL = "http://cs5700cdnorigin.ccs.neu.edu:8080"
ON_DISK = -1
NOT_CACHED = -2
APPEND = -3


@dataclass(order=True)
class LookupInfo:
"""
    A simple data class that stores the views and buffer offset of
    an article in the cache. The buffer offset can be:
    -2 (NOT_CACHED) ==> article not cached
    -1 (ON_DISK)    ==> article cached on disk
    >=0             ==> offset of the article within the in-memory buffer
"""
views: int
buffer_offset: int
# This is redundant, but sometimes we only have access to
# LookupInfo and need to determine what article it is.
article_name: str
def increment_views(self):
self.views += 1
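

# An illustrative sketch (comments only, not part of the running code) of why
# order=True matters here: LookupInfo instances compare field by field, so a
# heap of them behaves as a min-heap keyed on views, which is exactly what
# attempt_evict_and_add relies on when it pops the least-viewed article.
#
#   h = [LookupInfo(10, ON_DISK, "A"), LookupInfo(3, 0, "B")]
#   heapq.heapify(h)
#   heapq.heappop(h).article_name  # -> "B" (fewest views pops first)
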
class RepliCache:
"""
A dynamic caching layer for the CDN. Uses disk as well as memory for caching articles.
Tracks all the articles that are served via the CDN. If an article is not cached,
fetches it from the origin and attempts to cache it. Also tracks the disk and memory
usage and conservatively stops at 19MB for both.
"""
def __init__(self, origin_url: str = DEFAULT_ORIGIN_URL, test_mode: bool = False):
self.articles = {}
self.heap = []
self.buffer = []
self.origin_url = origin_url
self.disk_used = 0
self.max_disk_size = 19 * 1024 * 1024 # conservatively stopping at 19MB
self.memory_used = 0
self.max_memory_size = 19 * 1024 * 1024 # conservatively stopping at 19MB
self.test_mode = test_mode
self.build()
def build(self):
"""
We go through the CSV file as a starting point and then load
those articles from disk. We fit as many as we can into memory
and let the rest remain on disk. The ones that are moved to
memory get deleted from the disk.
This will lead to free space on disk that can be used for caching
articles that might get fetched from origin.
"""
with open("pageviews.csv") as article_file:
reader = csv.DictReader(article_file)
for row in reader:
article = quote(row["article"].replace(" ", "_"))
views = int(row["views"])
if os.path.exists(f"cache/{article}"):
compressed_article = self.get_from_disk_cache(article)
lookup_info = self.add(article, compressed_article, views)
if lookup_info.buffer_offset == NOT_CACHED:
# Don't break, continue so that any potentially
# smaller article down the line can be cached.
continue
# Remove from disk cache if loaded into memory
if not self.test_mode and lookup_info.buffer_offset != ON_DISK:
os.remove(f"cache/{article}")
else:
lookup_info = LookupInfo(views, NOT_CACHED, article)
self.articles[article] = lookup_info
print("Cache built")
    def get(self, article: str) -> tuple[bool, bytes]:
        """
        Attempt to fetch an article from the cache.
        :param article: Request path for the article
        :return: The boolean indicates whether the article exists; if it does,
        the second element holds the compressed bytes of the article
        (otherwise it is None).
        """
try:
return self.get_helper(article)
except: # Being super defensive about this
return True, utils.compress_article(self.fetch_from_origin(article))
    def get_helper(self, article: str) -> tuple[bool, bytes]:
# Strip away leading slash from URLs
if article[0] == "/":
article = article[1:]
if not utils.is_url_encoded(article):
article = quote(article)
if article not in self.articles:
return False, None
lookup_info: LookupInfo = self.articles[article]
lookup_info.increment_views()
print(lookup_info)
if lookup_info.buffer_offset == NOT_CACHED:
# (CACHE MISS) fetch from origin and cache to disk
print(f"{article}: Not cached, fetching from origin")
# Fetch article from origin and compress it
article_raw_bytes = self.fetch_from_origin(article)
compressed_article = utils.compress_article(article_raw_bytes)
            # Optimistically cache it (in memory if possible, else on disk) if space allows
if (
self.add(
article, compressed_article, self.articles[article].views
).buffer_offset
== NOT_CACHED
):
self.attempt_evict_and_add(article, compressed_article)
return True, compressed_article
elif lookup_info.buffer_offset == ON_DISK:
            # (DISK CACHE HIT) fetch the compressed article from disk and serve it
print(f"{article}: Serving from disk cache")
compressed_article = self.get_from_disk_cache(article)
return True, compressed_article
else:
# (IN-MEMORY CACHE HIT) fetch from in-memory cache
print(f"{article}: Serving from in-memory cache")
return True, self.buffer[lookup_info.buffer_offset]
def add(self, article: str, article_raw_bytes: bytes, views: int) -> LookupInfo:
"""
Attempts to add a new article to the cache.
:param article: Request path of article to add
:param article_raw_bytes: Raw bytes representing the article itself
:param views: Number of page views
        :return: The LookupInfo recorded for the article; its buffer_offset
        indicates whether and where the article was cached.
"""
buffer_offset = self.add_to_in_memory_cache(article_raw_bytes)
if buffer_offset == NOT_CACHED:
buffer_offset = self.add_to_disk_cache(article, article_raw_bytes)
lookup_info = LookupInfo(views, buffer_offset, article)
if lookup_info.buffer_offset != NOT_CACHED:
self.heap.append(lookup_info)
self.articles[article] = lookup_info
return lookup_info
def add_to_in_memory_cache(
self, article_raw_bytes: bytes, buffer_offset: int = APPEND
) -> int:
"""
Attempts to add an article to the in-memory cache.
"""
if self.fits_in_memory_cache(article_raw_bytes):
self.memory_used += len(article_raw_bytes)
if buffer_offset == APPEND:
self.buffer.append(article_raw_bytes)
return len(self.buffer) - 1
else:
self.buffer[buffer_offset] = article_raw_bytes
return buffer_offset
return NOT_CACHED
def remove_from_in_memory_cache(self, article: str) -> int:
"""
        Since the buffer is a list, there is nothing to truly "remove" here.
        We set the element at the article's offset to None so that the garbage
        collector can reclaim the memory, and we return the now-empty
        buffer_offset so the caller can refill it with a new article.
        attempt_evict_and_add_in_memory does exactly that, which keeps the
        buffer free of holes.
        :return: buffer_offset: offset within the in-memory buffer that the article occupied
"""
lookup_info = self.articles[article]
self.memory_used -= len(self.buffer[lookup_info.buffer_offset])
self.buffer[lookup_info.buffer_offset] = None
return lookup_info.buffer_offset
def add_to_disk_cache(self, article: str, article_raw_bytes: bytes) -> int:
"""
Attempts to add an article to the disk cache.
"""
        # Respect the disk budget before writing; mirrors fits_in_memory_cache.
        if self.disk_used + len(article_raw_bytes) > self.max_disk_size:
            return NOT_CACHED
        try:
with open(f"cache/{article}", "wb") as cache_file:
cache_file.write(article_raw_bytes)
self.disk_used += len(article_raw_bytes)
return ON_DISK
except IOError:
return NOT_CACHED
def remove_from_disk_cache(self, article: str):
filepath = f"cache/{article}"
file_size = os.path.getsize(filepath)
os.remove(filepath)
self.disk_used -= file_size
def attempt_evict_and_add(
self, article_name_to_promote: str, compressed_article_to_promote: bytes
) -> bool:
"""
        Promotion: called when the cache is full and a new article needs to be added.
        We find the cached article with the minimum views and compare its views with
        those of the new article. If the new article has more views, we perform further
        checks to ensure the new article can fit in the cache (memory or disk, either
        works) once the old one is evicted. If that check succeeds, we evict the old
        article and add the new one in its place.
        To make the lookup of the minimum-views article fast, we employ a heap. We run
        heapify lazily, i.e. only right before we need to pop the minimum element; at
        other times the heap order is not needed. Heapify costs O(n) and the pop costs
        O(log n), which is cheap for the number of articles we anticipate in the cache
        (~400), and it lets us read the minimum-views article in O(1) once ordered.
"""
heapq.heapify(self.heap)
lookup_info_to_evict: LookupInfo = heapq.heappop(self.heap)
lookup_info_to_promote: LookupInfo = self.articles[article_name_to_promote]
eligible = lookup_info_to_evict.views < lookup_info_to_promote.views
if not eligible:
# If we can't evict, add back to the heap
self.heap.append(lookup_info_to_evict)
return False
        else:
            # The eviction candidate's bytes may live in memory or on disk.
            if lookup_info_to_evict.buffer_offset == ON_DISK:
                compressed_article_to_evict = self.get_from_disk_cache(
                    lookup_info_to_evict.article_name
                )
            else:
                compressed_article_to_evict = self.buffer[
                    lookup_info_to_evict.buffer_offset
                ]
            promoted = self.attempt_evict_and_add_in_memory(
                article_name_to_promote,
                lookup_info_to_promote,
                compressed_article_to_promote,
                lookup_info_to_evict,
                compressed_article_to_evict,
            ) or self.attempt_evict_and_add_on_disk(
                article_name_to_promote,
                lookup_info_to_promote,
                compressed_article_to_promote,
                lookup_info_to_evict,
                compressed_article_to_evict,
            )
            if promoted:
                # Track the newly cached article on the heap so that it can
                # itself be considered for eviction later on.
                self.heap.append(lookup_info_to_promote)
            else:
                # Neither swap succeeded; the eviction candidate stays cached,
                # so put it back on the heap.
                self.heap.append(lookup_info_to_evict)
            return promoted
def attempt_evict_and_add_in_memory(
self,
article_name_to_promote: str,
lookup_info_to_promote: LookupInfo,
compressed_article_to_promote: bytes,
lookup_info_to_evict: LookupInfo,
compressed_article_to_evict: bytes,
) -> bool:
"""
        Checks whether a swap is possible in memory; if so, performs the swap
        and updates the relevant LookupInfo records.
"""
        # The in-memory swap only applies when the eviction candidate actually
        # lives in the in-memory buffer.
        if lookup_info_to_evict.buffer_offset in (ON_DISK, NOT_CACHED):
            return False
        swap_possible_in_memory = (
            self.memory_used
            - len(compressed_article_to_evict)
            + len(compressed_article_to_promote)
        ) <= self.max_memory_size
        if not swap_possible_in_memory:
            return False
        # Free the evicted article's buffer slot first (while its offset is
        # still valid), then place the new article into that same slot so the
        # buffer does not develop holes.
        deleted_buffer_offset = self.remove_from_in_memory_cache(
            lookup_info_to_evict.article_name
        )
        lookup_info_to_evict.buffer_offset = NOT_CACHED
        lookup_info_to_promote.buffer_offset = self.add_to_in_memory_cache(
            compressed_article_to_promote,
            deleted_buffer_offset,
        )
print(
f"Promoted {article_name_to_promote}, evicted {lookup_info_to_evict.article_name}"
)
return True
def attempt_evict_and_add_on_disk(
self,
article_name_to_promote: str,
lookup_info_to_promote: LookupInfo,
compressed_article_to_promote: bytes,
lookup_info_to_evict: LookupInfo,
compressed_article_to_evict: bytes,
    ) -> bool:
"""
        Checks whether a swap is possible on disk; if so, performs the swap
        and updates the relevant LookupInfo records.
"""
        # A disk swap only makes sense when the eviction candidate actually
        # lives on disk; evicting an in-memory article frees no disk space.
        if lookup_info_to_evict.buffer_offset != ON_DISK:
            return False
        swap_possible_on_disk = (
            self.disk_used
            - len(compressed_article_to_evict)
            + len(compressed_article_to_promote)
        ) <= self.max_disk_size
        if not swap_possible_on_disk:
            return False
        # This is the reason we keep a redundant reference to article_name
        # within LookupInfo.
        self.remove_from_disk_cache(lookup_info_to_evict.article_name)
        lookup_info_to_evict.buffer_offset = NOT_CACHED
        lookup_info_to_promote.buffer_offset = ON_DISK
self.add_to_disk_cache(
lookup_info_to_promote.article_name, compressed_article_to_promote
)
print(
f"Promoted {article_name_to_promote}, evicted {lookup_info_to_evict.article_name}"
)
return True
def fetch_from_origin(self, article: str) -> bytes:
with urlopen(f"{self.origin_url}/{article}") as response:
return response.read()
def fits_in_memory_cache(self, article_raw_bytes: bytes) -> bool:
return self.memory_used + len(article_raw_bytes) <= self.max_memory_size
@staticmethod
def get_from_disk_cache(article: str) -> bytes:
with open(f"cache/{article}", "rb") as fd:
return fd.read()


if __name__ == "__main__":
    cache = RepliCache(test_mode=True)
    print(
        f"memory used: {cache.memory_used / (1024 * 1024):.2f} MB, "
        f"disk used: {cache.disk_used / (1024 * 1024):.2f} MB"
    )
from timeit import default_timer as timer
    # Time a few representative lookups.
    for name in ("Rishi_Sunak", "Prabhas", "Jeff_Bridges"):
        start = timer()
        cache.get(name)
        end = timer()
        print(f"{name}: {(end - start) * 1000:.2f}ms")