ingest.py
from sys import argv
from time import sleep
from requests import get, exceptions
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from constants import store, embeddingCollection, rockset as rs
from queries import link_exists as link_exists_query
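
# With --reset, drop and recreate the embedding collection before ingesting.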
if "--reset" in argv:
if embeddingCollection.exists():
embeddingCollection.delete()
while embeddingCollection.exists():
sleep(1)
embeddingCollection.create()
while not embeddingCollection.exists():
sleep(1)
while not embeddingCollection.is_ready():
sleep(1)
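
# Split page text into ~1000-character chunks with 120 characters of overlap before embedding.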
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=120,
    length_function=len,
    add_start_index=True,
)

class LinkNode():
    def __init__(self, link, next=None):
        self.link = link
        self.next = next
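
# Minimal singly linked FIFO queue of links left to crawl.
# add_elem_links accepts either bare URL strings or <a> tags with an href attribute.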
class LinkQueue():
    def __init__(self, init_value=None):
        self.first = LinkNode(init_value, None) if init_value is not None else None
        self.last = self.first

    def _add(self, link):
        node = LinkNode(link)
        if self.first is None and self.last is None:  # empty queue
            self.first = node
        else:
            self.last.next = node
        self.last = node

    def remove(self):
        if self.first is self.last:  # one item in queue
            link = self.first.link
            self.first = None
            self.last = None
            return link
        prev_first = self.first
        self.first = self.first.next
        return prev_first.link

    def is_empty(self):
        return self.first is None

    def add_elem_links(self, a_elems):
        for i in a_elems:
            self._add(i if i.__class__ is str else i["href"])

    def __str__(self) -> str:
        if self.is_empty():
            return "[]"
        res = ""
        curr = self.first
        while curr is not None:
            res += f"{curr.link}, "
            curr = curr.next
        return f"[{res[:-2]}]"
class Scraper():
    def _cleanse(self, link):
        paramLoc = link.find("?")
        if paramLoc > 0:
            link = link[:paramLoc]
        hashLoc = link.find("#")
        if hashLoc > 0:
            link = link[:hashLoc]
        if link.startswith("/"):
            link = "https://zelda.fandom.com" + link
        return link
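
    # Keep only wiki article links: skip logs, talk/user/special pages, galleries, templates, and images.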
    def _is_valid(self, link: str):
        return (
            not link.startswith("#") and
            (link.startswith("https://zelda.fandom.com/wiki") or link.startswith("/")) and
            ":Log" not in link and
            ":AbuseLog" not in link and
            "talk" not in link.lower() and
            "ListFiles" not in link and
            "Image_Requests" not in link and
            "Contributions" not in link and
            ":Search" not in link and
            "User" not in link and
            "AbuseFilter" not in link and
            "Gallery" not in link and
            "Special" not in link and
            "Artwork" not in link and
            "Guidelines" not in link and
            "Help" not in link and
            "Template" not in link and
            "Community:" not in link and
            ".png" not in link and
            ".jpg" not in link
        )

    def _is_category(self, link):
        return "Category:" in link
    def _scrape(self, link, add_embedding):
        soup = BeautifulSoup(get(link).text, "html.parser")
        if add_embedding:
            print(f"Adding document for {link}")
            if self._is_category(link):
                # we do not need to generate embeddings for this page,
                # but we still need to add it to the collection to
                # make sure we don't scrape it again
                embeddingCollection.add_doc({
                    "source": link,
                    "embedding": None
                })
            else:
                page_title = soup.find("title").get_text()
                page_text = soup.find(class_="page__main").get_text().replace("\n\n", "\n")
                docs = text_splitter.create_documents([page_text], [{"source": link}])
                store.add_texts(
                    texts=[f"This information is about {page_title}. {doc.page_content}" for doc in docs],
                    metadatas=[doc.metadata for doc in docs]
                )
        return soup
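
    # A link counts as already scraped if the link_exists query returns any rows.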
    def _has_been_scraped(self, link):
        return len(
            rs.Queries.query(
                sql=self.link_exists(link)
            ).results
        ) > 0
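
    # Crawl loop: pop links from the queue, scrape valid unseen pages, and enqueue the links found on them.
    # self.first is True only while the initial seed links are processed; those are fetched without
    # being embedded or checked against the collection.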
    def __init__(self, starting_links):
        self.link_exists = link_exists_query
        self.first = True
        links = LinkQueue()
        links.add_elem_links(starting_links)
        i = len(starting_links)
        while not links.is_empty():
            curr_link = self._cleanse(links.remove())
            if self.first or (self._is_valid(curr_link) and not self._has_been_scraped(curr_link)):
                try:
                    soup = self._scrape(curr_link, not self.first)
                except exceptions.RequestException:
                    continue  # skip links that fail to load
                links.add_elem_links(soup.find_all("a", {"href": lambda value: value}))
            else:
                print(f"skipping {curr_link}")
            i = i - 1
            if i <= 1:
                self.first = False

if __name__ == "__main__":
    Scraper(["https://zelda.fandom.com/wiki/Main_Page"])