-
Notifications
You must be signed in to change notification settings - Fork 2
/
funcs.py
167 lines (139 loc) · 7.58 KB
/
funcs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
import requests
from bs4 import BeautifulSoup
import numpy as np
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer
# Sentence-embedding model used by get_most_similar_strings; it is instantiated once, when this module is imported.
# model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2') # baseline
model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-dot-v1') # seems to perform better than all-MiniLM-L6-v2
def get_page_summary(wiki_page, max_lines=5):
    """
    Retrieves a brief summary of a given Wikipedia page.

    The page's summary text is split on newline characters and only the first
    ``max_lines`` lines are kept. The kept lines are joined back into a single
    string with ". " as the separator, so the result never contains a newline.

    Parameters
    ----------
    wiki_page : WikipediaPage object
        A Wikipedia page object from which the summary is to be extracted. The
        object must have a 'summary' attribute containing the summary text as
        a string.
    max_lines : int, optional
        Maximum number of summary lines to keep. Defaults to 5, which is the
        value that was previously hard-coded.

    Returns
    -------
    str
        The first ``max_lines`` lines of the summary joined with ". ". An empty
        summary yields an empty string.
    """
    summary_lines = wiki_page.summary.split("\n")
    # return just the first few lines if there are multiple
    return ". ".join(summary_lines[:max_lines])
def get_random_wiki_page(wiki_wiki):
    """
    Selects a random Wikipedia page that meets certain validity criteria.

    This function repeatedly requests random Wikipedia pages until it finds one
    that satisfies all of the following:

    * the title contains none of the unwanted characters ``[ ] { } : %``;
    * the title does not start with an unwanted prefix ("Template:",
      "List of", etc.), compared case-insensitively and with underscores
      treated as spaces;
    * the title contains at least one alphabetical character;
    * the page summary (see ``get_page_summary``) has more than 20 words.

    The loop has no retry limit: it keeps requesting pages until one passes.

    Parameters
    ----------
    wiki_wiki : object
        Object exposing a ``page(title)`` method whose result is accepted by
        ``get_page_summary`` (i.e. has a 'summary' attribute).

    Returns
    -------
    str
        The title of a valid random Wikipedia page, exactly as it appears in
        the URL the random-page request was redirected to (underscores are
        kept).
    """
    url = "https://en.wikipedia.org/wiki/Special:Random"
    # various unwanted prefixes, lower-cased once and stored as a tuple so that
    # str.startswith can test all of them in a single call
    bad_prefixes = tuple(prefix.lower() for prefix in (
        "list of", "history of", "Template:", "Wikipedia:",
        "Category:", "Portal:", "Talk:", "Template talk:",
    ))
    unwanted_chars = "[]{}:%"

    while True:
        # the request is redirected to a random page; the final URL carries its title
        response = requests.get(url, timeout=30, allow_redirects=True)
        wiki_title = response.url.split("wiki/")[-1]

        # titles taken from a URL use "_" where the prefixes above use " ",
        # so normalise before comparing (otherwise "List_of_..." slips through)
        comparable_title = wiki_title.replace("_", " ").lower()

        # All three criteria are combined here. Previously the result of the
        # character check was overwritten by the prefix/alpha check and so
        # never had any effect.
        has_unwanted_char = any(char in wiki_title for char in unwanted_chars)
        starts_with_bad_prefix = comparable_title.startswith(bad_prefixes)
        contains_alpha = any(char.isalpha() for char in wiki_title)
        if has_unwanted_char or starts_with_bad_prefix or not contains_alpha:
            continue

        # check if a reasonable page summary is present (more than 20 words)
        summary = get_page_summary(wiki_wiki.page(wiki_title))
        if len(summary.split()) > 20:
            return wiki_title
def validate_pages(wiki_page):
    """
    Filters and validates the linked pages from a given Wikipedia page.

    This function takes a Wikipedia page object and extracts all the links (or
    references to other Wikipedia pages) from it. It then filters out unwanted
    links: those whose title starts with one of a fixed set of prefixes (like
    "Template:", "Wikipedia:", etc.) and those that do not contain any
    alphabetical characters.

    Prefix matching is case-insensitive and treats underscores and spaces as
    the same character, so a prefix written as "User_talk:" rejects both
    "User_talk:Foo" and "User talk:Foo".

    Parameters
    ----------
    wiki_page : WikipediaPage object
        A Wikipedia page object from which the links are to be extracted and
        validated. The object is expected to have a 'links' attribute
        containing a dictionary keyed by linked page title.

    Returns
    -------
    list
        The linked page titles that passed both checks, unmodified and in the
        iteration order of ``wiki_page.links``.
    """
    # various unwanted prefixes
    bad_prefixes = (
        "list of", "history of", "Template:", "Wikipedia:",
        "Category:", "Portal:", "Talk:", "Template talk:",
        "Special:", "Help:", "File:", "Main_Page",
        "User:", "User_talk:", "File_talk:", "MediaWiki:",
        "MediaWiki_talk:", "Module:", "Module_talk:", "Draft:",
        "Draft_talk:", "TimedText:", "TimedText_talk:", "Book:",
        "Book_talk:", "Education_Program:", "Education_Program_talk:",
        "Course:", "Course_talk:", "Special_talk:",
    )
    # normalise once: lower-case, underscores as spaces; a tuple lets
    # str.startswith test every prefix in a single call
    normalized_prefixes = tuple(
        prefix.replace("_", " ").lower() for prefix in bad_prefixes
    )

    return [
        link for link in wiki_page.links
        if not link.replace("_", " ").lower().startswith(normalized_prefixes)
        and any(char.isalpha() for char in link)  # at least one alpha char
    ]
def get_most_similar_strings(reference_string: str, candidates_list: list[str], n: int = 10):
    """
    Identifies the most similar strings to a reference string from a list of candidate strings.

    This function uses the module-level ``model`` to generate embeddings for
    the reference string and for every candidate string, then calculates the
    cosine similarity (``1 - scipy.spatial.distance.cosine``) between the
    reference embedding and each candidate embedding. The candidates are
    ranked by that similarity, highest first, and the top 'n' are returned.

    Parameters
    ----------
    reference_string : str
        The reference string to which the similarity of candidate strings is to be compared.
    candidates_list : list[str]
        A list of candidate strings from which the most similar ones to the reference string are identified.
    n : int, optional
        The number of most similar strings to return. Defaults to 10.

    Returns
    -------
    tuple of (dict, list, list)
        A tuple containing three items:

        * a dictionary mapping each candidate string to its embedding (all
          candidates, not only the top 'n'; duplicate candidate strings
          collapse into a single key);
        * the top 'n' most similar strings from the candidates list, most
          similar first;
        * their corresponding similarity scores, in the same order. A score
          of 1 indicates identical direction; scores can be negative because
          cosine similarity is not restricted to [0, 1].

        If ``candidates_list`` is empty, ``({}, [], [])`` is returned without
        calling the model.
    """
    # nothing to rank: skip the model entirely
    if not candidates_list:
        return {}, [], []

    reference_embedding = model.encode([reference_string])[0]
    encoded_strings = model.encode(candidates_list)
    topics_embs = dict(zip(candidates_list, encoded_strings))

    # scipy's cosine() is a distance, so 1 - distance gives the similarity
    similarities = [1 - cosine(reference_embedding, encoded_str) for encoded_str in encoded_strings]

    # argsort is ascending; reverse it for best-first and keep the first n
    most_similar_indices = np.argsort(similarities)[::-1][:n]
    top_strings = [candidates_list[i] for i in most_similar_indices]
    top_scores = [similarities[i] for i in most_similar_indices]
    return topics_embs, top_strings, top_scores
def search_wiki(search_term):
    """
    Searches Wikipedia for a term and returns the most relevant page title.

    A full-text search request is sent to the English Wikipedia search page
    and the result headings are scanned in the order they appear. The first
    result whose title does not start with an unwanted prefix ("List of",
    "Template:", etc.; compared case-insensitively with underscores treated
    as spaces) is returned.

    Parameters
    ----------
    search_term : str
        The text to search for. Runs of whitespace are collapsed to a single
        space before the request is made.

    Returns
    -------
    str or None
        The title of the first acceptable result, taken from the result
        link's href with "/wiki/" removed (so underscores are kept), or None
        if no result qualifies.
    """
    search_url = "https://en.wikipedia.org/w/index.php"
    # Passing the query via `params` lets requests URL-encode it, so characters
    # such as "&", "#" or "+" in the search term can no longer corrupt the
    # query string.
    params = {
        "search": " ".join(search_term.split()),
        "title": "Special:Search",
        "profile": "advanced",
        "fulltext": "1",
        "ns0": "1",
    }
    response = requests.get(search_url, params=params, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    # lower-cased once, as a tuple, so str.startswith can test them all in one call
    bad_prefixes = tuple(prefix.lower() for prefix in (
        "list of", "history of", "Template:", "Wikipedia:",
        "Category:", "Portal:", "Talk:", "Template talk:",
    ))

    for result in soup.find_all("div", class_="mw-search-result-heading"):
        anchor = result.a
        # .get() avoids a KeyError on an anchor that has no href attribute
        href = anchor.get("href") if anchor else None
        if not href:
            continue
        text = href.replace("/wiki/", "").strip()
        if not text.lower().replace("_", " ").startswith(bad_prefixes):
            return text
    return None