use a contextual config prefix to resolve a method resolution order conflict
nwaughachukwuma committed Nov 15, 2024
1 parent ecd7251 commit 719c1bd
Showing 3 changed files with 27 additions and 44 deletions.
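For context: before this commit, `GoogleSearch` and `KnowledgeSearch` each stored their settings on the same `self.config` attribute. `WebSearch` inherits from both and calls each parent's `__init__`, so the second call silently rebound `self.config` and discarded the first parent's settings. A minimal sketch of that failure mode, with hypothetical placeholder dicts standing in for the repo's real config classes:

```python
# Sketch of the collision this commit fixes. Placeholder dicts stand in
# for GoogleSearchConfig / KnowledgeSearchConfig (hypothetical values).

class GoogleSearch:
    def __init__(self, config=None):
        self.config = config or {"max_results": 5}  # Google settings


class KnowledgeSearch:
    def __init__(self, config=None):
        self.config = config or {"max_sources": 3}  # knowledge settings


class WebSearch(GoogleSearch, KnowledgeSearch):
    def __init__(self):
        GoogleSearch.__init__(self, {"max_results": 10})
        # Rebinds self.config, discarding the Google settings above.
        KnowledgeSearch.__init__(self, {"max_sources": 3})


ws = WebSearch()
print(ws.config)  # {'max_sources': 3} -- the Google settings are gone
```

Renaming the attribute to `google_config` in one parent and `knowledge_config` in the other gives each class its own slot, so both configurations coexist on the combined instance.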
43 changes: 14 additions & 29 deletions src/web_search/google.py
@@ -11,10 +11,10 @@


 class GoogleSearch:
-    config: GoogleSearchConfig
+    google_config: GoogleSearchConfig

-    def __init__(self, config: GoogleSearchConfig | None = None):
-        self.config = config if config else GoogleSearchConfig()
+    def __init__(self, google_config: GoogleSearchConfig | None = None):
+        self.google_config = google_config if google_config else GoogleSearchConfig()

     async def _compile_google_search(self, query: str):
         results = await self._google_search(query)
@@ -26,28 +26,24 @@ async def _google_search(self, query: str, **kwargs):
         """
         params = {
             "q": unquote(query),
-            "key": self.config.api_key,
-            "cx": self.config.cse_id,
+            "key": self.google_config.api_key,
+            "cx": self.google_config.cse_id,
             "num": 5,
         }
         params.update(kwargs)
-        headers = {"Referer": self.config.app_domain}
+        headers = {"Referer": self.google_config.app_domain or ""}

         async with httpx.AsyncClient() as client:
-            response = await client.get(
-                GOOGLE_SEARCH_URL, params=params, headers=headers
-            )
+            response = await client.get(GOOGLE_SEARCH_URL, params=params, headers=headers)
             response.raise_for_status()

         json_data = response.json()

-        items = json_data.get("items", [])[: self.config.max_results]
+        items = json_data.get("items", [])[: self.google_config.max_results]
         result = await self.extract_relevant_items(items)
         return result

-    async def extract_relevant_items(
-        self, search_results: List[Dict[str, Any]]
-    ) -> List[SearchResult]:
+    async def extract_relevant_items(self, search_results: List[Dict[str, Any]]) -> List[SearchResult]:
         """
         Extract relevant items from the search results
         """
@@ -77,22 +73,15 @@ def _is_valid_url(self, url: str) -> bool:
             ".rar",
         )
         invalid_domains = ("youtube.com", "vimeo.com", "facebook.com", "twitter.com")
-        return not (
-            url.endswith(invalid_extensions)
-            or any(domain in url for domain in invalid_domains)
-        )
+        return not (url.endswith(invalid_extensions) or any(domain in url for domain in invalid_domains))

-    async def _process_search_item(
-        self, url: str, item: Dict, char_limit=2000
-    ) -> SearchResult | None:
+    async def _process_search_item(self, url: str, item: Dict, char_limit=2000) -> SearchResult | None:
         """
         Process and fetch the result of a single search item url
         """
         try:
             content = await self._scrape_page_content(url)
-            return SearchResult(
-                url=url, title=item.get("title", ""), preview=content[:char_limit]
-            )
+            return SearchResult(url=url, title=item.get("title", ""), preview=content[:char_limit])
         except Exception:
             return None

@@ -107,9 +96,7 @@ async def _scrape_page_content(self, url: str) -> str:

         soup = BeautifulSoup(response.text, "lxml")
         # Remove unwanted elements
-        for element in soup.find_all(
-            ["script", "style", "nav", "header", "footer", "ads"]
-        ):
+        for element in soup.find_all(["script", "style", "nav", "header", "footer", "ads"]):
             element.decompose()

         content_elements = soup.find_all(
@@ -123,9 +110,7 @@

         # Extract text from found elements
         content = "\n".join(
-            element.get_text(strip=True)
-            for element in content_elements
-            if element.get_text(strip=True)
+            element.get_text(strip=True) for element in content_elements if element.get_text(strip=True)
         )

         # If still no content, try getting all text
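Besides the renames, the second hunk in this file also hardens the Referer header: `app_domain` can apparently be unset, and `app_domain or ""` falls back to an empty string rather than passing `None`, which httpx does not accept as a header value.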
18 changes: 8 additions & 10 deletions src/web_search/knowledge.py
@@ -7,10 +7,10 @@


 class KnowledgeSearch:
-    config: KnowledgeSearchConfig
+    knowledge_config: KnowledgeSearchConfig

-    def __init__(self, config: KnowledgeSearchConfig | None = None):
-        self.config = config if config else KnowledgeSearchConfig()
+    def __init__(self, knowledge_config: KnowledgeSearchConfig | None = None):
+        self.knowledge_config = knowledge_config if knowledge_config else KnowledgeSearchConfig()

     async def fetch_knowledge(self, query: str):
         """
@@ -31,7 +31,7 @@ async def fetch_knowledge(self, query: str):
             if isinstance(result, list):
                 sources.extend(result)

-        sources = sources[: self.config.max_sources]
+        sources = sources[: self.knowledge_config.max_sources]
         return "\n\n".join(str(source) for source in sources if source.preview)

     async def _compile_wikipedia(self, query: str) -> str:
@@ -48,7 +48,7 @@ async def _search_wikipedia(self, query: str) -> list[SearchResult]:
         """
         try:
             sources: list[SearchResult] = []
-            search_results = wikipedia.search(query, results=self.config.max_results)
+            search_results = wikipedia.search(query, results=self.knowledge_config.max_results)

             for title in search_results:
                 try:
@@ -60,9 +60,7 @@ async def _search_wikipedia(self, query: str) -> list[SearchResult]:
                     if not preview:
                         continue

-                    sources.append(
-                        SearchResult(url=page.url, title=page.title, preview=preview)
-                    )
+                    sources.append(SearchResult(url=page.url, title=page.title, preview=preview))
                 except wikipedia.exceptions.DisambiguationError:
                     continue
                 except wikipedia.exceptions.PageError:
@@ -81,7 +79,7 @@ async def _search_arxiv_papers(self, query: str) -> list[SearchResult]:
         params = {
             "search_query": f"all:{query}",
             "start": 0,
-            "max_results": self.config.max_results,
+            "max_results": self.knowledge_config.max_results,
             "sortBy": "relevance",
             "sortOrder": "descending",
         }
@@ -129,7 +127,7 @@ def _extract_relevant_wiki_sections(self, content: str) -> str:

         result = ""
         for p in cleaned_paragraphs:
-            if len(result + p) <= self.config.max_preview_chars:
+            if len(result + p) <= self.knowledge_config.max_preview_chars:
                 result += p + "\n\n"
             else:
                 break
10 changes: 5 additions & 5 deletions src/web_search/search.py
@@ -1,18 +1,18 @@
 import asyncio
 from typing import Any, Coroutine, List

+from .config import WebSearchConfig
 from .google import GoogleSearch
 from .knowledge import KnowledgeSearch
-from .config import WebSearchConfig


 class WebSearch(GoogleSearch, KnowledgeSearch):
     def __init__(self, config: WebSearchConfig | None = None):
-        self.config = config if config else WebSearchConfig()
+        ws_config = config if config else WebSearchConfig()

-        self.sources = self.config.sources
-        GoogleSearch.__init__(self, self.config.google_config)
-        KnowledgeSearch.__init__(self, config=self.config.knowledge_config)
+        GoogleSearch.__init__(self, google_config=ws_config.google_config)
+        KnowledgeSearch.__init__(self, knowledge_config=ws_config.knowledge_config)
+        self.sources = ws_config.sources

     async def search(self, query: str):
         """
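With each parent now owning a distinct attribute, both configurations survive on one `WebSearch` instance. A hypothetical smoke test, assuming the default config objects construct without arguments (as the fallbacks in each `__init__` suggest):

```python
# Hypothetical check: after the rename, each parent keeps its own settings.
ws = WebSearch()
print(ws.google_config.max_results)     # GoogleSearch settings, intact
print(ws.knowledge_config.max_sources)  # KnowledgeSearch settings, intact
print(ws.sources)                       # enabled sources from WebSearchConfig
```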
