Commit 79b985e: no message
esteininger committed Mar 17, 2024
1 parent d8cbfd0 commit 79b985e
Showing 13 changed files with 32 additions and 48 deletions.
28 changes: 6 additions & 22 deletions src/api/parse/controller.py
@@ -1,29 +1,13 @@
 from fastapi import APIRouter, HTTPException, Body, Depends, Request, Path

-from .model import ParseFileRequest, SupportedModalities
+from .model import ParseFileRequest
 from .service import ParseHandler

 router = APIRouter()


-@router.post("/{modality}")
-async def parse_file(
-    request: Request,
-    request_body: ParseFileRequest,
-    modality: SupportedModalities = Path(...),
-):
-    parse_handler = ParseHandler(request.index_id)
-
-    if len(request_body.file_urls) == 0:
-        raise HTTPException(status_code=400, detail="No file_urls provided")
-
-    if len(request_body.file_urls) > 1:
-        raise HTTPException(
-            status_code=400, detail="Multiple file_urls are not supported yet"
-        )
-    try:
-        return await parse_handler.run_handler_once(
-            modality=modality, file_url=request_body.file_urls[0]
-        )
-    except Exception as e:
-        raise HTTPException(status_code=400, detail=str(e))
+# @router.post("/{modality}")
+# async def parse_file(
+#     request: Request,
+#     request_body: ParseFileRequest
+# ):
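
The route is commented out rather than deleted, and the request model (see the model.py diff below) now carries a single URL. For orientation only, here is a hypothetical sketch of how the handler might look if re-enabled against the new model; it reuses ParseHandler.run_handler_once and request.index_id from the removed code, and the SupportedModalities import this commit removes would need restoring.

# Hypothetical re-enabled handler; NOT part of this commit, which
# leaves the route commented out.
from fastapi import APIRouter, HTTPException, Request, Path

from .model import ParseFileRequest, SupportedModalities  # import removed by this commit
from .service import ParseHandler

router = APIRouter()


@router.post("/{modality}")
async def parse_file(
    request: Request,
    request_body: ParseFileRequest,
    modality: SupportedModalities = Path(...),
):
    # index_id was attached to the request in the removed code,
    # presumably by upstream middleware.
    parse_handler = ParseHandler(request.index_id)
    try:
        # ParseFileRequest now carries a single file_url, so the old
        # empty-list and multiple-URL checks are no longer needed.
        return await parse_handler.run_handler_once(
            modality=modality, file_url=request_body.file_url
        )
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))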
2 changes: 1 addition & 1 deletion src/api/parse/model.py
@@ -11,4 +11,4 @@ class SupportedModalities(str, Enum):


 class ParseFileRequest(BaseModel):
-    file_urls: Union[str, List[str]]
+    file_url: str
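
A parse request now supplies exactly one URL string instead of a string-or-list union. Illustrative usage, not taken from the diff:

from .model import ParseFileRequest

# Illustrative only: the tightened model accepts a single URL string.
ParseFileRequest(file_url="https://example.com/doc.pdf")     # valid
ParseFileRequest(file_urls=["https://example.com/doc.pdf"])  # raises ValidationError: file_url is missing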
10 files renamed without changes.
50 changes: 25 additions & 25 deletions src/parsers/website/service.py
@@ -19,12 +19,12 @@ def __init__(self, url, maxDepth) -> None:
         self.url = url
         self.maxDepth = maxDepth
         self.data = {
-            "Image": [],
-            "Video": [],
-            "Audio": [],
-            "PDF": [],
-            "HTML": [],
-            "Internal": [
+            "image": [],
+            "video": [],
+            "audio": [],
+            "pdf": [],
+            "html": [],
+            "internal": [
                 self.url,
             ],
         }
@@ -54,11 +54,11 @@ def isValid(self, url):
         if not bool(parsed.netloc) or not bool(parsed.scheme):
             return False

-        # Check if the same URL already exists in the Internal list
+        # Check if the same URL already exists in the internal list
         stripped_url = parsed.netloc + parsed.path
         stripped_url = stripped_url.lstrip("www.")  # remove www
         stripped_url = stripped_url.rstrip("/")  # remove backslash
-        if any(stripped_url in s for s in self.data["Internal"]):
+        if any(stripped_url in s for s in self.data["internal"]):
             return False
         return True

@@ -69,33 +69,33 @@ def extractImageUrls(self, soup, site):
             if ".png" or ".gif" or ".jpg" in url:
                 if "http" not in url:
                     url = "{}{}".format(site, url)
-                if url not in self.data["Image"]:
-                    self.data["Image"].append(url)
+                if url not in self.data["image"]:
+                    self.data["image"].append(url)

     def getData(self, url, soup):
         def addType(href):
             if (
                 ".png" in href
                 or ".gif" in href
                 or ".jpg" in href
-                and href not in self.data["Image"]
+                and href not in self.data["image"]
             ):
-                self.data["Image"].append(href)
-            elif ".html" in href and href not in self.data["HTML"]:
-                self.data["HTML"].append(href)
-            elif ".pdf" in href and href not in self.data["PDF"]:
-                self.data["PDF"].append(href)
-            elif ".mp3" in href and href not in self.data["Audio"]:
-                self.data["Audio"].append(href)
+                self.data["image"].append(href)
+            elif ".html" in href and href not in self.data["html"]:
+                self.data["html"].append(href)
+            elif ".pdf" in href and href not in self.data["pdf"]:
+                self.data["pdf"].append(href)
+            elif ".mp3" in href and href not in self.data["audio"]:
+                self.data["audio"].append(href)
             elif (
                 ".mp4" in href
                 or ".mpeg" in href
                 or ".wov" in href
                 or ".avi" in href
                 or ".mkv" in href
-                and href not in self.data["Video"]
+                and href not in self.data["video"]
             ):
-                self.data["Video"].append(href)
+                self.data["video"].append(href)

         domainName = urlparse(url).netloc
         for tag in soup.findAll("a"):
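
A note on two conditions that pass through this hunk unchanged: in extractImageUrls, `if ".png" or ".gif" or ".jpg" in url:` is always true, because it parses as (".png") or (".gif") or (".jpg" in url) and the non-empty string ".png" is truthy; the mixed and/or chains in addType have the same precedence wrinkle, since `and` binds tighter than `or`, so the dedupe check only guards the last alternative. A sketch of the presumably intended extension check (not part of this commit):

# Sketch only; the commit leaves the always-true condition in place.
# Drop-in for the body of extractImageUrls shown above.
if any(ext in url for ext in (".png", ".gif", ".jpg")):
    if "http" not in url:
        url = "{}{}".format(site, url)
    if url not in self.data["image"]:
        self.data["image"].append(url)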
Expand All @@ -109,8 +109,8 @@ def addType(href):
if not self.isValid(href):
continue
addType(href)
if domainName in href and href not in self.data["Internal"]:
self.data["Internal"].append(href)
if domainName in href and href not in self.data["internal"]:
self.data["internal"].append(href)

async def recursiveScrap(self, i):
async def getPage(url):
@@ -141,13 +141,13 @@ async def getPage(url):

         if i > self.maxDepth:
             return
-        res, soup = await getPage(self.data["Internal"][i])
+        res, soup = await getPage(self.data["internal"][i])
         if not res:
             try:
-                self.extractImageUrls(soup, self.data["Internal"][i])
+                self.extractImageUrls(soup, self.data["internal"][i])
             except:
                 pass
-        self.getData(self.data["Internal"][i], soup)
+        self.getData(self.data["internal"][i], soup)
         print(i)
         i += 1
         await self.recursiveScrap(i)
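Net effect of this file's hunks: the self.data bucket keys are normalized to lowercase ("Image" becomes "image", "Internal" becomes "internal", and so on) and every accessor is updated to match, so any external consumer of WebsiteParser.data must switch to the lowercase names too. One caveat the commit does not touch: lstrip("www.") strips leading characters drawn from the set {'w', '.'} rather than a literal prefix, so "web.example.com".lstrip("www.") yields "eb.example.com". A prefix-safe sketch, assuming Python 3.9+ for str.removeprefix (not part of this commit):

# Sketch only: strip a literal "www." prefix instead of the
# character set {'w', '.'} that str.lstrip would remove.
stripped_url = stripped_url.removeprefix("www.")  # Python 3.9+
stripped_url = stripped_url.rstrip("/")  # single-character strip, fine as-is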
