Commit 79b985e: no message
esteininger committed Mar 17, 2024
1 parent d8cbfd0 commit 79b985e
Showing 13 changed files with 32 additions and 48 deletions.
28 changes: 6 additions & 22 deletions src/api/parse/controller.py
@@ -1,29 +1,13 @@
 from fastapi import APIRouter, HTTPException, Body, Depends, Request, Path

-from .model import ParseFileRequest, SupportedModalities
+from .model import ParseFileRequest
 from .service import ParseHandler

 router = APIRouter()


-@router.post("/{modality}")
-async def parse_file(
-    request: Request,
-    request_body: ParseFileRequest,
-    modality: SupportedModalities = Path(...),
-):
-    parse_handler = ParseHandler(request.index_id)
-
-    if len(request_body.file_urls) == 0:
-        raise HTTPException(status_code=400, detail="No file_urls provided")
-
-    if len(request_body.file_urls) > 1:
-        raise HTTPException(
-            status_code=400, detail="Multiple file_urls are not supported yet"
-        )
-    try:
-        return await parse_handler.run_handler_once(
-            modality=modality, file_url=request_body.file_urls[0]
-        )
-    except Exception as e:
-        raise HTTPException(status_code=400, detail=str(e))
+# @router.post("/{modality}")
+# async def parse_file(
+#     request: Request,
+#     request_body: ParseFileRequest
+# ):
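
The route is commented out rather than deleted, and the request model (see the model.py diff below) now carries a single URL. For orientation only, here is a hypothetical sketch of how the handler might look if re-enabled against the new model; it reuses ParseHandler.run_handler_once and request.index_id from the removed code, and the SupportedModalities import this commit removes would need restoring.

# Hypothetical re-enabled handler; NOT part of this commit, which
# leaves the route commented out.
from fastapi import APIRouter, HTTPException, Request, Path

from .model import ParseFileRequest, SupportedModalities  # import removed by this commit
from .service import ParseHandler

router = APIRouter()


@router.post("/{modality}")
async def parse_file(
    request: Request,
    request_body: ParseFileRequest,
    modality: SupportedModalities = Path(...),
):
    # index_id was attached to the request in the removed code,
    # presumably by upstream middleware.
    parse_handler = ParseHandler(request.index_id)
    try:
        # ParseFileRequest now carries a single file_url, so the old
        # empty-list and multiple-URL checks are no longer needed.
        return await parse_handler.run_handler_once(
            modality=modality, file_url=request_body.file_url
        )
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))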
2 changes: 1 addition & 1 deletion src/api/parse/model.py
@@ -11,4 +11,4 @@ class SupportedModalities(str, Enum):


 class ParseFileRequest(BaseModel):
-    file_urls: Union[str, List[str]]
+    file_url: str
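
A parse request now supplies exactly one URL string instead of a string-or-list union. Illustrative usage, not taken from the diff:

from .model import ParseFileRequest

# Illustrative only: the tightened model accepts a single URL string.
ParseFileRequest(file_url="https://example.com/doc.pdf")     # valid
ParseFileRequest(file_urls=["https://example.com/doc.pdf"])  # raises ValidationError: file_url is missing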
10 files renamed without changes.
50 changes: 25 additions & 25 deletions src/parsers/website/service.py
@@ -19,12 +19,12 @@ def __init__(self, url, maxDepth) -> None:
         self.url = url
         self.maxDepth = maxDepth
         self.data = {
-            "Image": [],
-            "Video": [],
-            "Audio": [],
-            "PDF": [],
-            "HTML": [],
-            "Internal": [
+            "image": [],
+            "video": [],
+            "audio": [],
+            "pdf": [],
+            "html": [],
+            "internal": [
                 self.url,
             ],
         }
@@ -54,11 +54,11 @@ def isValid(self, url):
         if not bool(parsed.netloc) or not bool(parsed.scheme):
             return False

-        # Check if the same URL already exists in the Internal list
+        # Check if the same URL already exists in the internal list
         stripped_url = parsed.netloc + parsed.path
         stripped_url = stripped_url.lstrip("www.")  # remove www
         stripped_url = stripped_url.rstrip("/")  # remove backslash
-        if any(stripped_url in s for s in self.data["Internal"]):
+        if any(stripped_url in s for s in self.data["internal"]):
             return False
         return True

@@ -69,33 +69,33 @@ def extractImageUrls(self, soup, site):
             if ".png" or ".gif" or ".jpg" in url:
                 if "http" not in url:
                     url = "{}{}".format(site, url)
-                if url not in self.data["Image"]:
-                    self.data["Image"].append(url)
+                if url not in self.data["image"]:
+                    self.data["image"].append(url)

     def getData(self, url, soup):
         def addType(href):
             if (
                 ".png" in href
                 or ".gif" in href
                 or ".jpg" in href
-                and href not in self.data["Image"]
+                and href not in self.data["image"]
             ):
-                self.data["Image"].append(href)
-            elif ".html" in href and href not in self.data["HTML"]:
-                self.data["HTML"].append(href)
-            elif ".pdf" in href and href not in self.data["PDF"]:
-                self.data["PDF"].append(href)
-            elif ".mp3" in href and href not in self.data["Audio"]:
-                self.data["Audio"].append(href)
+                self.data["image"].append(href)
+            elif ".html" in href and href not in self.data["html"]:
+                self.data["html"].append(href)
+            elif ".pdf" in href and href not in self.data["pdf"]:
+                self.data["pdf"].append(href)
+            elif ".mp3" in href and href not in self.data["audio"]:
+                self.data["audio"].append(href)
             elif (
                 ".mp4" in href
                 or ".mpeg" in href
                 or ".wov" in href
                 or ".avi" in href
                 or ".mkv" in href
-                and href not in self.data["Video"]
+                and href not in self.data["video"]
             ):
-                self.data["Video"].append(href)
+                self.data["video"].append(href)

         domainName = urlparse(url).netloc
         for tag in soup.findAll("a"):
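
A note on two conditions that pass through this hunk unchanged: in extractImageUrls, `if ".png" or ".gif" or ".jpg" in url:` is always true, because it parses as (".png") or (".gif") or (".jpg" in url) and the non-empty string ".png" is truthy; the mixed and/or chains in addType have the same precedence wrinkle, since `and` binds tighter than `or`, so the dedupe check only guards the last alternative. A sketch of the presumably intended extension check (not part of this commit):

# Sketch only; the commit leaves the always-true condition in place.
# Drop-in for the body of extractImageUrls shown above.
if any(ext in url for ext in (".png", ".gif", ".jpg")):
    if "http" not in url:
        url = "{}{}".format(site, url)
    if url not in self.data["image"]:
        self.data["image"].append(url)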
Expand All @@ -109,8 +109,8 @@ def addType(href):
if not self.isValid(href):
continue
addType(href)
if domainName in href and href not in self.data["Internal"]:
self.data["Internal"].append(href)
if domainName in href and href not in self.data["internal"]:
self.data["internal"].append(href)

async def recursiveScrap(self, i):
async def getPage(url):
@@ -141,13 +141,13 @@ async def getPage(url):

         if i > self.maxDepth:
             return
-        res, soup = await getPage(self.data["Internal"][i])
+        res, soup = await getPage(self.data["internal"][i])
         if not res:
             try:
-                self.extractImageUrls(soup, self.data["Internal"][i])
+                self.extractImageUrls(soup, self.data["internal"][i])
             except:
                 pass
-        self.getData(self.data["Internal"][i], soup)
+        self.getData(self.data["internal"][i], soup)
         print(i)
         i += 1
         await self.recursiveScrap(i)
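Net effect of this file's hunks: the self.data bucket keys are normalized to lowercase ("Image" becomes "image", "Internal" becomes "internal", and so on) and every accessor is updated to match, so any external consumer of WebsiteParser.data must switch to the lowercase names too. One caveat the commit does not touch: lstrip("www.") strips leading characters drawn from the set {'w', '.'} rather than a literal prefix, so "web.example.com".lstrip("www.") yields "eb.example.com". A prefix-safe sketch, assuming Python 3.9+ for str.removeprefix (not part of this commit):

# Sketch only: strip a literal "www." prefix instead of the
# character set {'w', '.'} that str.lstrip would remove.
stripped_url = stripped_url.removeprefix("www.")  # Python 3.9+
stripped_url = stripped_url.rstrip("/")  # single-character strip, fine as-is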
