Skip to content

Commit

Permalink
feat: optimized performances on player profiles parsing (#210)
Browse files Browse the repository at this point in the history
  • Loading branch information
TeKrop authored Nov 5, 2024
1 parent 6620eaa commit 9a602d0
Show file tree
Hide file tree
Showing 9 changed files with 185 additions and 97 deletions.
1 change: 1 addition & 0 deletions .env.dist
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ APP_PORT=80
APP_BASE_URL=https://overfast-api.tekrop.fr
LOG_LEVEL=info
STATUS_PAGE_URL=
PROFILING=false

# Rate limiting
BLIZZARD_RATE_LIMIT_RETRY_AFTER=5
Expand Down
3 changes: 3 additions & 0 deletions app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ class Settings(BaseSettings):
# Optional, status page URL if you have any to provide
status_page_url: str | None = None

# Profiling with pyinstrument, for debug purposes
profiling: bool = False

############
# RATE LIMITING
############
Expand Down
28 changes: 27 additions & 1 deletion app/main.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
"""Project main file containing FastAPI app and routes definitions"""

from collections.abc import Callable
from contextlib import asynccontextmanager, suppress

from fastapi import FastAPI, Request
from fastapi.exceptions import ResponseValidationError
from fastapi.openapi.docs import get_redoc_html, get_swagger_ui_html
from fastapi.openapi.utils import get_openapi
from fastapi.responses import JSONResponse
from fastapi.responses import HTMLResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from starlette.exceptions import HTTPException as StarletteHTTPException

Expand All @@ -22,6 +23,10 @@
from .players.commands.update_search_data_cache import update_search_data_cache
from .roles import router as roles

# pyinstrument won't be installed on production, that's why we're checking it here
with suppress(ModuleNotFoundError):
from pyinstrument import Profiler


@asynccontextmanager
async def lifespan(_: FastAPI): # pragma: no cover
Expand Down Expand Up @@ -176,6 +181,27 @@ async def overridden_swagger():
return get_swagger_ui_html(**swagger_settings)


# In case enabled in settings, add the pyinstrument profiler middleware
if settings.profiling is True:
    logger.info("Profiling is enabled")

    @app.middleware("http")
    async def profile_request(request: Request, call_next: Callable):
        """Profile the current request with pyinstrument.

        Profiling only happens when the ``profile=true`` query parameter is
        supplied; every other request passes through untouched. When active,
        the profiling report is returned as an HTML page *instead of* the
        regular endpoint response.
        """
        # Only profile when explicitly requested with `profile=true`.
        # A bare truthiness check on the raw string would also trigger on
        # `profile=false` or `profile=0`, since any non-empty string is truthy.
        if request.query_params.get("profile", "").lower() == "true":
            # We profile the request along with all additional middlewares, by
            # interrupting the program every 1 ms and recording the entire
            # stack at that point
            with Profiler(interval=0.001, async_mode="enabled") as profiler:
                await call_next(request)

            # Return the profiling report as an HTML page in place of the
            # regular response
            return HTMLResponse(profiler.output_html())

        # Proceed without profiling
        return await call_next(request)


# Add application routers
app.include_router(heroes.router, prefix="/heroes")
app.include_router(roles.router, prefix="/roles")
Expand Down
12 changes: 6 additions & 6 deletions app/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import ClassVar

import httpx
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, SoupStrainer
from fastapi import status

from .cache_manager import CacheManager
Expand Down Expand Up @@ -137,8 +137,7 @@ class HTMLParser(APIParser):
@property
def root_tag_params(self) -> dict:
    """Returns the BeautifulSoup params kwargs, used to find the root Tag
    on the page which will be used for searching data.
    """
    # recursive=False: only match a direct child of the search root, not a
    # nested <main> deeper in the tree
    return {"name": "main", "class_": "main-content", "recursive": False}

Expand All @@ -147,9 +146,10 @@ def store_response_data(self, response: httpx.Response) -> None:
self.create_bs_tag(response.text)

def create_bs_tag(self, html_content: str) -> None:
    """Parse the given HTML and store the root tag on ``self.root_tag``.

    A ``SoupStrainer`` built from ``self.root_tag_params`` restricts lxml
    parsing to the root ``<main>`` tag only, so BeautifulSoup never builds
    the full document tree — noticeably faster on large profile pages.

    :param html_content: raw HTML document to parse
    """
    soup_strainer = SoupStrainer(**self.root_tag_params)
    # With parse_only, the strained document contains (at most) the matching
    # <main> tag; `.main` retrieves it (None if the page has no match —
    # NOTE(review): callers appear to handle/expect that case; confirm).
    self.root_tag = BeautifulSoup(
        html_content, "lxml", parse_only=soup_strainer
    ).main


class JSONParser(APIParser):
Expand Down
Loading

0 comments on commit 9a602d0

Please sign in to comment.