Unify crawlers, refactor infrastructure and structure #1

Merged
Merged 10 commits on Mar 22, 2024
60 changes: 60 additions & 0 deletions .github/workflows/crawler.yml
@@ -0,0 +1,60 @@
name: Crawlers

on: [pull_request]

jobs:
  build:
    name: Build & Push Docker Image
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Code
        uses: actions/checkout@v2
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}
          mask-aws-account-id: 'false'
      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v1
        with:
          mask-password: 'false'
      - name: Build images & push to ECR
        id: build-image
        uses: docker/build-push-action@v4
        with:
          context: ./course/module1
          file: ./course/module1/Dockerfile
          tags: |
            ${{ steps.login-ecr.outputs.registry }}/crawler:${{ github.sha }}
            ${{ steps.login-ecr.outputs.registry }}/crawler:latest
          push: true
    outputs:
      registry: ${{ steps.login-ecr.outputs.registry }}

  deploy:
    name: Deploy Crawler
    runs-on: ubuntu-latest
    needs: build
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}
      - name: Deploy Lambda Image
        id: deploy-lambda
        run: |
          echo "Updating lambda with new image version $ECR_REPOSITORY/crawler:$PROJECT_VERSION..."
          aws lambda update-function-code \
            --function-name "arn:aws:lambda:$AWS_REGION:$AWS_ACCOUNT_ID:function:crawler" \
            --image-uri $ECR_REPOSITORY/crawler:$PROJECT_VERSION
          echo "Successfully updated lambda"
        env:
          AWS_REGION: ${{ secrets.AWS_REGION }}
          ECR_REPOSITORY: ${{ needs.build.outputs.registry }}
          PROJECT_VERSION: ${{ github.sha }}
          AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
2 changes: 2 additions & 0 deletions .gitignore
@@ -149,6 +149,8 @@ dmypy.json
# pytype static type analyzer
.pytype/

.idea

# Cython debug symbols
cython_debug/

File renamed without changes.
File renamed without changes.
63 changes: 63 additions & 0 deletions course/module1/Dockerfile
@@ -0,0 +1,63 @@
FROM public.ecr.aws/lambda/python:3.11 as build

# Install chrome driver and browser
RUN yum install -y unzip && \
    curl -Lo "/tmp/chromedriver.zip" "https://chromedriver.storage.googleapis.com/114.0.5735.90/chromedriver_linux64.zip" && \
    curl -Lo "/tmp/chrome-linux.zip" "https://www.googleapis.com/download/storage/v1/b/chromium-browser-snapshots/o/Linux_x64%2F1135561%2Fchrome-linux.zip?alt=media" && \
    unzip /tmp/chromedriver.zip -d /opt/ && \
    unzip /tmp/chrome-linux.zip -d /opt/

FROM public.ecr.aws/lambda/python:3.11

# Install the function's OS dependencies using yum
RUN yum install -y \
    atk \
    wget \
    git \
    cups-libs \
    gtk3 \
    libXcomposite \
    alsa-lib \
    libXcursor \
    libXdamage \
    libXext \
    libXi \
    libXrandr \
    libXScrnSaver \
    libXtst \
    pango \
    at-spi2-atk \
    libXt \
    xorg-x11-server-Xvfb \
    xorg-x11-xauth \
    dbus-glib \
    dbus-glib-devel \
    nss \
    mesa-libgbm \
    ffmpeg \
    libxext6 \
    libssl-dev \
    libcurl4-openssl-dev \
    libpq-dev


COPY --from=build /opt/chrome-linux /opt/chrome
COPY --from=build /opt/chromedriver /opt/

COPY poetry.lock pyproject.toml ./

# Install Poetry, export dependencies to requirements.txt, install them into
# the Lambda task directory, then clean up the manifest files.
RUN python3 -m pip install --upgrade pip && pip3 install poetry
RUN poetry export -f requirements.txt > requirements.txt && \
    pip3 install --no-cache-dir -r requirements.txt --target "${LAMBDA_TASK_ROOT}" && \
    rm requirements.txt pyproject.toml poetry.lock

# Optional TLS CA bundle, needed only if you plan to store the extracted data in DocumentDB
RUN wget https://truststore.pki.rds.amazonaws.com/global/global-bundle.pem -P ${LAMBDA_TASK_ROOT}

# Copy function code
COPY . ${LAMBDA_TASK_ROOT}

# Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile)
CMD ["main.handler"]
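Note: the image's CMD points at main.handler, which is not shown in this part of the diff. Below is a minimal, hypothetical sketch of what such a unified entry point could look like, assuming a main.py that dispatches a link to one of the crawlers; the _CRAWLER_FOR_DOMAIN mapping and the event keys are illustrative assumptions, not the PR's actual code (the "user"/"link" keys mirror the Makefile test payload).

# Hypothetical sketch of main.py: a single Lambda handler routing by domain.
from urllib.parse import urlparse

from aws_lambda_powertools import Logger

from crawlers import GithubCrawler, LinkedInCrawler, MediumCrawler

logger = Logger(service="decodingml/crawler")

# Assumed mapping from domain to crawler class.
_CRAWLER_FOR_DOMAIN = {
    "github.com": GithubCrawler,
    "linkedin.com": LinkedInCrawler,
    "medium.com": MediumCrawler,
}


def handler(event, context) -> dict:
    link = event.get("link")
    user = event.get("user")
    domain = urlparse(link).netloc.removeprefix("www.")

    crawler_cls = _CRAWLER_FOR_DOMAIN.get(domain)
    if crawler_cls is None:
        return {"statusCode": 400, "body": f"No crawler registered for {domain}"}

    try:
        crawler_cls().extract(link=link, user=user)
        return {"statusCode": 200, "body": f"{domain} link processed successfully"}
    except Exception as e:
        logger.exception("Crawling failed")
        return {"statusCode": 500, "body": f"An error occurred: {str(e)}"}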
30 changes: 30 additions & 0 deletions course/module1/Makefile
@@ -0,0 +1,30 @@
help:
	@grep -E '^[a-zA-Z0-9 -]+:.*#' Makefile | sort | while read -r l; do printf "\033[1;32m$$(echo $$l | cut -f 1 -d':')\033[00m:$$(echo $$l | cut -f 2- -d'#')\n"; done

local-build: # Build the lambda crawler Docker image locally.
	docker buildx build --platform linux/amd64 -t crawler .

local-deploy: # Run the lambda crawler custom Docker image locally.
	docker run \
		-p 9000:8080 \
		--network llm-twin-course_local \
		crawler:latest

local-test: # Send a test request to the locally running lambda.
	curl -X POST "http://localhost:9000/2015-03-31/functions/function/invocations" \
		-d '{"user": "Paul Iuztin", "link": "https://medium.com/@pauliusztin/the-llms-kit-build-a-production-ready-real-time-financial-advisor-system-using-streaming-ffdcb2b50714"}'

invoke: # Invoke the deployed crawler lambda on AWS.
	aws lambda invoke \
		--function-name crawler \
		--cli-binary-format raw-in-base64-out \
		--payload '{"user": "Paul Iuztin", "link": "https://github.com/iusztinpaul/hands-on-llms"}' \
		response.json

clean: # Clean up local build and test artifacts.
	@echo "Cleaning old files..."
	rm -rf ./.pytest_cache
	rm -rf ./__pycache__
	find . -type f -name "*.pyc" -delete
	rm -rf .mypy_cache
	@echo "Done."
20 changes: 20 additions & 0 deletions course/module1/config.py
@@ -0,0 +1,20 @@
from typing import Optional

from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8")

    # MongoDB
    DATABASE_HOST: str = "mongodb://decodingml:decodingml@decodingml_mongo:27017"
    DATABASE_NAME: str = "twin"

    # LinkedIn credentials (read from .env; do not hard-code real values here)
    LINKEDIN_USERNAME: Optional[str] = None
    LINKEDIN_PASSWORD: Optional[str] = None


settings = Settings()
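Because model_config points Settings at a .env file, the database host and LinkedIn credentials can be supplied at runtime instead of being committed to the repository. A minimal sketch of how that is expected to work; the variable values shown in the comment are placeholders, not real settings.

# Illustrative only: Settings reads overrides from a local ".env" file.
# A hypothetical .env next to config.py could look like:
#
#   DATABASE_HOST=mongodb://decodingml:decodingml@decodingml_mongo:27017
#   LINKEDIN_USERNAME=your-linkedin-email@example.com
#   LINKEDIN_PASSWORD=your-password
#
from config import settings

print(settings.DATABASE_NAME)      # "twin" unless overridden in .env
print(settings.LINKEDIN_USERNAME)  # value from .env, or None if unset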
3 changes: 3 additions & 0 deletions course/module1/crawlers/__init__.py
@@ -0,0 +1,3 @@
from crawlers.github import GithubCrawler
from crawlers.linkedin import LinkedInCrawler
from crawlers.medium import MediumCrawler
63 changes: 63 additions & 0 deletions course/module1/crawlers/base.py
@@ -0,0 +1,63 @@
import time
from tempfile import mkdtemp

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

from documents import BaseDocument


class BaseCrawler:

    model: BaseDocument

    def extract(self, link: str, **kwargs):
        raise NotImplementedError("Needs implementation in subclass.")


class BaseAbstractCrawler(BaseCrawler):

    def __init__(self, scroll_limit: int = 5):
        options = webdriver.ChromeOptions()
        options.binary_location = '/opt/chrome/chrome'
        options.add_argument('--no-sandbox')
        options.add_argument('--headless=new')
        options.add_argument('--single-process')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--disable-gpu')
        options.add_argument('--log-level=3')
        options.add_argument('--disable-popup-blocking')
        options.add_argument('--disable-notifications')
        options.add_argument('--disable-dev-tools')
        options.add_argument('--ignore-certificate-errors')
        options.add_argument("--no-zygote")
        options.add_argument(f"--user-data-dir={mkdtemp()}")
        options.add_argument(f"--data-path={mkdtemp()}")
        options.add_argument(f"--disk-cache-dir={mkdtemp()}")
        options.add_argument('--remote-debugging-port=9222')

        self.set_extra_driver_options(options)

        self.scroll_limit = scroll_limit
        self.driver = webdriver.Chrome(
            service=webdriver.ChromeService("/opt/chromedriver"), options=options
        )

    def set_extra_driver_options(self, options: Options) -> None:
        pass

    def login(self):
        pass

    def scroll_page(self):
        """Scroll through the page until the scroll limit or the bottom is reached."""
        current_scroll = 0
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        while True:
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height or (self.scroll_limit and current_scroll >= self.scroll_limit):
                break
            last_height = new_height
            current_scroll += 1
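For orientation, a concrete crawler only needs to set model and implement extract(), optionally overriding set_extra_driver_options() and login(). The snippet below is a minimal, hypothetical usage sketch of that contract; DummyCrawler and the example link are illustrative and not part of this PR.

# Hypothetical subclass showing the contract defined by the base classes above.
from crawlers.base import BaseAbstractCrawler
from documents import ArticleDocument


class DummyCrawler(BaseAbstractCrawler):
    model = ArticleDocument

    def extract(self, link: str, **kwargs):
        self.driver.get(link)
        self.scroll_page()
        # ... parse self.driver.page_source and persist via self.model(...).save()
        self.driver.close()


if __name__ == "__main__":
    crawler = DummyCrawler(scroll_limit=3)
    crawler.login()  # no-op unless the subclass overrides it
    crawler.extract(link="https://example.com/some-article", user="Alex")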
29 changes: 4 additions & 25 deletions module1/crawlers/github.py → course/module1/crawlers/github.py
@@ -3,9 +3,13 @@
import subprocess
import tempfile

from aws_lambda_powertools import Logger

from crawlers.base import BaseCrawler
from documents import RepositoryDocument

logger = Logger(service="decodingml/crawler")


class GithubCrawler(BaseCrawler):

@@ -46,28 +50,3 @@ def extract(self, link: str, **kwargs):
            raise
        finally:
            shutil.rmtree(local_temp)


def handler(event, context):
    # Extract the necessary information from the event object
    link = os.getenv("repository_link")
    user = os.getenv("user")

    # Instantiate the GithubCrawler
    crawler = GithubCrawler()

    try:
        # Use the crawler to extract data from the repository
        crawler.extract(link=link, user=user)

        return {"statusCode": 200, "body": "Repository processed successfully"}

    except Exception as e:
        # Handle exceptions
        return {"statusCode": 500, "body": f"An error occurred: {str(e)}"}


# Example of Usage
if __name__ == "__main__":
    crawler = GithubCrawler()
    crawler.extract(link="git@github.com:decodingml/llm-twin-course.git", user="Alex")
module1/crawlers/linkedin.py → course/module1/crawlers/linkedin.py
@@ -1,26 +1,25 @@
import os
import time
from typing import Dict, List

from aws_lambda_powertools import Logger
from bs4 import BeautifulSoup
from bs4.element import Tag
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

from config import settings
from crawlers.base import BaseAbstractCrawler
from documents import PostDocument
from errors import ImproperlyConfigured

logger = Logger(service="decodingml/crawler")


class LinkedInCrawler(BaseAbstractCrawler):

    model = PostDocument

    def set_driver_options(self) -> Options:
        options = Options()
    def set_extra_driver_options(self, options):
        options.add_experimental_option("detach", True)
        return options

    def extract(self, link: str, **kwargs):
        print(f"Starting to scrape data for profile: {link}")
@@ -131,26 +130,3 @@ def login(self):
        self.driver.find_element(By.CSS_SELECTOR, ".login__form_action_container button").click()


def handler(event, context):
    # Extract the necessary information from the event object
    link = os.getenv("repository_link")
    user = os.getenv("user")

    # Instantiate the LinkedInCrawler
    crawler = LinkedInCrawler()

    try:
        # Use the crawler to extract data from the profile
        crawler.extract(link=link, user=user)

        return {"statusCode": 200, "body": "Repository processed successfully"}

    except Exception as e:
        # Handle exceptions
        return {"statusCode": 500, "body": f"An error occurred: {str(e)}"}


if __name__ == "__main__":
    crawler = LinkedInCrawler()
    crawler.extract(link="https://www.linkedin.com/in/pauliusztin/", user="Alex")

40 changes: 40 additions & 0 deletions course/module1/crawlers/medium.py
@@ -0,0 +1,40 @@
from aws_lambda_powertools import Logger
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By

from crawlers.base import BaseAbstractCrawler
from documents import ArticleDocument

logger = Logger(service="decodingml/crawler")


class MediumCrawler(BaseAbstractCrawler):

    model = ArticleDocument

    def set_extra_driver_options(self, options):
        options.add_argument(r"--profile-directory=Profile 2")

    def extract(self, link: str, **kwargs):
        self.driver.get(link)
        self.scroll_page()

        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        title = soup.find_all("h1", class_="pw-post-title")
        subtitle = soup.find_all("h2", class_="pw-subtitle-paragraph")

        data = {
            "Title": title[0].string if title else None,
            "Subtitle": subtitle[0].string if subtitle else None,
            "Content": soup.get_text(),
        }

        logger.info(f"Successfully scraped and saved articles for user {link}")
        self.driver.close()
        instance = self.model(platform="medium", content=data, link=link, author_id=kwargs.get("user"))
        instance.save()

    def login(self):
        """Log in to Medium with Google"""
        self.driver.get("https://medium.com/m/signin")
        self.driver.find_element(By.TAG_NAME, "a").click()
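The crawlers persist results through document models (BaseDocument, RepositoryDocument, PostDocument, ArticleDocument) whose definitions are not part of this section of the diff. Below is a rough sketch of the shape implied by the calls above, assuming pydantic models with a Mongo-backed save(); the field set, the collection naming, and the module-level client are assumptions inferred from usage, not the PR's actual documents.py.

# Hypothetical sketch of documents.py, inferred from how the crawlers use it.
from typing import Optional

from pydantic import BaseModel
from pymongo import MongoClient

from config import settings

_client = MongoClient(settings.DATABASE_HOST)
_db = _client[settings.DATABASE_NAME]


class BaseDocument(BaseModel):
    platform: str
    content: dict
    link: str
    author_id: Optional[str] = None

    def save(self) -> None:
        # Collection name derived from the subclass, e.g. "articledocument".
        _db[self.__class__.__name__.lower()].insert_one(self.model_dump())


class RepositoryDocument(BaseDocument):
    pass


class PostDocument(BaseDocument):
    pass


class ArticleDocument(BaseDocument):
    pass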
File renamed without changes.