Unify crawlers, refactor infrastructure and structure #1

Merged
Merged 10 commits on Mar 22, 2024
60 changes: 60 additions & 0 deletions .github/workflows/crawler.yml
@@ -0,0 +1,60 @@
name: Crawlers

on: [pull_request]

jobs:
  build:
    name: Build & Push Docker Image
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Code
        uses: actions/checkout@v2
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}
          mask-aws-account-id: 'false'
      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v1
        with:
          mask-password: 'false'
      - name: Build images & push to ECR
        id: build-image
        uses: docker/build-push-action@v4
        with:
          context: ./course/module1
          file: ./course/module1/Dockerfile
          tags: |
            ${{ steps.login-ecr.outputs.registry }}/crawler:${{ github.sha }}
            ${{ steps.login-ecr.outputs.registry }}/crawler:latest
          push: true
    outputs:
      registry: ${{ steps.login-ecr.outputs.registry }}

  deploy:
    name: Deploy Crawler
    runs-on: ubuntu-latest
    needs: build
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}
      - name: Deploy Lambda Image
        id: deploy-lambda
        run: |
          echo "Updating lambda with new image version $ECR_REPOSITORY/crawler:$PROJECT_VERSION..."
          aws lambda update-function-code \
            --function-name "arn:aws:lambda:$AWS_REGION:$AWS_ACCOUNT_ID:function:crawler" \
            --image-uri $ECR_REPOSITORY/crawler:$PROJECT_VERSION
          echo "Successfully updated lambda"
        env:
          AWS_REGION: ${{ secrets.AWS_REGION }}
          ECR_REPOSITORY: ${{ needs.build.outputs.registry }}
          PROJECT_VERSION: ${{ github.sha }}
          AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
2 changes: 2 additions & 0 deletions .gitignore
@@ -149,6 +149,8 @@ dmypy.json
# pytype static type analyzer
.pytype/

.idea

# Cython debug symbols
cython_debug/

File renamed without changes.
File renamed without changes.
63 changes: 63 additions & 0 deletions course/module1/Dockerfile
@@ -0,0 +1,63 @@
FROM public.ecr.aws/lambda/python:3.11 as build

# Install chrome driver and browser
RUN yum install -y unzip && \
    curl -Lo "/tmp/chromedriver.zip" "https://chromedriver.storage.googleapis.com/114.0.5735.90/chromedriver_linux64.zip" && \
    curl -Lo "/tmp/chrome-linux.zip" "https://www.googleapis.com/download/storage/v1/b/chromium-browser-snapshots/o/Linux_x64%2F1135561%2Fchrome-linux.zip?alt=media" && \
    unzip /tmp/chromedriver.zip -d /opt/ && \
    unzip /tmp/chrome-linux.zip -d /opt/

FROM public.ecr.aws/lambda/python:3.11

# Install the function's OS dependencies using yum
RUN yum install -y \
    atk \
    wget \
    git \
    cups-libs \
    gtk3 \
    libXcomposite \
    alsa-lib \
    libXcursor \
    libXdamage \
    libXext \
    libXi \
    libXrandr \
    libXScrnSaver \
    libXtst \
    pango \
    at-spi2-atk \
    libXt \
    xorg-x11-server-Xvfb \
    xorg-x11-xauth \
    dbus-glib \
    dbus-glib-devel \
    nss \
    mesa-libgbm \
    ffmpeg \
    libxext6 \
    libssl-dev \
    libcurl4-openssl-dev \
    libpq-dev


COPY --from=build /opt/chrome-linux /opt/chrome
COPY --from=build /opt/chromedriver /opt/

COPY poetry.lock pyproject.toml ./

# Install Poetry, export dependencies to requirements.txt, install them into
# the Lambda task directory, then clean up the manifest files.
RUN python3 -m pip install --upgrade pip && pip3 install poetry
RUN poetry export -f requirements.txt > requirements.txt && \
    pip3 install --no-cache-dir -r requirements.txt --target "${LAMBDA_TASK_ROOT}" && \
    rm requirements.txt pyproject.toml poetry.lock

# Optional TLS CA bundle, needed only if you plan to store the extracted data in DocumentDB
RUN wget https://truststore.pki.rds.amazonaws.com/global/global-bundle.pem -P ${LAMBDA_TASK_ROOT}

# Copy function code
COPY . ${LAMBDA_TASK_ROOT}

# Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile)
CMD ["main.handler"]
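Note: the image's CMD points at main.handler, which is not shown in this part of the diff. Below is a minimal, hypothetical sketch of what such a unified entry point could look like, assuming a main.py that dispatches a link to one of the crawlers; the _CRAWLER_FOR_DOMAIN mapping and the event keys are illustrative assumptions, not the PR's actual code (the "user"/"link" keys mirror the Makefile test payload).

# Hypothetical sketch of main.py: a single Lambda handler routing by domain.
from urllib.parse import urlparse

from aws_lambda_powertools import Logger

from crawlers import GithubCrawler, LinkedInCrawler, MediumCrawler

logger = Logger(service="decodingml/crawler")

# Assumed mapping from domain to crawler class.
_CRAWLER_FOR_DOMAIN = {
    "github.com": GithubCrawler,
    "linkedin.com": LinkedInCrawler,
    "medium.com": MediumCrawler,
}


def handler(event, context) -> dict:
    link = event.get("link")
    user = event.get("user")
    domain = urlparse(link).netloc.removeprefix("www.")

    crawler_cls = _CRAWLER_FOR_DOMAIN.get(domain)
    if crawler_cls is None:
        return {"statusCode": 400, "body": f"No crawler registered for {domain}"}

    try:
        crawler_cls().extract(link=link, user=user)
        return {"statusCode": 200, "body": f"{domain} link processed successfully"}
    except Exception as e:
        logger.exception("Crawling failed")
        return {"statusCode": 500, "body": f"An error occurred: {str(e)}"}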
30 changes: 30 additions & 0 deletions course/module1/Makefile
@@ -0,0 +1,30 @@
help:
	@grep -E '^[a-zA-Z0-9 -]+:.*#' Makefile | sort | while read -r l; do printf "\033[1;32m$$(echo $$l | cut -f 1 -d':')\033[00m:$$(echo $$l | cut -f 2- -d'#')\n"; done

local-build: # Build the lambda crawler Docker image locally.
	docker buildx build --platform linux/amd64 -t crawler .

local-deploy: # Run the lambda crawler custom Docker image locally.
	docker run \
		-p 9000:8080 \
		--network llm-twin-course_local \
		crawler:latest

local-test: # Send a test request to the locally running lambda.
	curl -X POST "http://localhost:9000/2015-03-31/functions/function/invocations" \
		-d '{"user": "Paul Iuztin", "link": "https://medium.com/@pauliusztin/the-llms-kit-build-a-production-ready-real-time-financial-advisor-system-using-streaming-ffdcb2b50714"}'

invoke: # Invoke the deployed crawler lambda on AWS.
	aws lambda invoke \
		--function-name crawler \
		--cli-binary-format raw-in-base64-out \
		--payload '{"user": "Paul Iuztin", "link": "https://github.com/iusztinpaul/hands-on-llms"}' \
		response.json

clean: # Clean up local build and test artifacts.
	@echo "Cleaning old files..."
	rm -rf ./.pytest_cache
	rm -rf ./__pycache__
	find . -type f -name "*.pyc" -delete
	rm -rf .mypy_cache
	@echo "Done."
20 changes: 20 additions & 0 deletions course/module1/config.py
@@ -0,0 +1,20 @@
from typing import Optional

from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8")

    # MongoDB
    DATABASE_HOST: str = "mongodb://decodingml:decodingml@decodingml_mongo:27017"
    DATABASE_NAME: str = "twin"

    # LinkedIn credentials (read from .env; do not hard-code real values here)
    LINKEDIN_USERNAME: Optional[str] = None
    LINKEDIN_PASSWORD: Optional[str] = None


settings = Settings()
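Because model_config points Settings at a .env file, the database host and LinkedIn credentials can be supplied at runtime instead of being committed to the repository. A minimal sketch of how that is expected to work; the variable values shown in the comment are placeholders, not real settings.

# Illustrative only: Settings reads overrides from a local ".env" file.
# A hypothetical .env next to config.py could look like:
#
#   DATABASE_HOST=mongodb://decodingml:decodingml@decodingml_mongo:27017
#   LINKEDIN_USERNAME=your-linkedin-email@example.com
#   LINKEDIN_PASSWORD=your-password
#
from config import settings

print(settings.DATABASE_NAME)      # "twin" unless overridden in .env
print(settings.LINKEDIN_USERNAME)  # value from .env, or None if unset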
3 changes: 3 additions & 0 deletions course/module1/crawlers/__init__.py
@@ -0,0 +1,3 @@
from crawlers.github import GithubCrawler
from crawlers.linkedin import LinkedInCrawler
from crawlers.medium import MediumCrawler
63 changes: 63 additions & 0 deletions course/module1/crawlers/base.py
@@ -0,0 +1,63 @@
import time
from tempfile import mkdtemp

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

from documents import BaseDocument


class BaseCrawler:

    model: BaseDocument

    def extract(self, link: str, **kwargs):
        raise NotImplementedError("Needs implementation in subclass.")


class BaseAbstractCrawler(BaseCrawler):

    def __init__(self, scroll_limit: int = 5):
        options = webdriver.ChromeOptions()
        options.binary_location = '/opt/chrome/chrome'
        options.add_argument('--no-sandbox')
        options.add_argument('--headless=new')
        options.add_argument('--single-process')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--disable-gpu')
        options.add_argument('--log-level=3')
        options.add_argument('--disable-popup-blocking')
        options.add_argument('--disable-notifications')
        options.add_argument('--disable-dev-tools')
        options.add_argument('--ignore-certificate-errors')
        options.add_argument("--no-zygote")
        options.add_argument(f"--user-data-dir={mkdtemp()}")
        options.add_argument(f"--data-path={mkdtemp()}")
        options.add_argument(f"--disk-cache-dir={mkdtemp()}")
        options.add_argument('--remote-debugging-port=9222')

        self.set_extra_driver_options(options)

        self.scroll_limit = scroll_limit
        self.driver = webdriver.Chrome(
            service=webdriver.ChromeService("/opt/chromedriver"), options=options
        )

    def set_extra_driver_options(self, options: Options) -> None:
        pass

    def login(self):
        pass

    def scroll_page(self):
        """Scroll through the page until the scroll limit or the bottom is reached."""
        current_scroll = 0
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        while True:
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height or (self.scroll_limit and current_scroll >= self.scroll_limit):
                break
            last_height = new_height
            current_scroll += 1
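For orientation, a concrete crawler only needs to set model and implement extract(), optionally overriding set_extra_driver_options() and login(). The snippet below is a minimal, hypothetical usage sketch of that contract; DummyCrawler and the example link are illustrative and not part of this PR.

# Hypothetical subclass showing the contract defined by the base classes above.
from crawlers.base import BaseAbstractCrawler
from documents import ArticleDocument


class DummyCrawler(BaseAbstractCrawler):
    model = ArticleDocument

    def extract(self, link: str, **kwargs):
        self.driver.get(link)
        self.scroll_page()
        # ... parse self.driver.page_source and persist via self.model(...).save()
        self.driver.close()


if __name__ == "__main__":
    crawler = DummyCrawler(scroll_limit=3)
    crawler.login()  # no-op unless the subclass overrides it
    crawler.extract(link="https://example.com/some-article", user="Alex")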
29 changes: 4 additions & 25 deletions module1/crawlers/github.py → course/module1/crawlers/github.py
@@ -3,9 +3,13 @@
import subprocess
import tempfile

from aws_lambda_powertools import Logger

from crawlers.base import BaseCrawler
from documents import RepositoryDocument

logger = Logger(service="decodingml/crawler")


class GithubCrawler(BaseCrawler):

@@ -46,28 +50,3 @@ def extract(self, link: str, **kwargs):
            raise
        finally:
            shutil.rmtree(local_temp)


def handler(event, context):
    # Extract the necessary information from the event object
    link = os.getenv("repository_link")
    user = os.getenv("user")

    # Instantiate the GithubCrawler
    crawler = GithubCrawler()

    try:
        # Use the crawler to extract data from the repository
        crawler.extract(link=link, user=user)

        return {"statusCode": 200, "body": "Repository processed successfully"}

    except Exception as e:
        # Handle exceptions
        return {"statusCode": 500, "body": f"An error occurred: {str(e)}"}


# Example of Usage
if __name__ == "__main__":
    crawler = GithubCrawler()
    crawler.extract(link="git@github.com:decodingml/llm-twin-course.git", user="Alex")
module1/crawlers/linkedin.py → course/module1/crawlers/linkedin.py
@@ -1,26 +1,25 @@
import os
import time
from typing import Dict, List

from aws_lambda_powertools import Logger
from bs4 import BeautifulSoup
from bs4.element import Tag
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

from config import settings
from crawlers.base import BaseAbstractCrawler
from documents import PostDocument
from errors import ImproperlyConfigured

logger = Logger(service="decodingml/crawler")


class LinkedInCrawler(BaseAbstractCrawler):

    model = PostDocument

    def set_driver_options(self) -> Options:
        options = Options()
    def set_extra_driver_options(self, options):
        options.add_experimental_option("detach", True)
        return options

    def extract(self, link: str, **kwargs):
        print(f"Starting to scrape data for profile: {link}")
@@ -131,26 +130,3 @@ def login(self):
        self.driver.find_element(By.CSS_SELECTOR, ".login__form_action_container button").click()


def handler(event, context):
    # Extract the necessary information from the event object
    link = os.getenv("repository_link")
    user = os.getenv("user")

    # Instantiate the LinkedInCrawler
    crawler = LinkedInCrawler()

    try:
        # Use the crawler to extract data from the profile
        crawler.extract(link=link, user=user)

        return {"statusCode": 200, "body": "Repository processed successfully"}

    except Exception as e:
        # Handle exceptions
        return {"statusCode": 500, "body": f"An error occurred: {str(e)}"}


if __name__ == "__main__":
    crawler = LinkedInCrawler()
    crawler.extract(link="https://www.linkedin.com/in/pauliusztin/", user="Alex")

40 changes: 40 additions & 0 deletions course/module1/crawlers/medium.py
@@ -0,0 +1,40 @@
from aws_lambda_powertools import Logger
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By

from crawlers.base import BaseAbstractCrawler
from documents import ArticleDocument

logger = Logger(service="decodingml/crawler")


class MediumCrawler(BaseAbstractCrawler):

    model = ArticleDocument

    def set_extra_driver_options(self, options):
        options.add_argument(r"--profile-directory=Profile 2")

    def extract(self, link: str, **kwargs):
        self.driver.get(link)
        self.scroll_page()

        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        title = soup.find_all("h1", class_="pw-post-title")
        subtitle = soup.find_all("h2", class_="pw-subtitle-paragraph")

        data = {
            "Title": title[0].string if title else None,
            "Subtitle": subtitle[0].string if subtitle else None,
            "Content": soup.get_text(),
        }

        logger.info(f"Successfully scraped and saved articles for user {link}")
        self.driver.close()
        instance = self.model(platform="medium", content=data, link=link, author_id=kwargs.get("user"))
        instance.save()

    def login(self):
        """Log in to Medium with Google"""
        self.driver.get("https://medium.com/m/signin")
        self.driver.find_element(By.TAG_NAME, "a").click()
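The crawlers persist results through document models (BaseDocument, RepositoryDocument, PostDocument, ArticleDocument) whose definitions are not part of this section of the diff. Below is a rough sketch of the shape implied by the calls above, assuming pydantic models with a Mongo-backed save(); the field set, the collection naming, and the module-level client are assumptions inferred from usage, not the PR's actual documents.py.

# Hypothetical sketch of documents.py, inferred from how the crawlers use it.
from typing import Optional

from pydantic import BaseModel
from pymongo import MongoClient

from config import settings

_client = MongoClient(settings.DATABASE_HOST)
_db = _client[settings.DATABASE_NAME]


class BaseDocument(BaseModel):
    platform: str
    content: dict
    link: str
    author_id: Optional[str] = None

    def save(self) -> None:
        # Collection name derived from the subclass, e.g. "articledocument".
        _db[self.__class__.__name__.lower()].insert_one(self.model_dump())


class RepositoryDocument(BaseDocument):
    pass


class PostDocument(BaseDocument):
    pass


class ArticleDocument(BaseDocument):
    pass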
File renamed without changes.