Initial commit

reglab · Sep 6, 2024 · b69a6c4 · b69a6c4
commit b69a6c4
Show file tree

Hide file tree

Showing 10 changed files with 827 additions and 0 deletions.
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
@@ -0,0 +1,16 @@
+name: pre-commit
+
+on:
+  pull_request:
+  push:
+    branches: [main]
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - uses: pre-commit/action@v3.0.0
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,15 @@
+.idea/
+.vscode/
+
+data/
+.env
+
+.venv/
+venv/
+node_modules/
+
+.DS_Store
+.ipynb_checkpoints/
+__pycache__/
+*.egg-info
+*.swp
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,36 @@
+default_language_version:
+  python: python3.10
+repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    # Ruff version.
+    rev: v0.4.7
+    hooks:
+      - id: ruff
+        types_or: [python, pyi, jupyter]
+        args: [--fix]
+      - id: ruff-format
+        types_or: [python, pyi, jupyter]
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v2.3.0
+    hooks:
+      - id: check-yaml
+      - id: end-of-file-fixer
+      - id: trailing-whitespace
+  - repo: https://github.com/roy-ht/pre-commit-jupyter
+    rev: v1.2.1
+    hooks:
+      - id: jupyter-notebook-cleanup
+        args:
+          # - --remove-kernel-metadata
+          - --pin-patterns
+          - "[pin];[donotremove]"
+  - repo: https://github.com/pre-commit/mirrors-prettier
+    rev: v3.1.0
+    hooks:
+      - id: prettier
+        types_or:
+          - javascript
+          - ts
+          - tsx
+          - yaml
+          - css
diff --git a/README.md b/README.md
@@ -0,0 +1,86 @@
+# Better Python Code
+
+MyWord is a small demonstration project that shows how you can write better research code in Python.
+
+## Setup
+
+This project requires Python 3.10 or later. It has been tested on Python 3.10, but later versions should work as well.
+First, clone this repo:
+
+```bash
+git clone https://github.com/reglab/better-python.git myword
+```
+
+Next, install uv if you haven't already:
+
+```bash
+curl -LsSf https://astral.sh/uv/install.sh | sh
+```
+
+Then, create a virtual environment:
+
+```bash
+cd myword
+uv venv  # optionally add --python 3.11 or another version
+```
+
+To activate the virtual environment:
+
+```bash
+source .venv/bin/activate # If using fish shell, use `source .venv/bin/activate.fish` instead
+```
+
+Generally, you will want to activate the virtual environment before running any of the scripts
+in this project. However, if you use `uv`, it can handle everything for you! Just run code with
+`uv run python <script.py>`. (You can also run anything else installed to your venv, e.g.,
+`uv run ruff check --fix`.)
+
+Using `uv run` will also automatically sync your environment with any new/removed packages
+in your `pyproject.toml` file. (See the Development section below, or the `uv` docs, for
+information about installing/maintaining dependencies.) You can read the `uv` documentation
+[here](https://docs.astral.sh/uv/getting-started/features/#projects).
+
+Then, install the dependencies and this package:
+
+```bash
+brew install tesseract  # If not on Mac, look up how to install Tesseract for your OS
+
+uv sync
+```
+
+Finally, install the git hooks:
+
+```bash
+# If you don't already have pre-commit, run: `uv tool install pre-commit`
+pre-commit install
+```
+
+## Usage
+
+To run the download script:
+
+```bash
+python myword/download_data.py
+```
+
+To run the "train" script:
+
+```bash
+python myword/train_word2vec.py
+```
+
+With each of these, you can add the `--help` flag to see the available options.
+
+
+## Development
+
+To add/remove dependencies later on:
+```bash
+uv add <package>  # or uv remove <package>
+uv add --dev <package>  # For dev dependencies, like pre-commit
+```
+
+## Activity
+
+The `main` branch contains the "final" version of the code, while the `initial` branch
+contains the code before the caching optimization.
diff --git a/myword/__init__.py b/myword/__init__.py
diff --git a/myword/download_data.py b/myword/download_data.py
@@ -0,0 +1,38 @@
+import zipfile
+from pathlib import Path
+
+import fire
+import gdown
+
+import myword.utils
+
+# Google Drive share link used as the default `zip_url` of `download_gdrive_folder`.
+_DEFAULT_ZIP_URL = "https://drive.google.com/file/d/1ho19xLFmgKWWi0xCoKEPV_maKwrhl51k/view?usp=drive_link"
+# Default `output_dir` of `download_gdrive_folder`: the `pdf` folder inside the data directory.
+_DEFAULT_OUTPUT_DIR = myword.utils.get_data_path() / "pdf"
+
+
+def download_gdrive_folder(
+    zip_url: str = _DEFAULT_ZIP_URL,
+    output_dir: str = _DEFAULT_OUTPUT_DIR,
+) -> None:
+    """Download a zip file from Google Drive and extract it to a local directory.
+
+    :param zip_url: URL to the folder to download.
+    :param output_dir: Local directorty to save the folder to.
+    """
+    output_dir = Path(output_dir)
+    # Name the zip file after the output directory and place it in the same parent directory.
+    zip_output_path = output_dir.parent / f"{output_dir.name}.zip"
+    gdown.download(zip_url, str(zip_output_path), quiet=False, fuzzy=True)
+
+    # Unzip the folder
+    print(f"Unzipping {zip_output_path} to {output_dir}...")
+    with zipfile.ZipFile(zip_output_path, "r") as zip_ref:
+        # We extract to the output's parent directory to avoid creating a redundant nested directory.
+        zip_ref.extractall(output_dir.parent)
+    # Delete the zip file
+    print(f"Extraction completed. Deleting {zip_output_path}...")
+    zip_output_path.unlink()
+
+
+if __name__ == "__main__":
+    # Expose `download_gdrive_folder` as a command-line interface via `fire`.
+    fire.Fire(download_gdrive_folder)
diff --git a/myword/train_word2vec.py b/myword/train_word2vec.py
@@ -0,0 +1,76 @@
+"""Train word2vec model from a directory of PDFs"""
+
+import time
+from pathlib import Path
+
+import fire
+import pdf2image
+import pytesseract
+import tqdm
+from diskcache import Cache
+
+import myword.utils
+
+# This translates to the `data/pdf` directory in the repository root.
+PDF_DIR = myword.utils.get_data_path() / "pdf"
+
+# Cache of the text extracted from each PDF, stored under `data/cache/pdf_text`.
+pdf_text_cache = Cache(str(myword.utils.get_data_path() / "cache" / "pdf_text"))
+
+
+def get_text_windows_from_pdf(pdf_path: Path, window_size) -> list[list[str]]:
+    if pdf_path in pdf_text_cache:
+        text = pdf_text_cache[pdf_path]
+    else:
+        images = pdf2image.convert_from_path(pdf_path, dpi=200)
+        page_texts = [pytesseract.image_to_string(img) for img in images]
+        text = "\n".join(page_texts)
+        pdf_text_cache[pdf_path] = text
+
+    words = text.lower().split()
+    windows = [words[i : i + window_size] for i in range(len(words) - window_size + 1)]
+    return windows
+
+
+def train_model(
+    word_windows: list[list[str]],
+    output_path: str = "word2vec.model",
+    *,
+    vector_dim: int = 300,
+    num_epochs: int = 10,
+) -> None:
+    # A dummy function to simulate training a word2vec model.
+    for _ in tqdm.trange(num_epochs, desc="Epochs"):
+        time.sleep(len(word_windows) * vector_dim * 1e-7)
+    print(f"Training complete. Saved model to {output_path}.")
+
+
+def main(
+    pdf_dir: str = str(PDF_DIR),
+    output_path: str = "word2vec.model",
+    vector_dim: int = 300,
+    window_size: int = 5,
+    num_epochs: int = 10,
+) -> None:
+    pdf_dir = Path(pdf_dir)
+    pdf_files = list(pdf_dir.glob("*.pdf"))
+    print(f"Found {len(pdf_files)} PDFs in {pdf_dir}.")
+    word_windows = []
+    for pdf_file in tqdm.tqdm(pdf_files, desc="PDFs"):
+        word_windows.extend(
+            get_text_windows_from_pdf(pdf_file, window_size=window_size),
+        )
+    print(f"Training word2vec model with {len(word_windows)} word windows...")
+    train_model(
+        word_windows,
+        output_path=output_path,
+        vector_dim=vector_dim,
+        num_epochs=num_epochs,
+    )
+
+
+def entrypoint():
+    fire.Fire(main)
+
+
+if __name__ == "__main__":
+    # Running this file directly behaves like calling `entrypoint()`.
+    entrypoint()
diff --git a/myword/utils.py b/myword/utils.py
@@ -0,0 +1,5 @@
+from pathlib import Path
+
+
+def get_data_path() -> Path:
+    return Path(__file__).parent.parent / "data"
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,51 @@
+[project]
+name = "myword"
+version = "0.1.0"
+authors = [
+    { name = "Faiz Surani", email = "faiz@law.stanford.edu" },
+]
+description = "A demo project for writing better research code in Python."
+readme = "README.md"
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+]
+requires-python = ">=3.10"
+dependencies = [
+    "diskcache",
+    "fire",
+    "gdown",
+    "pdf2image",
+    "pillow",
+    "pytesseract",
+    "tqdm",
+]
+
+[tool.uv]
+dev-dependencies = [
+    "pre-commit",
+    "ruff",
+    "snakeviz",
+]
+
+[project.scripts]
+train_vectors = "myword.train_word2vec:entrypoint"
+
+[project.urls]
+homepage = "https://github.com/reglab/better-python"
+
+[build-system]
+requires = ["setuptools>=42", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools]
+packages = ["myword"]
+
+[tool.ruff]
+line-length = 88
+
+[tool.ruff.lint]
+extend-select = ["B", "E", "I", "N", "PTH", "COM", "C4", "UP"]
+extend-ignore = ["COM812", "E712", "E501"]