Skip to content

Commit

Permalink
feat: support dask and spark dataframes in evaluate (#121)
Browse files Browse the repository at this point in the history
  • Loading branch information
jmoralez authored Sep 13, 2024
1 parent 2d016ed commit 5dbd994
Show file tree
Hide file tree
Showing 12 changed files with 435 additions and 74 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build-docs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ jobs:
set -ux
python -m pip install --upgrade pip
pip install -Uq nbdev
pip install -e ".[dev]"
pip install ".[dev]" "fugue[dask,spark]>=0.8.1"
mkdir nbs/_extensions
cp -r docs-scripts/mintlify/ nbs/_extensions/
python docs-scripts/update-quarto.py
Expand Down
35 changes: 13 additions & 22 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,56 +7,48 @@ on:
branches: [main]
workflow_dispatch:

defaults:
run:
shell: bash -l {0}

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
all-tests:
runs-on: ${{ matrix.os }}
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
os: [macos-latest, ubuntu-latest]
python-version: ['3.8', '3.9', '3.10', '3.11']
python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
steps:
- name: Clone repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7

- name: Set up environment
uses: mamba-org/setup-micromamba@f8b8a1e23a26f60a44c853292711bacfd3eac822 # v1.9.0
- uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
with:
environment-file: environment.yml
create-args: python=${{ matrix.python-version }}
cache-environment: true
python-version: ${{ matrix.python-version }}

- name: Install the library
run: pip install ./
run: pip install uv && uv pip install --system ".[dev]" "fugue[dask,spark]>=0.8.1"

- name: Run tests
run: nbdev_test --do_print --timing --flags 'matplotlib polars pyarrow scipy'
run: nbdev_test --do_print --timing --flags 'datasets distributed matplotlib polars pyarrow scipy'

windows-tests:
runs-on: windows-latest
local-tests:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
python-version: ['3.8', '3.9', '3.10', '3.11']
os: [macos-latest, windows-latest]
python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
steps:
- name: Clone repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7

- name: Set up environment
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
- uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
with:
python-version: ${{ matrix.python-version }}

- name: Install the library
run: pip install uv && uv pip install ".[dev]" --system
run: pip install uv && uv pip install --system ".[dev]"

- name: Run tests
run: nbdev_test --do_print --timing --flags 'datasets matplotlib polars pyarrow scipy'
Expand All @@ -75,11 +67,10 @@ jobs:
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
with:
python-version: '3.10'
cache: 'pip'

- name: Install dependencies
shell: bash
run: pip3 install . nbdev
run: pip install . nbdev

- name: Run tests
shell: bash
Expand Down
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@ repos:
hooks:
- id: mypy
args: [--ignore-missing-imports]
exclude: 'setup.py'
1 change: 1 addition & 0 deletions action_files/clean_nbs
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
nbdev_clean
./action_files/remove_logs_cells
30 changes: 30 additions & 0 deletions action_files/remove_logs_cells
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env python3
import re
from pathlib import Path
from nbdev.clean import process_write

# Patterns that identify volatile notebook output which should be stripped
# before committing, so reruns don't produce spurious diffs:
# dotted-quad IPv4 addresses (e.g. distributed scheduler/worker addresses) ...
IP_REGEX = re.compile(r'[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}')
# ... and HH:MM:SS timestamps embedded in log lines.
HOURS_REGEX = re.compile(r'\d{2}:\d{2}:\d{2}')

def cell_contains_ips(cell):
    """Return True if any text-output line of ``cell`` contains an IPv4
    address, an HH:MM:SS timestamp, or a '[LightGBM]' log marker.

    ``cell`` is a notebook cell dict; cells without an 'outputs' list never
    match. Outputs lacking a 'text' field (e.g. rich 'data' payloads) are
    skipped so that later text outputs in the same cell are still inspected
    (previously the scan aborted on the first non-text output).
    """
    for output in cell.get('outputs', []):
        if 'text' not in output:
            # Non-text output can't match; keep scanning the remaining outputs.
            continue
        for line in output['text']:
            if IP_REGEX.search(line) or HOURS_REGEX.search(line) or '[LightGBM]' in line:
                return True
    return False


def clean_nb(nb):
    """Drop the outputs of every cell in ``nb`` flagged by ``cell_contains_ips``."""
    flagged = (c for c in nb['cells'] if cell_contains_ips(c))
    for cell in flagged:
        cell['outputs'] = []


if __name__ == '__main__':
    # Rewrite every notebook under <repo_root>/nbs in place, stripping
    # volatile outputs via clean_nb.
    nbs_dir = Path(__file__).parents[1] / 'nbs'
    for notebook in nbs_dir.glob('*.ipynb'):
        process_write(warn_msg='Failed to clean_nb', proc_nb=clean_nb, f_in=notebook)
24 changes: 23 additions & 1 deletion nbs/compat.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,30 @@
" return f(*args, **kwargs)\n",
" return wrapper\n",
"\n",
"try:\n",
" from dask.dataframe import DataFrame as DaskDataFrame\n",
"except ModuleNotFoundError:\n",
" pass\n",
"\n",
"try:\n",
" from pyspark.sql import DataFrame as SparkDataFrame\n",
"except ModuleNotFoundError:\n",
" pass\n",
"\n",
"DataFrame = Union[pd.DataFrame, pl_DataFrame]\n",
"Series = Union[pd.Series, pl_Series]"
"Series = Union[pd.Series, pl_Series]\n",
"DistributedDFType = TypeVar(\n",
" \"DistributedDFType\",\n",
" \"DaskDataFrame\",\n",
" \"SparkDataFrame\",\n",
")\n",
"AnyDFType = TypeVar(\n",
" \"AnyDFType\",\n",
" \"DaskDataFrame\",\n",
" pd.DataFrame,\n",
" \"pl_DataFrame\",\n",
" \"SparkDataFrame\",\n",
")"
]
}
],
Expand Down
Loading

0 comments on commit 5dbd994

Please sign in to comment.