Skip to content

Commit

Permalink
fix counts_by_id sorting for pandas (#79)
Browse files Browse the repository at this point in the history
  • Loading branch information
jmoralez authored Apr 19, 2024
1 parent f85c87c commit 22a670f
Show file tree
Hide file tree
Showing 5 changed files with 15 additions and 14 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,9 @@ jobs:
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
cache: 'pip'

- name: Install the library
run: pip install ".[dev]"
run: pip install uv && uv pip install ".[dev]" --system

- name: Run tests
run: nbdev_test --do_print --timing --flags 'datasets matplotlib polars pyarrow scipy'
Expand Down
11 changes: 6 additions & 5 deletions nbs/processing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,11 @@
"#| export\n",
"def counts_by_id(df: DataFrame, id_col: str) -> DataFrame:\n",
" if isinstance(df, pd.DataFrame):\n",
" id_counts = df.groupby(id_col, observed=True).size()\n",
" if not id_counts.index.is_monotonic_increasing:\n",
" id_counts = id_counts.sort_index()\n",
" id_counts = id_counts.reset_index()\n",
" id_counts = df.groupby(id_col, observed=True).size().reset_index()\n",
" # sort using numpy to prevent pandas from sorting categoricals by their codes\n",
" # and to ensure a consistent sort order\n",
" sort_idxs = id_counts[id_col].to_numpy().argsort()\n",
" id_counts = id_counts.iloc[sort_idxs].reset_index(drop=True)\n",
" else:\n",
" id_counts = df[id_col].value_counts().sort(id_col)\n",
" id_counts.columns = [id_col, 'counts']\n",
Expand Down Expand Up @@ -173,7 +174,7 @@
" if ids_are_sorted and times_are_sorted:\n",
" return None\n",
" if isinstance(df, pd.DataFrame):\n",
" sort_idxs = pd.MultiIndex.from_arrays([ids, times]).argsort()\n",
" sort_idxs = np.lexsort((times, ids))\n",
" else:\n",
" sort_idxs = df.select(\n",
" pl.arg_sort_by([id_col, time_col])\n",
Expand Down
2 changes: 1 addition & 1 deletion settings.ini
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[DEFAULT]
repo = utilsforecast
lib_name = utilsforecast
version = 0.1.4
version = 0.1.5
min_python = 3.8
license = apache2
black_formatting = True
Expand Down
2 changes: 1 addition & 1 deletion utilsforecast/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.1.4"
__version__ = "0.1.5"
11 changes: 6 additions & 5 deletions utilsforecast/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,11 @@ def to_numpy(df: DataFrame) -> np.ndarray:
# %% ../nbs/processing.ipynb 7
def counts_by_id(df: DataFrame, id_col: str) -> DataFrame:
if isinstance(df, pd.DataFrame):
id_counts = df.groupby(id_col, observed=True).size()
if not id_counts.index.is_monotonic_increasing:
id_counts = id_counts.sort_index()
id_counts = id_counts.reset_index()
id_counts = df.groupby(id_col, observed=True).size().reset_index()
# sort using numpy to prevent pandas from sorting categoricals by their codes
# and to ensure a consistent sort order
sort_idxs = id_counts[id_col].to_numpy().argsort()
id_counts = id_counts.iloc[sort_idxs].reset_index(drop=True)
else:
id_counts = df[id_col].value_counts().sort(id_col)
id_counts.columns = [id_col, "counts"]
Expand Down Expand Up @@ -96,7 +97,7 @@ def maybe_compute_sort_indices(
if ids_are_sorted and times_are_sorted:
return None
if isinstance(df, pd.DataFrame):
sort_idxs = pd.MultiIndex.from_arrays([ids, times]).argsort()
sort_idxs = np.lexsort((times, ids))
else:
sort_idxs = (
df.select(pl.arg_sort_by([id_col, time_col])).to_series(0).to_numpy()
Expand Down

0 comments on commit 22a670f

Please sign in to comment.