fix counts_by_id sorting for pandas (#79)

Nixtla · Apr 19, 2024 · 22a670f
1 parent f85c87c
commit 22a670f
Show file tree

Hide file tree

Showing 5 changed files with 15 additions and 14 deletions.
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -54,10 +54,9 @@ jobs:
         uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
-          cache: 'pip'
 
       - name: Install the library
-        run: pip install ".[dev]"
+        run: pip install uv && uv pip install ".[dev]" --system
 
       - name: Run tests
         run: nbdev_test --do_print --timing --flags 'datasets matplotlib polars pyarrow scipy'

diff --git a/nbs/processing.ipynb b/nbs/processing.ipynb
@@ -126,10 +126,11 @@
     "#| export\n",
     "def counts_by_id(df: DataFrame, id_col: str) -> DataFrame:\n",
     "    if isinstance(df, pd.DataFrame):\n",
-    "        id_counts = df.groupby(id_col, observed=True).size()\n",
-    "        if not id_counts.index.is_monotonic_increasing:\n",
-    "            id_counts = id_counts.sort_index()\n",
-    "        id_counts = id_counts.reset_index()\n",
+    "        id_counts = df.groupby(id_col, observed=True).size().reset_index()\n",
+    "        # sort using numpy to prevent pandas from sorting categoricals by their codes\n",
+    "        # and to ensure a consistent sort order\n",
+    "        sort_idxs = id_counts[id_col].to_numpy().argsort()\n",
+    "        id_counts = id_counts.iloc[sort_idxs].reset_index(drop=True)\n",
     "    else:\n",
     "        id_counts = df[id_col].value_counts().sort(id_col)\n",
     "    id_counts.columns = [id_col, 'counts']\n",
@@ -173,7 +174,7 @@
     "    if ids_are_sorted and times_are_sorted:\n",
     "        return None\n",
     "    if isinstance(df, pd.DataFrame):\n",
-    "        sort_idxs = pd.MultiIndex.from_arrays([ids, times]).argsort()\n",
+    "        sort_idxs = np.lexsort((times, ids))\n",
     "    else:\n",
     "        sort_idxs = df.select(\n",
     "            pl.arg_sort_by([id_col, time_col])\n",

diff --git a/settings.ini b/settings.ini
@@ -1,7 +1,7 @@
 [DEFAULT]
 repo = utilsforecast
 lib_name = utilsforecast
-version = 0.1.4
+version = 0.1.5
 min_python = 3.8
 license = apache2
 black_formatting = True

diff --git a/utilsforecast/__init__.py b/utilsforecast/__init__.py
@@ -1 +1 @@
-__version__ = "0.1.4"
+__version__ = "0.1.5"
diff --git a/utilsforecast/processing.py b/utilsforecast/processing.py
@@ -57,10 +57,11 @@ def to_numpy(df: DataFrame) -> np.ndarray:
 # %% ../nbs/processing.ipynb 7
 def counts_by_id(df: DataFrame, id_col: str) -> DataFrame:
     if isinstance(df, pd.DataFrame):
-        id_counts = df.groupby(id_col, observed=True).size()
-        if not id_counts.index.is_monotonic_increasing:
-            id_counts = id_counts.sort_index()
-        id_counts = id_counts.reset_index()
+        id_counts = df.groupby(id_col, observed=True).size().reset_index()
+        # sort using numpy to prevent pandas from sorting categoricals by their codes
+        # and to ensure a consistent sort order
+        sort_idxs = id_counts[id_col].to_numpy().argsort()
+        id_counts = id_counts.iloc[sort_idxs].reset_index(drop=True)
     else:
         id_counts = df[id_col].value_counts().sort(id_col)
     id_counts.columns = [id_col, "counts"]
@@ -96,7 +97,7 @@ def maybe_compute_sort_indices(
     if ids_are_sorted and times_are_sorted:
         return None
     if isinstance(df, pd.DataFrame):
-        sort_idxs = pd.MultiIndex.from_arrays([ids, times]).argsort()
+        sort_idxs = np.lexsort((times, ids))
     else:
         sort_idxs = (
             df.select(pl.arg_sort_by([id_col, time_col])).to_series(0).to_numpy()