Skip to content

Commit

Permalink
Merge pull request #28 from lincc-frameworks/count_nested
Browse files Browse the repository at this point in the history
implement count_nested
  • Loading branch information
dougbrn authored Apr 12, 2024
2 parents d63cee9 + 48c83de commit 39acb91
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/nested_pandas/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .utils import * # noqa
38 changes: 38 additions & 0 deletions src/nested_pandas/utils/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import pandas as pd

from nested_pandas import NestedFrame


def count_nested(df, nested, by=None, join=True) -> NestedFrame:
    """Count the rows of a nested dataframe for each row of ``df``.

    Parameters
    ----------
    df: NestedFrame
        A NestedFrame that contains the desired `nested` series
        to count.
    nested: 'str'
        The label of the nested series to count.
    by: 'str', optional
        Specifies a column within nested to count by, returning
        a count for each unique value in `by`. When None (the default),
        a single total count per row is produced.
    join: bool, optional
        Join the output count columns to df and return df, otherwise
        just return a NestedFrame containing only the count columns.

    Returns
    -------
    NestedFrame
        With ``by=None`` the counts live in a column named ``n_<nested>``;
        otherwise there is one column per unique value of `by`, named
        ``n_<nested>_<value>``.
    """

    # Look the nested column up by the caller-supplied label; indexing with
    # the literal "nested" only worked when the column happened to have
    # exactly that name.
    nested_col = df[nested]

    if by is None:
        counts = nested_col.apply(len).rename(f"n_{nested}")
    else:
        # Each per-row value_counts() Series becomes one row of the result,
        # with one column per distinct value of `by`.
        counts = nested_col.apply(lambda x: x[by].value_counts())
        counts = counts.rename(columns={colname: f"n_{nested}_{colname}" for colname in counts.columns})

    if join:
        return df.join(counts)

    # Otherwise return only the counts.
    if isinstance(counts, pd.Series):  # for by=None, which returns a Series
        counts = NestedFrame(counts.to_frame())
    return counts
39 changes: 39 additions & 0 deletions tests/nested_pandas/utils/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import numpy as np
import pandas as pd
import pytest
from nested_pandas import NestedFrame
from nested_pandas.utils import count_nested


@pytest.mark.parametrize("join", [True, False])
def test_count_nested(join):
    """Test the functionality of count nested, with and without joining to the base frame."""

    # Initialize test data. Use np.nan: the np.NaN alias was removed in
    # NumPy 2.0 and raises AttributeError there.
    base = NestedFrame(data={"a": [1, 2, 3], "b": [2, np.nan, 6]}, index=[0, 1, 2])
    nested = pd.DataFrame(
        data={
            "c": [0, 2, 4, 1, np.nan, 3, 1, 4, 1],
            "d": [5, 4, 7, 5, 3, 1, 9, 3, 4],
            "label": ["a", "a", "b", "b", "a", "a", "b", "a", "b"],
        },
        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
    )
    base = base.add_nested(nested, "nested")

    # Test general count: every base row owns three nested rows
    total_counts = count_nested(base, "nested", join=join)
    assert all(total_counts["n_nested"].values == 3)

    # Test count by: one output column per unique value of "label"
    label_counts = count_nested(base, "nested", by="label", join=join)
    assert all(label_counts["n_nested_a"].values == [2, 2, 1])
    assert all(label_counts["n_nested_b"].values == [1, 1, 2])

    # Test join behavior
    if join:
        # Count columns are appended after the existing base columns
        assert total_counts.columns.tolist() == base.columns.tolist() + ["n_nested"]
        assert label_counts.columns.tolist() == base.columns.tolist() + ["n_nested_a", "n_nested_b"]
    else:
        # Only the count columns are returned
        assert total_counts.columns.tolist() == ["n_nested"]
        assert label_counts.columns.tolist() == ["n_nested_a", "n_nested_b"]

0 comments on commit 39acb91

Please sign in to comment.