Skip to content

Commit

Permalink
updated in response to PR feedback
Browse files Browse the repository at this point in the history
  • Loading branch information
stress-tess committed Sep 18, 2024
1 parent 5eee07a commit fc5f4ea
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 16 deletions.
8 changes: 4 additions & 4 deletions arkouda/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ class DataFrameGroupBy:
If True the grouped values of the aggregation keys will be treated as an index.
"""

def __init__(self, gb, df, gb_key_names=None, as_index=True, dropna=True):
def __init__(self, gb, df, gb_key_names=None, as_index=True):

self.gb = gb
self.df = df
Expand All @@ -115,11 +115,11 @@ def __init__(self, gb, df, gb_key_names=None, as_index=True, dropna=True):
for attr in ["nkeys", "permutation", "unique_keys", "segments"]:
setattr(self, attr, getattr(gb, attr))

self.dropna = dropna
self.dropna = self.gb.dropna
self.where_not_nan = None
self.all_non_nan = False

if dropna:
if self.dropna:
from arkouda import all as ak_all
from arkouda import isnan

Expand Down Expand Up @@ -2714,7 +2714,7 @@ def GroupBy(self, keys, use_series=False, as_index=True, dropna=True):

gb = akGroupBy(cols, dropna=dropna)
if use_series:
gb = DataFrameGroupBy(gb, self, gb_key_names=keys, as_index=as_index, dropna=dropna)
gb = DataFrameGroupBy(gb, self, gb_key_names=keys, as_index=as_index)
return gb

def memory_usage(self, index=True, unit="B") -> Series:
Expand Down
26 changes: 14 additions & 12 deletions tests/dataframe_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -650,16 +650,17 @@ def test_gb_aggregations_example_numeric_types(self, agg):
pd_result = getattr(pd_df.groupby(group_on), agg)()
assert_frame_equal(ak_result.to_pandas(retain_index=True), pd_result)

@pytest.mark.parametrize("dropna", [True, False])
@pytest.mark.parametrize("agg", ["count", "max", "mean", "median", "min", "std", "sum", "var"])
def test_gb_aggregations_with_nans(self, agg):
def test_gb_aggregations_with_nans(self, agg, dropna):
df = self.build_ak_df_with_nans()
# @TODO handle bool columns correctly
df.drop("bools", axis=1, inplace=True)
pd_df = df.to_pandas()

group_on = ["key1", "key2"]
ak_result = getattr(df.groupby(group_on), agg)()
pd_result = getattr(pd_df.groupby(group_on, as_index=False), agg)()
ak_result = getattr(df.groupby(group_on, dropna=dropna), agg)()
pd_result = getattr(pd_df.groupby(group_on, as_index=False, dropna=dropna), agg)()
assert_frame_equal(ak_result.to_pandas(retain_index=True), pd_result)

# TODO aggregations of string columns not currently supported (even for count)
Expand All @@ -668,27 +669,28 @@ def test_gb_aggregations_with_nans(self, agg):
pd_df = df.to_pandas()

group_on = ["nums1", "nums2"]
ak_result = getattr(df.groupby(group_on), agg)()
pd_result = getattr(pd_df.groupby(group_on, as_index=False), agg)()
ak_result = getattr(df.groupby(group_on, dropna=dropna), agg)()
pd_result = getattr(pd_df.groupby(group_on, as_index=False, dropna=dropna), agg)()
assert_frame_equal(ak_result.to_pandas(retain_index=True), pd_result)

# TODO aggregation mishandling NaN see issue #3765
df.drop("nums2", axis=1, inplace=True)
pd_df = df.to_pandas()
group_on = "nums1"
ak_result = getattr(df.groupby(group_on), agg)()
pd_result = getattr(pd_df.groupby(group_on), agg)()
ak_result = getattr(df.groupby(group_on, dropna=dropna), agg)()
pd_result = getattr(pd_df.groupby(group_on, dropna=dropna), agg)()
assert_frame_equal(ak_result.to_pandas(retain_index=True), pd_result)

def test_count_nan_bug(self):
@pytest.mark.parametrize("dropna", [True, False])
def test_count_nan_bug(self, dropna):
# verify reproducer for #3762 is fixed
df = ak.DataFrame({"A": [1, 2, 2, np.nan], "B": [3, 4, 5, 6], "C": [1, np.nan, 2, 3]})
ak_result = df.groupby("A").count()
pd_result = df.to_pandas().groupby("A").count()
ak_result = df.groupby("A", dropna=dropna).count()
pd_result = df.to_pandas().groupby("A", dropna=dropna).count()
assert_frame_equal(ak_result.to_pandas(retain_index=True), pd_result)

ak_result = df.groupby(["A", "C"], as_index=False).count()
pd_result = df.to_pandas().groupby(["A", "C"], as_index=False).count()
ak_result = df.groupby(["A", "C"], as_index=False, dropna=dropna).count()
pd_result = df.to_pandas().groupby(["A", "C"], as_index=False, dropna=dropna).count()
assert_frame_equal(ak_result.to_pandas(retain_index=True), pd_result)

def test_gb_aggregations_return_dataframe(self):
Expand Down

0 comments on commit fc5f4ea

Please sign in to comment.