Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Pivot table #6075

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 108 additions & 0 deletions py/server/deephaven/experimental/pivot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
#
# Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending
#

"""This module defines functions for creating pivot tables."""

from typing import Sequence, Union, Optional, Callable, Any
import re
from deephaven import DHError, empty_table
from deephaven.table import Table, PartitionedTable, multi_join
from deephaven.numpy import to_numpy
from deephaven.update_graph import auto_locking_ctx
from deephaven.jcompat import to_sequence


def _is_legal_column(s: str) -> bool:
"""Check if a column name is legal.

Args:
s (str): The column name to check.

Returns:
bool: True if the column name is legal, False otherwise.
"""
return re.match("^[_a-zA-Z][_a-zA-Z0-9]*$", s) is not None


def _legalize_column(s: str) -> str:
"""Legalize a column name. Invalid characters are replaced with underscores.
The legalized column name is not guaranteed to be unique.

Args:
s (str): The column name to legalize.

Returns:
str: The legalized column name.

Raises:
ValueError: If the column name is empty.
"""
if re.match("^[_a-zA-Z][_a-zA-Z0-9]*$", s):
return s
if re.match("^[_a-zA-Z].*$", s):
return re.sub("[^_a-zA-Z0-9]", "_", s)
return "_" + re.sub("[^_a-zA-Z0-9]", "_", s)


def pivot(table: Table, row_cols: Union[str, Sequence[str]], column_col: str, value_col: str,
          value_to_col_name: Optional[Callable[[Any], str]] = None) -> Table:
    """Create a pivot table from the input table.

    The input table is partitioned by column_col. Each constituent table is reduced to the row columns plus
    value_col renamed to the output column name for that constituent's key, and the constituents are then combined
    with a multi-join on the row columns.

    NOTE: The schema of the pivot table is frozen at the time of creation. As a result, columns in the output table
    will not change after the pivot table is created. If the input table changes, the pivot table may not reflect the
    changes.

    Args:
        table (Table): The input table.
        row_cols (Union[str, Sequence[str]]): The row columns in the input table. These are the join keys of the
            output table.
        column_col (str): The column in the input table whose distinct values become the output columns.
        value_col (str): The column in the input table whose values populate the output columns.
        value_to_col_name (Optional[Callable[[Any],str]]): A function that converts a value found in column_col
            (not a value from value_col) to the name of the output column for that value.
            The function should return a string that is a valid column name.
            If None (default), a string representation of the value is used as the column name, with invalid
            characters replaced by underscores. The character replacement is not guaranteed to produce unique
            column names.

    Returns:
        Table: The pivot table. If partitioning the input table yields no keys, the result of empty_table(0) is
            returned.

    Raises:
        DHError: If value_to_col_name returns a non-string, an invalid column name, or a column name that has
            already been produced for another value, or if an error occurs while creating the pivot table.
    """
    # TODO: review feedback on this API suggested accepting multiple columns for column_col and value_col as well,
    #  similar to Excel's PIVOTBY function.
    row_cols = list(to_sequence(row_cols))
    ptable = table.partition_by(column_col)

    # Compare against None explicitly so that any supplied callable is honored regardless of its truthiness.
    if value_to_col_name is None:
        value_to_col_name = lambda x: _legalize_column(str(x))

    # Locking to ensure that the partitioned table doesn't change while creating the query
    with auto_locking_ctx(ptable):
        # TODO: this does not handle key changes in the constituent tables. It should.
        keys = ptable.keys()
        # NOTE(review): key_values is indexed below as key[0], so it is presumably a 2-D array with one row per
        # key and a single column holding the column_col value -- confirm against to_numpy.
        key_values = to_numpy(table=keys, cols=[column_col])

        # len() is used on purpose: the truthiness of a multi-element numpy array is ambiguous.
        if len(key_values) == 0:
            return empty_table(0)

        tables = []
        col_names = set()

        # Keys and constituent tables are paired positionally.
        for key, con in zip(key_values, ptable.constituent_tables):
            col_name = value_to_col_name(key[0])

            # The message is passed by keyword because the first positional parameter of DHError is the cause.
            if not isinstance(col_name, str):
                raise DHError(
                    message=f"Value does not map to a string: value={key[0]} col_name={col_name} "
                            f"col_type={type(col_name)}")

            if not _is_legal_column(col_name):
                raise DHError(message=f"Value maps to an invalid column name: value={key[0]} col_name={col_name}")

            if col_name in col_names:
                raise DHError(message=f"Value maps to a duplicate column name: value={key[0]} col_name={col_name}")

            col_names.add(col_name)
            # Keep only the join keys plus the value column, renamed to this key's output column.
            tables.append(con.view(row_cols + [f"{col_name}={value_col}"]))

        return multi_join(input=tables, on=row_cols).table()
99 changes: 99 additions & 0 deletions py/server/tests/test_pivot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#
# Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending
#

from tests.testbase import BaseTestCase
from deephaven import DHError, new_table
from deephaven.column import int_col
from deephaven.experimental.pivot import pivot
from deephaven.table import table_diff


class PivotTestCase(BaseTestCase):
    """Tests for the experimental pivot function."""

    def test_pivot_1_row(self):
        """Pivot on a single row column, with the default and a custom column namer."""
        source = new_table([
            int_col("Row", [1, 2, 3, 1, 2, 3]),
            int_col("Col", [1, 1, 1, 2, 2, 2]),
            int_col("Value", [10, 20, 30, 40, 50, 60]),
        ])

        # (subtest label, extra keyword arguments for pivot, expected output column names)
        cases = [
            ("pivot - 1 row col", {}, ("_1", "_2")),
            ("pivot - 1 row col - value_to_col_name",
             {"value_to_col_name": lambda x: f"Col_{x}"},
             ("Col_1", "Col_2")),
        ]

        for label, extra_kwargs, (first_name, second_name) in cases:
            with self.subTest(label):
                actual = pivot(source, "Row", "Col", "Value", **extra_kwargs)

                expected = new_table([
                    int_col("Row", [1, 2, 3]),
                    int_col(first_name, [10, 20, 30]),
                    int_col(second_name, [40, 50, 60]),
                ])

                self.assertEqual(table_diff(actual, expected), "")

    def test_pivot_2_row(self):
        """Pivot on two row columns, with the default and a custom column namer."""
        source = new_table([
            int_col("Row1", [1, 2, 2, 1, 2, 2]),
            int_col("Row2", [1, 2, 3, 1, 2, 3]),
            int_col("Col", [1, 1, 1, 2, 2, 2]),
            int_col("Value", [10, 20, 30, 40, 50, 60]),
        ])

        # (subtest label, extra keyword arguments for pivot, expected output column names)
        cases = [
            ("pivot - 2 row col", {}, ("_1", "_2")),
            ("pivot - 2 row col - value_to_col_name",
             {"value_to_col_name": lambda x: f"Col_{x}"},
             ("Col_1", "Col_2")),
        ]

        for label, extra_kwargs, (first_name, second_name) in cases:
            with self.subTest(label):
                actual = pivot(source, ["Row1", "Row2"], "Col", "Value", **extra_kwargs)

                expected = new_table([
                    int_col("Row1", [1, 2, 2]),
                    int_col("Row2", [1, 2, 3]),
                    int_col(first_name, [10, 20, 30]),
                    int_col(second_name, [40, 50, 60]),
                ])

                self.assertEqual(table_diff(actual, expected), "")

    def test_pivot_errors(self):
        """Column namers that return bad names must make pivot raise DHError."""
        source = new_table([
            int_col("Row", [1, 2, 3, 1, 2, 3]),
            int_col("Col", [1, 1, 1, 2, 2, 2]),
            int_col("Value", [10, 20, 30, 40, 50, 60]),
        ])

        # (subtest label, column namer passed to pivot, text expected in the error message)
        cases = [
            ("pivot - non-string col_name", lambda x: 1.0, "Value does not map to a string"),
            ("pivot - invalid col_name", lambda x: f".{x}#", "Value maps to an invalid column name"),
            ("pivot - duplicate col_name", lambda x: "Col", "Value maps to a duplicate column name"),
        ]

        for label, namer, expected_text in cases:
            with self.subTest(label):
                with self.assertRaises(DHError) as cm:
                    pivot(source, "Row", "Col", "Value", value_to_col_name=namer)
                self.assertIn(expected_text, str(cm.exception))