Skip to content

Commit

Permalink
Merge pull request #160 from realratchet/master
Browse files Browse the repository at this point in the history
Groupby api update
  • Loading branch information
realratchet authored Apr 5, 2024
2 parents 222d900 + 121af91 commit 27f39c6
Show file tree
Hide file tree
Showing 6 changed files with 345 additions and 134 deletions.
137 changes: 34 additions & 103 deletions nimlite/funcs/groupby.nim
Original file line number Diff line number Diff line change
Expand Up @@ -3,27 +3,27 @@ import std/[math, tables, strutils, strformat, sequtils, enumerate, sugar, optio
import ../[pytypes, numpy, pymodules, nimpyext]
import ./imputation

# Aggregations that groupby can apply to a column.
# Exported because the exported `str2Accumulator` returns this type.
type Accumulator* = enum
  Max, Min, Sum, First, Last, Product, Count,
  CountUnique, Average, StandardDeviation, Median, Mode

proc str2Accumulator*(str: string): Accumulator =
  ## Converts an accumulator name into the matching `Accumulator` member.
  ##
  ## The name is lower-cased before matching, so lookup is
  ## case-insensitive. Both the short spellings (`"avg"`, `"stdev"`,
  ## `"count_unique"`) and the lower-cased enum member names
  ## (`"average"`, `"standarddeviation"`, `"countunique"`) are accepted.
  ##
  ## Raises `ValueError` when the name matches no accumulator; the
  ## message contains the name exactly as it was passed in.
  case str.toLower()
  of "max": Max
  of "min": Min
  of "sum": Sum
  of "first": First
  of "last": Last
  of "product": Product
  of "count": Count
  # String `case` compares exact strings, so the underscore form has
  # to be listed next to the plain one.
  of "countunique", "count_unique": CountUnique
  of "avg", "average": Average
  of "stdev", "standarddeviation": StandardDeviation
  of "median": Median
  of "mode": Mode
  else:
    raise newException(
      ValueError, &"Unrecognized groupby accumulator - {str}.")

# =============================================================
type GroupByFunction = ref object of RootObj
Expand Down Expand Up @@ -446,33 +446,21 @@ method value*(self: GroupByMode): Option[PY_ObjectND] =
# =============================================================

proc getGroupByFunction(acc: Accumulator): GroupByFunction =
  ## Returns the `GroupByFunction` produced by the constructor that
  ## corresponds to `acc`.
  ##
  ## There is deliberately no `else` branch: the `case` names every
  ## `Accumulator` member, so adding a member without a constructor
  ## here becomes a compile-time error instead of a runtime one.
  case acc
  of Max: newGroupbyMax()
  of Min: newGroupbyMin()
  of Sum: newGroupBySum()
  of Product: newGroupByProduct()
  of First: newGroupByFirst()
  of Last: newGroupByLast()
  of Count: newGroupByCount()
  of CountUnique: newGroupByCountUnique()
  of Average: newGroupByAverage()
  of StandardDeviation: newGroupByStandardDeviation()
  of Median: newGroupByMedian()
  of Mode: newGroupByMode()

proc getPages(indices: seq[seq[PY_ObjectND]], columnIndex: int): seq[nimpy.PyObject] =
let
Expand Down Expand Up @@ -642,7 +630,8 @@ proc groupby*(T: nimpy.PyObject, keys: seq[string], functions: seq[(string, Accu
var columnsPaths: OrderedTable[string, seq[string]] = collect(initOrderedTable()):
for cn in columnNames:
{cn: tabliteBase.collectPages(T[cn])}
var pbar = tqdm!(desc: &"groupby", total: len(columnsPaths[toSeq(columnsPaths.keys)[0]]))
var TqdmClass = if tqdm.isNone: m.tqdm.classes.TqdmClass else: tqdm
var pbar = TqdmClass!(desc: &"groupby", total: len(columnsPaths[toSeq(columnsPaths.keys)[0]]))
var aggregationFuncs = initOrderedTable[seq[PY_ObjectND], seq[(string, GroupByFunction)]]()
for pagesZipped in pageZipper(columnsPaths):
for row in iteratePages(pagesZipped):
Expand Down Expand Up @@ -691,61 +680,3 @@ proc groupby*(T: nimpy.PyObject, keys: seq[string], functions: seq[(string, Accu
newTable[cn] = column
discard pbar.close()
return newTable

# when appType != "lib":
# modules().tablite.modules.config.classes.Config.PAGE_SIZE = 1
# let columns = modules().builtins.classes.DictClass!()


# columns["A"] = @[nimValueToPy(nil), nimValueToPy(2), nimValueToPy(2), nimValueToPy(4), nimValueToPy(nil)]
# columns["B"] = @[nimValueToPy(2), nimValueToPy(3), nimValueToPy(4), nimValueToPy(7), nimValueToPy(6)]

# columns["a"] = @[0, 1, 2, 3, 4, 0, 1, 2, 3, 4]
# columns["b"] = @[0, 1, 2, 3, 4, 0, 1, 2, 3, 4]
# columns["c"] = @[0, 1, 2, 3, 4, 0, 1, 2, 3, 4]
# columns["d"] = @[0, 1, 2, 3, 4, 0, 1, 2, 3, 4]
# columns["e"] = @[0, 1, 2, 3, 4, 0, 1, 2, 3, 4]
# columns["f"] = @[1, 4, 5, 10, 13, 1, 4, 7, 10, 13]
# columns["g"] = @[0, 1, 8, 27, 64, 0, 1, 8, 27, 64]

# columns["a"] = @[1, 1, 1, 1, 1, 1]
# columns["b"] = @[-2, -1, 0, 1, 2, 3]

# let table = modules().tablite.classes.TableClass!(columns = columns)

# discard table.show()

# var r = table.groupby(keys = @["A"], functions = @[]) # None, 2, 4
# var r = table.groupby(keys = @["A", "B"], functions = @[]) # just like original
# var r = table.groupby(keys = @["A", "B"], functions = @[("A", Accumulator.Min)]) # Min(A) None, 2, 2, 4, None
# var r = table.groupby(keys = @["A", "B"], functions = @[("A", Accumulator.Max)]) # Max(A) None, 2, 2, 4, None
# var r = table.groupby(keys = @["A"], functions = @[("B", Accumulator.Sum)]) # 8, 7, 7
# var r = table.groupby(keys = @["A"], functions = @[("B", Accumulator.Product)]) # 12, 12, 7
# var r = table.groupby(keys = @["A"], functions = @[("B", Accumulator.First)]) # 2, 3, 7
# var r = table.groupby(keys = @["A"], functions = @[("B", Accumulator.Last)]) # 6, 4, 7
# var r = table.groupby(keys = @["A"], functions = @[("B", Accumulator.Count)]) # 2, 2, 1
# var r = table.groupby(keys = @["A"], functions = @[("B", Accumulator.CountUnique)]) # 2, 2, 1
# var r = table.groupby(keys = @["A"], functions = @[("B", Accumulator.Average)]) # 4, 3.5, 7
# var r = table.groupby(keys = @["A"], functions = @[("B", Accumulator.StandardDeviation)]) # 2.8284, 0.7071, 0.0
# var r = table.groupby(keys = @["A"], functions = @[("B", Accumulator.Median)]) # 4, 3.5, 7
# var r = table.groupby(keys = @["A"], functions = @[("B", Accumulator.Mode)]) # 6, 4, 7

# var r = table.groupby(keys = @["a", "b"], functions = @[
# ("f", Accumulator.Max),
# ("f", Accumulator.Min),
# ("f", Accumulator.Sum),
# ("f", Accumulator.Product),
# ("f", Accumulator.First),
# ("f", Accumulator.Last),
# ("f", Accumulator.Count),
# ("f", Accumulator.CountUnique),
# ("f", Accumulator.Average),
# ("f", Accumulator.StandardDeviation),
# ("a", Accumulator.StandardDeviation),
# ("f", Accumulator.Median),
# ("f", Accumulator.Mode),
# ("g", Accumulator.Median),
# ])

# var r = table.groupby(keys = @["a"], functions = @[("b", Accumulator.Max)])
# discard r.show()
3 changes: 2 additions & 1 deletion nimlite/funcs/imputation.nim
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,8 @@ proc nearestNeighbourImputation*(T: nimpy.PyObject, sources: seq[string],
var
missingValsCounts = collect: (for v in missing_value_index.values(): len(v))
totalSteps = sum(missingValsCounts)
pbar = tqdm!(desc: &"imputation.nearest_neighbour", total: totalSteps)
TqdmClass = if tqdm.isNone: m.tqdm.classes.TqdmClass else: tqdm
pbar = TqdmClass!(desc: &"imputation.nearest_neighbour", total: totalSteps)
ranks: seq[PY_ObjectND] = @[]
newOrder = initTable[seq[int], seq[PY_ObjectND]]()

Expand Down
24 changes: 12 additions & 12 deletions tablite/groupby_utils.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
class GroupBy(object):
    """String identifiers of the aggregations accepted by groupby and pivot.

    Each value is spelled like the corresponding member of the
    ``Accumulator`` enum in ``nimlite/funcs/groupby.nim``. ``pivot`` embeds
    the value verbatim in generated column names, e.g. ``"Sum(col)"``.
    """

    max = "Max"
    min = "Min"
    sum = "Sum"
    product = "Product"
    first = "First"
    last = "Last"
    count = "Count"
    count_unique = "CountUnique"
    avg = "Average"
    stdev = "StandardDeviation"
    median = "Median"
    mode = "Mode"
19 changes: 2 additions & 17 deletions tablite/pivots.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,6 @@

from tqdm import tqdm as _tqdm


def acc2Name(acc: str) -> str:
    """Translate a lower-case accumulator key into its display name.

    Args:
        acc: accumulator key such as ``"max"``, ``"count_unique"``,
            ``"avg"`` or ``"stdev"``.

    Returns:
        The capitalised key for the single-word accumulators, or
        ``"CountUnique"``, ``"Average"``, ``"StandardDeviation"`` for the
        three keys whose display name differs from the key.

    Raises:
        ValueError: if ``acc`` is not one of the recognised keys.
    """
    # Single-word keys only need their first letter upper-cased.
    arr = ["max", "min", "sum", "product", "first", "last", "count", "median", "mode"]
    if acc in arr:
        return acc.capitalize()
    elif acc == "count_unique":
        return "CountUnique"
    elif acc == "avg":
        return "Average"
    elif acc == "stdev":
        return "StandardDeviation"
    else:
        raise ValueError(f"unknown accumulator - {acc}")


def pivot(T, rows, columns, functions, values_as_rows=True, tqdm=_tqdm, pbar=None):
"""
param: rows: column names to keep as rows
Expand Down Expand Up @@ -143,7 +128,7 @@ def pivot(T, rows, columns, functions, values_as_rows=True, tqdm=_tqdm, pbar=Non
cols = [[] for _ in range(n)]
for row, ix in row_key_index.items():
for col_name, f in functions:
cols[-1].append(f"{acc2Name(f)}({col_name})")
cols[-1].append(f"{f}({col_name})")
for col_ix, v in enumerate(row):
cols[col_ix].append(v)

Expand Down Expand Up @@ -182,7 +167,7 @@ def pivot(T, rows, columns, functions, values_as_rows=True, tqdm=_tqdm, pbar=Non
for f, v in zip(functions, func_key):
agg_col, func = f
terms = ",".join([agg_col] + [f"{col_name}={value}" for col_name, value in zip(columns, col_key)])
col_name = f"{acc2Name(func)}({terms})"
col_name = f"{func}({terms})"
col_name = unique_name(col_name, result.columns)
names.append(col_name)
cols.append([None for _ in range(col_length)])
Expand Down
2 changes: 1 addition & 1 deletion tablite/version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
major, minor, patch = 2023, 10, 16
major, minor, patch = 2023, 11, 0
__version_info__ = (major, minor, patch)
__version__ = ".".join(str(i) for i in __version_info__)
Loading

0 comments on commit 27f39c6

Please sign in to comment.