Skip to content

Commit

Permalink
Merge pull request #157 from realratchet/master
Browse files Browse the repository at this point in the history
Fixed some edge cases in filter pagination
  • Loading branch information
realratchet authored Apr 5, 2024
2 parents b20b78a + 47ca063 commit 222d900
Show file tree
Hide file tree
Showing 13 changed files with 347 additions and 504 deletions.
2 changes: 1 addition & 1 deletion .github/actions/buildnim/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ runs:
shell: bash
run: |
nimble -y refresh
nimble -y install nimpy
nimble -y install nimpy dotenv
- name: Set Environment Variables
uses: allenevans/set-env@v2.0.0
with:
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ site/

# personal notes
*scratch*.py
*scratch*.nim

# local confidential data
tests/ndata/*
Expand Down
5 changes: 3 additions & 2 deletions nimlite.nimble
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Package

version = "0.2.1"
version = "0.3.0"
author = "Ratchet"
description = "Utilities for tablite to work with nim"
license = "MIT"
Expand All @@ -10,4 +10,5 @@ license = "MIT"
# Dependencies

requires "nim >= 2.0.0"
requires "nimpy >= 0.2.0"
requires "nimpy >= 0.2.0"
requires "dotenv >= 2.0.0"
220 changes: 0 additions & 220 deletions nimlite/funcs/column_selector.nim
Original file line number Diff line number Diff line change
Expand Up @@ -7,223 +7,3 @@ export toPyObj
export collectColumnSelectInfo
export doSliceConvert
export fromPyObjToDesiredInfos

when isMainModule and appType != "lib":

import std/[os, tables, sugar, sets, sequtils, paths, macros]
import nimpy
from ../nimpyext import `!`
import std/options as opt
import ../pymodules
import ../numpy
import typetraits

proc columnSelect(table: nimpy.PyObject, cols: nimpy.PyObject, tqdm: nimpy.PyObject, dir_pid: Path, TaskManager: nimpy.PyObject): (nimpy.PyObject, nimpy.PyObject) =
  ## Nim-only column selection, used when this module is run directly.
  ##
  ## The library build does not need this proc because there the slices are
  ## dispatched through `TaskManager`; here every slice is converted
  ## sequentially in-process, so `TaskManager` is accepted but never used.
  ##
  ## * `table`   - source table (Python object); its type is reused to build
  ##               both result tables.
  ## * `cols`    - column selection descriptors, forwarded unchanged to
  ##               `collectColumnSelectInfo`.
  ## * `tqdm`    - progress bar class; one bar is instantiated here and is
  ##               owned (and closed) by this proc.
  ## * `dir_pid` - working directory handed to the collection and
  ##               conversion helpers.
  ##
  ## Returns the `(passed, failed)` table pair.
  let TableClass = modules().getType(table)
  var pbar = tqdm!(total: 100, desc: "column select")

  # The bar is created here, so it must be released on every exit path:
  # the early return below, the normal return and any raised exception.
  defer: discard pbar.close()

  let colInfoResult = collectColumnSelectInfo(table, cols, string dir_pid, pbar)

  if toSeq(colInfoResult.isCorrectType.values).allIt(it):
    # Fast path: every requested column is already flagged as having the
    # correct type, so the original columns are reused and no page is
    # converted.
    let tblPassColumns = collect(initTable()):
      for (desiredName, desiredInfo) in colInfoResult.desiredColumnMap.pairs():
        {desiredName: table[desiredInfo.originalName]}

    let tblFailColumns = collect(initTable()):
      for desiredName in colInfoResult.failedColumnData:
        {desiredName: newSeq[nimpy.PyObject]()}

    let tblPass = TableClass!(columns: tblPassColumns)
    let tblFail = TableClass!(columns: tblFailColumns)

    return (tblPass, tblFail)

  template ordered2PyDict(keys: seq[string]): nimpy.PyObject =
    # Builds a Python dict with one empty column per key, inserting the keys
    # in the order given.
    let dict = modules().builtins.classes.DictClass!()

    for k in keys:
      dict[k] = newSeq[nimpy.PyObject]()

    dict

  var tblPass = TableClass!(columns = colInfoResult.passedColumnData.ordered2PyDict())
  var tblFail = TableClass!(columns = colInfoResult.failedColumnData.ordered2PyDict())

  # One task per page index: the i-th page of every column together with the
  # i-th pass/fail result descriptors.
  let taskListInp = collect:
    for i in 0..<colInfoResult.pageCount:
      let el = collect(initTable()):
        for (name, column) in colInfoResult.columns.pairs:
          {name: (column[i], colInfoResult.originalPagesMap[name][i])}
      (el, colInfoResult.resColsPass[i], colInfoResult.resColsFail[i])

  let tabliteConfig = modules().tablite.modules.config.classes.Config
  let pageSize = tabliteConfig.PAGE_SIZE.to(int)
  var converted = newSeqOfCap[(seq[(string, nimpy.PyObject)], seq[(string, nimpy.PyObject)])](taskListInp.len)
  # 45 progress units are spread over the tasks; `max(..., 1)` keeps the
  # divisor non-zero when there are zero or one tasks.
  let pbarStep = 45 / max(taskListInp.len - 1, 1)

  for (columns, resPass, resFail) in taskListInp:
    converted.add(doSliceConvert(dir_pid, pageSize, columns, colInfoResult.rejectReasonName, resPass, resFail, colInfoResult.desiredColumnMap, colInfoResult.columnNames, colInfoResult.isCorrectType))

    discard pbar.update(pbarStep)

  proc extendTable(table: var nimpy.PyObject, columns: seq[(string, nimpy.PyObject)]): void {.inline.} =
    # Appends each converted page to the page list of its column.
    for (col_name, pg) in columns:
      let col = table[col_name]

      # Pages are appended one by one; `col.extend` is not usable from here.
      discard col.pages.append(pg)

  for (pg_pass, pg_fail) in converted:
    tblPass.extendTable(pg_pass)
    tblFail.extendTable(pg_fail)

  # Push the bar to its total; the deferred close above then releases it.
  discard pbar.update(pbar.total.to(float) - pbar.n.to(float))

  return (tblPass, tblFail)

proc newColumnSelectorInfo(column: string, `type`: string, allow_empty: bool, rename: opt.Option[string]): nimpy.PyObject =
  ## Builds one column-selection descriptor as a Python dict with the keys
  ## `column`, `type`, `allow_empty` and `rename`.
  ##
  ## `rename` is stored as the wrapped string when present, otherwise as
  ## `nil`.
  result = modules().builtins.classes.DictClass!(
    column: column,
    type: `type`,
    allow_empty: allow_empty
  )

  if rename.isSome():
    result["rename"] = rename.get()
  else:
    result["rename"] = nil

# Demo setup: read tablite's Config, derive the page directory
# `<workdir>/nim/pages` and make sure it exists.
let tabliteConfig = modules().tablite.modules.config.classes.Config
let workdir = Path(modules().toStr(tabliteConfig.workdir))
# Fixed process id used for this standalone run.
let pid = "nim"
let pagedir = workdir / Path(pid) / Path("pages")

createDir(string pagedir)

# Write the pid back into the Config object so that it matches the directory
# created above.
tabliteConfig.pid = pid
# Optional overrides, kept for manual experiments:
# tabliteConfig.pageSize = 2
# tabliteConfig.MULTIPROCESSING_MODE = tabliteConfig.FALSE

# let columns = modules().builtins.classes.DictClass!({"A ": @[nimValueToPy(0), nimValueToPy(nil), nimValueToPy(10), nimValueToPy(200)]}.toTable)
# let columns = modules().builtins.classes.DictClass!({"A ": @[1, 22, 333]}.toTable)
# let columns = modules().builtins.classes.DictClass!({"A": @[0, 1]}.toTable)
# let columns = modules().builtins.classes.DictClass!({"A ": @["1", "22", "333", "", "abc"]}.toTable)
# let columns = modules().builtins.classes.DictClass!({"A": @["a", "1", "c"], "B": @["d", "e", "f"]}.toTable)
# let columns = modules().builtins.classes.DictClass!({"A ": @[nimValueToPy("1"), nimValueToPy("222"), nimValueToPy("333"), nimValueToPy(nil), nimValueToPy("abc")]}.toTable)
# let columns = modules().builtins.classes.DictClass!({"A ": @[nimValueToPy(1), nimValueToPy(2.0), nimValueToPy("333"), nimValueToPy("abc")]}.toTable)
# let columns = modules().builtins.classes.DictClass!({"A": @[nimValueToPy(111111), nimValueToPy(222222), nimValueToPy(333333)], "B": @[nimValueToPy(0), nimValueToPy(nil), nimValueToPy(2)]}.toTable)
# let columns = modules().builtins.classes.DictClass!({"A": @[nimValueToPy("0"), nimValueToPy(nil), nimValueToPy("2")], "B": @[nimValueToPy("3"), nimValueToPy(nil), nimValueToPy("4")]}.toTable)
# let columns = modules().builtins.classes.DictClass!({"str": @["1", "0"]})
# let columns = modules().builtins.classes.DictClass!({"float": @[1.0, 0.0]})
# Active demo input: a single "date" column holding two Python date objects
# (2000-01-01 and 2000-01-02).
let columns = modules().builtins.classes.DictClass!({"date": @[
modules().datetime.classes.DateClass!(2000, 1, 1),
modules().datetime.classes.DateClass!(2000, 1, 2),
]})
# let columns = pymodules.builtins().dict({"str": @[nimValueToPy("abc"), nimValueToPy("efg"), nimValueToPy(nil)]}.toTable)
# Wrap the dict in a tablite table.
let table = modules().tablite.classes.TableClass!(columns = columns)
# Data directory from the DATA_DIR environment variable, defaulting to ".".
# Only the commented-out file-based loaders refer to it.
let dirdata = os.getEnv("DATA_DIR", ".")
# let table = modules().tablite.fromFile(dirdata & "/gesaber_data_10k.csv")
# let table = modules().tablite.classes.TableClass.load("/media/ratchet/hdd/tablite/filter_0_false.tpz")

# discard table.show(dtype = true)

# Column selection descriptors passed to columnSelect. Only the two
# uncommented "date" entries are active; the commented-out entries are
# alternative scenarios kept for manual testing.
# NOTE(review): several commented-out entries contain `",,` (a doubled comma)
# and would not compile if re-enabled as written.
let select_cols = modules().builtins.classes.ListClass!(@[
# newColumnSelectorInfo("A ",, "int", true, opt.none[string]()),
# newColumnSelectorInfo("A ",, "float", true, opt.none[string]()),
# newColumnSelectorInfo("A ",, "float", false, opt.none[string]()),
# newColumnSelectorInfo("A ", "bool", false, opt.none[string]()),
# newColumnSelectorInfo("A ",, "str", false, opt.none[string]()),
# newColumnSelectorInfo("A ", "date", false, opt.none[string]()),
# newColumnSelectorInfo("A ", "datetime", false, opt.none[string]()),
# newColumnSelectorInfo("A ", "time", false, opt.none[string]()),
# newColumnSelectorInfo("A",, "int", true, opt.none[string]()),
# newColumnSelectorInfo("B",, "str", true, opt.none[string]()),

# newColumnSelectorInfo("str", "bool", false, opt.some("bool")),
# newColumnSelectorInfo("str",, "int", false, opt.some("int")),
# newColumnSelectorInfo("str",, "float", false, opt.some("float")),
# newColumnSelectorInfo("str",, "str", false, opt.some("str")),

# newColumnSelectorInfo("float", "bool", false, opt.some("bool")),
# newColumnSelectorInfo("float",, "int", false, opt.some("int")),
# newColumnSelectorInfo("float",, "float", false, opt.some("float")),
# newColumnSelectorInfo("float",, "str", false, opt.some("str")),
# newColumnSelectorInfo("float", "date", false, opt.some("date")),
# newColumnSelectorInfo("float", "time", false, opt.some("time")),
# newColumnSelectorInfo("float", "datetime", false, opt.some("datetime")),

# newColumnSelectorInfo("date", "bool", false, opt.some("bool")),
# newColumnSelectorInfo("date",, "int", false, opt.some("int")),
# newColumnSelectorInfo("date",, "float", false, opt.some("float")),
# newColumnSelectorInfo("date",, "str", false, opt.some("str")),
# newColumnSelectorInfo("date", "date", false, opt.some("date")),
# newColumnSelectorInfo("date", "time", false, opt.some("time")),
# newColumnSelectorInfo("date", "datetime", false, opt.some("datetime")),

# newColumnSelectorInfo("str",, "str", true, opt.some("str")),

# newColumnSelectorInfo("A",, "str", false, opt.none[string]()),
# newColumnSelectorInfo("B",, "int", false, opt.none[string]()),

# newColumnSelectorInfo("A", "int", false, opt.none[string]()),

# newColumnSelectorInfo("date", "bool", false, opt.some("bool")),
# newColumnSelectorInfo("date", "int", false, opt.some("int")),
# newColumnSelectorInfo("date", "float", false, opt.some("float")),
# newColumnSelectorInfo("date", "str", false, opt.some("str")),
# newColumnSelectorInfo("date", "date", false, opt.some("date")),
# Active entries: cast the "date" column to time and to datetime, renaming
# the outputs to "time" and "datetime" respectively.
newColumnSelectorInfo("date", "time", false, opt.some("time")),
newColumnSelectorInfo("date", "datetime", false, opt.some("datetime")),

# newColumnSelectorInfo("sale_date", "datetime", false, opt.none[string]()),
# newColumnSelectorInfo("cust_nbr", "str", false, opt.none[string]()),
# newColumnSelectorInfo("Order_Number", "str", false, opt.none[string]()),
# newColumnSelectorInfo("prod_slbl", "str", false, opt.none[string]()),
# newColumnSelectorInfo("cases", "int", false, opt.none[string]()),

# newColumnSelectorInfo("Article code", "str", false, opt.none[string]()),
# newColumnSelectorInfo("Article Description", "str", false, opt.none[string]()),
# newColumnSelectorInfo("Department", "str", false, opt.none[string]()),
# newColumnSelectorInfo("Department Name", "str", false, opt.none[string]()),
# newColumnSelectorInfo("MC", "str", false, opt.none[string]()),
# newColumnSelectorInfo("MC Name", "str", false, opt.none[string]()),
# newColumnSelectorInfo("Season", "str", false, opt.none[string]()),
# newColumnSelectorInfo("Season Name", "str", false, opt.none[string]()),
# newColumnSelectorInfo("Source", "int", false, opt.none[string]()),
# newColumnSelectorInfo("Source Name", "str", false, opt.none[string]()),
# newColumnSelectorInfo("X-site artl status", "int", false, opt.none[string]()),
# newColumnSelectorInfo("X-site artl status desc", "str", false, opt.none[string]()),
# newColumnSelectorInfo("Fragile?", "str", false, opt.none[string]()),
# newColumnSelectorInfo("Inner Type (Current)", "str", false, opt.none[string]()),
# newColumnSelectorInfo("Inner Name (Current)", "str", false, opt.none[string]()),
# newColumnSelectorInfo("Inner Type (STO)", "str", false, opt.none[string]()),
# newColumnSelectorInfo("Units per Case (Current)", "float", false, opt.none[string]()),
# newColumnSelectorInfo("Units per Case (STO)", "float", false, opt.none[string]()),
# newColumnSelectorInfo("Units per pallet\\nLATEST", "float", false, opt.some("Units per pallet")),
# newColumnSelectorInfo("Case L (m)", "float", false, opt.none[string]()),
# newColumnSelectorInfo("Case W (m)", "float", false, opt.none[string]()),
# newColumnSelectorInfo("Case H (m)", "float", false, opt.none[string]()),
# newColumnSelectorInfo("Case Vol (m3)", "float", false, opt.none[string]()),
# newColumnSelectorInfo("Case Gross Weight (KG)", "float", false, opt.none[string]()),
# newColumnSelectorInfo("Inner L (m)", "float", false, opt.none[string]()),
# newColumnSelectorInfo("Inner Weight", "float", false, opt.none[string]()),
# newColumnSelectorInfo("STOs", "str", false, opt.none[string]()),
# newColumnSelectorInfo("Sum of STO Qty", "str", false, opt.none[string]()),
# newColumnSelectorInfo("Pallet Ti", "int", false, opt.none[string]()),
# newColumnSelectorInfo("Pallet Hi", "int", false, opt.none[string]()),
# newColumnSelectorInfo("LAY", "str", false, opt.none[string]()),
])



# Run the Nim-only column selection on the demo table. The progress bar class
# comes from Python's tqdm module and pages are written below `<workdir>/<pid>`.
let (select_pass, select_fail) = table.columnSelect(
select_cols,
nimpy.pyImport("tqdm").tqdm,
dir_pid = workdir / Path(pid),
# Nim identifiers ignore case after the first character, so this binds to
# the `TaskManager` parameter.
Taskmanager = modules().mplite.classes.TaskManager
)

# Print both result tables, including their dtypes.
discard select_pass.show(dtype = true)
discard select_fail.show(dtype = true)
74 changes: 23 additions & 51 deletions nimlite/funcs/filter.nim
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,18 @@ proc filter*(table: nimpy.PyObject, pyExpressions: seq[nimpy.PyObject], filterTy
template dumpPage(columns: seq[string], passColumn: nimpy.PyObject, failColumn: nimpy.PyObject): void =
var firstPage = 0
var currentOffset = 0
var maskOffset = 0

template dumpSlice(slice: seq[int], originalPage: BaseNDArray, column: nimpy.PyObject): void =
  # Writes the rows of `originalPage` selected by `slice` as a new page and
  # appends that page to `column`. An empty selection produces no page.
  if slice.len > 0:
    # Reserve a fresh page id before anything is written.
    let pageId = base.classes.SimplePageClass.next_id(string basedir).to(string)
    let selected = originalPage[slice]

    # Persist the selected rows as `<pagedir>/<id>.npy`.
    selected.save(string(pagedir / Path(pageId & ".npy")))

    # Register the saved page on the Python column.
    discard column.pages.append(newPyPage(selected, string basedir, pageId))

while true:
var len = getPageLen(columns[firstPage])
Expand All @@ -225,17 +237,15 @@ proc filter*(table: nimpy.PyObject, pyExpressions: seq[nimpy.PyObject], filterTy
inc firstPage
currentOffset = currentOffset + len

var maskOffset = 0

let indiceOffset = offset - currentOffset
var indiceOffset = offset - currentOffset
var maskLeftOver = bitNum - maskOffset

while maskOffset < bitNum:
while maskLeftOver > 0:
let page = readNumpy(columns[firstPage])

let len = page.len
let sliceMax = min((bitNum - maskOffset), len)
let sliceLen = sliceMax - maskOffset
let slice = maskOffset..<sliceMax
let len = (page.len - indiceOffset)
let sliceLen = min(maskLeftOver, len)
let slice = maskOffset..<(sliceLen + maskOffset)

var validIndices = newSeqOfCap[int](sliceLen - (sliceLen shr 2))
var invalidIndices = newSeqOfCap[int](sliceLen shr 2)
Expand All @@ -244,25 +254,12 @@ proc filter*(table: nimpy.PyObject, pyExpressions: seq[nimpy.PyObject], filterTy
if m: validIndices.add(i + indiceOffset)
else: invalidIndices.add(i + indiceOffset)

let passPid = base.classes.SimplePageClass.next_id(string basedir).to(string)
let failPid = base.classes.SimplePageClass.next_id(string basedir).to(string)

let passPath = string(pagedir / Path(passPid & ".npy"))
let failPath = string(pagedir / Path(failPid & ".npy"))

let passPage = page[validIndices]
let failPage = page[invalidIndices]

passPage.save(passPath)
failPage.save(failPath)

let passPagePy = newPyPage(passPage, string basedir, passPid)
let failPagePy = newPyPage(failPage, string basedir, failPid)

discard passColumn.pages.append(passPagePy)
discard failColumn.pages.append(failPagePy)
validIndices.dumpSlice(page, passColumn)
invalidIndices.dumpSlice(page, failColumn)

maskOffset = maskOffset + sliceLen
maskLeftOver = maskLeftOver - sliceLen
indiceOffset = 0
inc firstPage

template dumpPages(tablePages: Table[string, seq[string]]): void =
Expand All @@ -272,7 +269,7 @@ proc filter*(table: nimpy.PyObject, pyExpressions: seq[nimpy.PyObject], filterTy
let tableLen = builtins.getLen(table)
let tqdmLen = int ceil(float(tableLen) / float(pageSize))
let TqdmClass = (if isNone(tqdm): m.tqdm.classes.TqdmClass else: tqdm)
let pbar = TqdmClass!(total: tqdmLen, desc="filter")
let pbar = TqdmClass!(total: tqdmLen, desc = "filter")

for (i, row) in enumerate(exprCols.iterateRows(tablePages)):
bitmask[bitNum] = row.checkExpressions(exprCols, expressions, filterType)
Expand Down Expand Up @@ -303,28 +300,3 @@ proc filter*(table: nimpy.PyObject, pyExpressions: seq[nimpy.PyObject], filterTy
discard pbar.close()

return (passTable, failTable)


# Manual smoke test for `filter`, compiled into every non-library build.
# NOTE(review): there is no `isMainModule` guard here, so this presumably also
# runs when the module is imported into a non-lib build — confirm.
when appType != "lib":
let m = modules()
let Config = m.tablite.modules.config.classes.Config

# Config.PAGE_SIZE = 2

# Three integer columns with four rows each.
let table = m.tablite.classes.TableClass!({
"a": @[1, 2, 3, 4],
"b": @[10, 20, 30, 40],
"c": @[4, 4, 4, 4]
}.toTable)
# Active expressions: a >= 2 and a == c.
let pyExpressions = @[
m.builtins.classes.DictClass!(column1: "a", criteria: ">=", value2: 2),
# m.builtins.classes.DictClass!(column1: "b", criteria: "==", value2: 20),
m.builtins.classes.DictClass!(column1: "a", criteria: "==", column2: "c"),
]

# The page size is lowered only after the table above has been created.
Config.PAGE_SIZE = 2

# Filter type "all" with no tqdm override (nil).
let (tblPass, tblFail) = filter(table, pyExpressions, "all", nil)

# Print the passing and failing tables.
discard tblPass.show()
discard tblFail.show()
Loading

0 comments on commit 222d900

Please sign in to comment.