diff --git a/.github/actions/buildnim/action.yml b/.github/actions/buildnim/action.yml index b2aa3baa..d8fb03e9 100644 --- a/.github/actions/buildnim/action.yml +++ b/.github/actions/buildnim/action.yml @@ -10,7 +10,7 @@ runs: shell: bash run: | nimble -y refresh - nimble -y install nimpy + nimble -y install nimpy dotenv - name: Set Environment Variables uses: allenevans/set-env@v2.0.0 with: diff --git a/.gitignore b/.gitignore index 849483e1..685f6158 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,7 @@ site/ # personal notes *scratch*.py +*scratch*.nim # local confidential data tests/ndata/* diff --git a/nimlite.nimble b/nimlite.nimble index 19ba4799..ef6ac9e7 100644 --- a/nimlite.nimble +++ b/nimlite.nimble @@ -1,6 +1,6 @@ # Package -version = "0.2.1" +version = "0.3.0" author = "Ratchet" description = "Utilities for tablite to work with nim" license = "MIT" @@ -10,4 +10,5 @@ license = "MIT" # Dependencies requires "nim >= 2.0.0" -requires "nimpy >= 0.2.0" \ No newline at end of file +requires "nimpy >= 0.2.0" +requires "dotenv >= 2.0.0" \ No newline at end of file diff --git a/nimlite/funcs/column_selector.nim b/nimlite/funcs/column_selector.nim index 96d17e4a..7fbc5ee5 100644 --- a/nimlite/funcs/column_selector.nim +++ b/nimlite/funcs/column_selector.nim @@ -7,223 +7,3 @@ export toPyObj export collectColumnSelectInfo export doSliceConvert export fromPyObjToDesiredInfos - -when isMainModule and appType != "lib": - - import std/[os, tables, sugar, sets, sequtils, paths, macros] - import nimpy - from ../nimpyext import `!` - import std/options as opt - import ../pymodules - import ../numpy - import typetraits - - proc columnSelect(table: nimpy.PyObject, cols: nimpy.PyObject, tqdm: nimpy.PyObject, dir_pid: Path, TaskManager: nimpy.PyObject): (nimpy.PyObject, nimpy.PyObject) = - # this is nim-only implementation, the library build doesn't need it because we need TaskManager to be used for slices - let TableClass = modules().getType(table) - var pbar = tqdm!(total: 100, desc: "column select") - let colInfoResult = collectColumnSelectInfo(table, cols, string dir_pid, pbar) - - if toSeq(colInfoResult.isCorrectType.values).all(proc (x: bool): bool = x): - let tblPassColumns = collect(initTable()): - for (desiredName, desiredInfo) in colInfoResult.desiredColumnMap.pairs(): - {desiredName: table[desiredInfo.originalName]} - - let tblFailColumns = collect(initTable()): - for desiredName in colInfoResult.failedColumnData: - {desiredName: newSeq[nimpy.PyObject]()} - - let tblPass = TableClass!(columns: tblPassColumns) - let tblFail = TableClass!(columns: tblFailColumns) - - return (tblPass, tblFail) - - template ordered2PyDict(keys: seq[string]): nimpy.PyObject = - let dict = modules().builtins.classes.DictClass!() - - for k in keys: - dict[k] = newSeq[nimpy.PyObject]() - - dict - - var tblPass = TableClass!(columns = colInfoResult.passedColumnData.ordered2PyDict()) - var tblFail = TableClass!(columns = colInfoResult.failedColumnData.ordered2PyDict()) - - var taskListInp = collect: - for i in 0.. 0: + let pgid = base.classes.SimplePageClass.next_id(string basedir).to(string) + let pgPath = string(pagedir / Path(pgid & ".npy")) + let page = originalPage[slice] + page.save(pgPath) + + let pyPage = newPyPage(page, string basedir, pgid) + + discard column.pages.append(pyPage) while true: var len = getPageLen(columns[firstPage]) @@ -225,17 +237,15 @@ proc filter*(table: nimpy.PyObject, pyExpressions: seq[nimpy.PyObject], filterTy inc firstPage currentOffset = currentOffset + len - var maskOffset = 0 - - let indiceOffset = offset - currentOffset + var indiceOffset = offset - currentOffset + var maskLeftOver = bitNum - maskOffset - while maskOffset < bitNum: + while maskLeftOver > 0: let page = readNumpy(columns[firstPage]) - let len = page.len - let sliceMax = min((bitNum - maskOffset), len) - let sliceLen = sliceMax - maskOffset - let slice = maskOffset..=", value2: 2), - # m.builtins.classes.DictClass!(column1: "b", criteria: "==", value2: 20), - m.builtins.classes.DictClass!(column1: "a", criteria: "==", column2: "c"), - ] - - Config.PAGE_SIZE = 2 - - let (tblPass, tblFail) = filter(table, pyExpressions, "all", nil) - - discard tblPass.show() - discard tblFail.show() diff --git a/nimlite/funcs/imputation.nim b/nimlite/funcs/imputation.nim index ca12d314..b4f3a8ca 100644 --- a/nimlite/funcs/imputation.nim +++ b/nimlite/funcs/imputation.nim @@ -265,35 +265,3 @@ proc nearestNeighbourImputation*(T: nimpy.PyObject, sources: seq[string], discard pbar.close() return newTable - -when appType != "lib": - modules().tablite.modules.config.classes.Config.PAGE_SIZE = 1 - let columns = modules().builtins.classes.DictClass!() - # 1 - columns["A"] = @[nimValueToPy(0), nimValueToPy(1), nimValueToPy(nil), nimValueToPy(3), nimValueToPy(0)] - columns["B"] = @[nimValueToPy("4"), nimValueToPy(5), nimValueToPy(6), nimValueToPy(7), nimValueToPy(4)] - - # 2 - # columns["a"] = @[1, 1, 5, 5, 6, 6] - # columns["b"] = @[2, 2, 5, 5, 6, -1] - # columns["c"] = @[nimValueToPy(3), nimValueToPy(nil), nimValueToPy(5), nimValueToPy("NULL"), nimValueToPy(6), nimValueToPy(6)] - - # 3 - # columns["a"] = @[nimValueToPy(nil), nimValueToPy(1), nimValueToPy(2), nimValueToPy(3)] - # columns["b"] = @[nimValueToPy(0), nimValueToPy(nil), nimValueToPy(2), nimValueToPy(3)] - # columns["c"] = @[nimValueToPy(0), nimValueToPy(1), nimValueToPy(nil), nimValueToPy(3)] - # columns["d"] = @[nimValueToPy(0), nimValueToPy(1), nimValueToPy(2), nimValueToPy(nil)] - - let table = modules().tablite.classes.TableClass!(columns = columns) - - discard table.show() - - # echo index(table, @["A", "B"]) - # 1 - var r = nearestNeighbourImputation(table, @["A", "B"], @[PY_ObjectND(PY_None)], @["A"]) - # 2 - # var r = nearestNeighbourImputation(table, @["a", "b", "c"], @[PY_ObjectND(PY_None), newPY_Object("NULL"), newPY_Object(-1)], @["b", "c"]) - # 3 - # var r = nearestNeighbourImputation(table, @["a", "b", "c", "d"], @[PY_ObjectND(PY_None)], @["a", "b", "c", "d"]) - - discard r.show() diff --git a/nimlite/funcs/text_reader.nim b/nimlite/funcs/text_reader.nim index a60b910f..2097a107 100644 --- a/nimlite/funcs/text_reader.nim +++ b/nimlite/funcs/text_reader.nim @@ -13,163 +13,3 @@ export text_reader export table export encfile export csvparse - -when isMainModule and appType != "lib": - import nimpy - import std/[paths, osproc, times, enumerate, strutils, options, os] - import ../[numpy, pymodules, nimpyext] - - proc toTaskArgs*(path: string, encoding: string, dialect: TabliteDialect, task: TabliteTask, import_fields: seq[uint], guess_dtypes: bool, skip_empty: string): TaskArgs {.inline.} = - return toTaskArgs( - path = path, - encoding = encoding, - dia_delimiter = dialect.delimiter, - dia_quotechar = dialect.quotechar, - dia_escapechar = dialect.escapechar, - dia_doublequote = dialect.doublequote, - dia_quoting = dialect.quoting, - dia_skipinitialspace = dialect.skipinitialspace, - dia_skiptrailingspace = dialect.skiptrailingspace, - dia_lineterminator = dialect.lineterminator, - dia_strict = dialect.strict, - guess_dtypes = guess_dtypes, - tsk_pages = task.pages, - tsk_offset = task.offset, - tsk_count = task.count, - import_fields = import_fields, - skip_empty = skip_empty - ) - - proc runTask*(taskArgs: TaskArgs, pageInfo: PageInfo): void = - discard taskArgs.textReaderTask(pageInfo) - - proc executeParallel*(path: string): void = - echo "Executing tasks: '" & path & "'" - let args = @[ - "--progress", - "-a", - "\"" & path & "\"" - ] - - let para = "/usr/bin/parallel" - let ret_code = execCmd(para & " " & args.join(" ")) - - if ret_code != 0: - raise newException(Exception, "Process failed with errcode: " & $ret_code) - - proc importFile*( - pid: string, taskname: string, path: string, encoding: FileEncoding, dialect: Dialect, - cols: Option[seq[string]], first_row_has_headers: bool, header_row_index: uint, - page_size: uint, guess_dtypes: bool, skipEmpty: SkipEmpty, - start: Option[int], limit: Option[int] - ): PyObject = - let d0 = getTime() - - let table = importTextFile(pid, path, encoding, dialect, cols, first_row_has_headers, header_row_index, page_size, guess_dtypes, skipEmpty, start, limit) - let task = table.task - - for i, column_task in enumerate(task.tasks): - let taskArgs = toTaskArgs(task.path, task.encoding, task.dialect, column_task, task.import_fields, task.guess_dtypes, $skipEmpty) - let pageInfo = taskArgs.collectPageInfoTask() - - taskArgs.runTask(pageInfo) - echo "Dumped " & $(i + 1) & "/" & $task.tasks.len - - let d1 = getTime() - echo $(d1 - d0) - - let pyTable = modules().tablite.classes.TableClass!() - - for cInfo in table.columns: - let pyColumn = modules().tablite.modules.base.classes.ColumnClass!(pid) - pyTable[cInfo.name] = pyColumn - - for page in cInfo.pages: - let pPage = Path(page) - - let workdir = string pPage.parentDir.parentDir - let id = string pPage.extractFilename.changeFileExt("") - - let len = getPageLen(page) - let dtypes = getPageTypes(page) - let page = newPyPage(id, workdir, len, dtypes) - - discard pyColumn.pages.append(page) - - return pyTable - - let m = modules() - let Config = m.tablite.modules.config.classes.Config - let workdir = Path(m.toStr(Config.workdir)) - - var path_csv: string - var encoding = str2Enc($ENC_UTF8) - var dialect: Dialect - var cols = none[seq[string]]() - var guess_dtypes = true - var pid = string (workdir / Path("nim")) - var taskname = "task" - var page_size = uint Config.PAGE_SIZE.to(int) - - var delimiter = ',' - var quotechar = '"' - var escapechar = '\\' - var lineterminator = '\n' - var doublequote = true - var quoting = QUOTE_MINIMAL - - var skipinitialspace = false - var skiptrailingspace = false - var skipEmpty = SkipEmpty.ALL - - dialect = newDialect( - delimiter = delimiter, - quotechar = quotechar, - escapechar = escapechar, - doublequote = doublequote, - quoting = quoting, - skipinitialspace = skipinitialspace, - skiptrailingspace = skiptrailingspace, - lineterminator = lineterminator, - ) - - let dirdata = os.getEnv("DATA_DIR", ".") - - # (path_csv, encoding) = ("tests/data/split_lines.csv", str2Enc($ENC_UTF8)) - # (path_csv, encoding) = (dirdata & "/Dealz Poland v1.csv", str2Enc($ENC_UTF8)) - # (path_csv, encoding) = ("tests/data/floats.csv", str2Enc($ENC_UTF8)) - # (path_csv, encoding) = ("tests/data/bad_empty.csv", str2Enc($ENC_UTF8)) - # (path_csv, encoding) = ("tests/data/book1.csv", str2Enc($ENC_UTF8)) - # (path_csv, encoding) = ("tests/data/detect_misalignment.csv", str2Enc($ENC_UTF8)) - # (path_csv, encoding) = (dirdata & "/Ritual B2B orderlines updated.csv", str2Enc($ENC_UTF8)) - # (path_csv, encoding) = (dirdata & "/Ritual B2B orderlines_small.csv", str2Enc($ENC_UTF8)) - # (path_csv, encoding) = ("tests/data/utf16_test.csv", str2Enc($ENC_UTF16)) - # (path_csv, encoding) = ("tests/data/win1250_test.csv", str2ConvEnc("Windows-1252")) - - # (path_csv, encoding) = ("tests/data/book1.txt", str2Enc($ENC_UTF8)) - # (path_csv, encoding) = ("tests/data/gdocs1.csv", str2Enc($ENC_UTF8)) - # (path_csv, encoding) = ("tests/data/with_empty_lines.csv", str2Enc($ENC_UTF8)) - (path_csv, encoding) = ("tests/data/with_escape.csv", str2Enc($ENC_UTF8)) - # (path_csv, encoding) = (dirdata & "/Dematic YDC Order Data.csv", str2Enc($ENC_UTF8)) - # (path_csv, encoding) = (dirdata & "/Dematic YDC Order Data_1M.csv", str2Enc($ENC_UTF8)) - # (path_csv, encoding) = (dirdata & "/Dematic YDC Order Data_1M_1col.csv", str2Enc($ENC_UTF8)) - # (path_csv, encoding) = (dirdata & "/gesaber_data.csv", str2Enc($ENC_UTF8)) - # (path_csv, encoding) = ("tests/data/utf16_be.csv", str2Enc($ENC_UTF16)) - # (path_csv, encoding) = ("tests/data/utf16_le.csv", str2Enc($ENC_UTF16)) - - # cols = some(@["\"Item\"", "\"Materiál\"", "\"Objem\"", "\"Jednotka objemu\"", "\"Free Inv Pcs\""]) - # dialect.quoting = Quoting.QUOTE_NONE - # dialect.delimiter = ';' - - let start = some[int](0) - let limit = some[int](-1) - let first_row_has_headers = true - let header_row_index = uint 0 - - guess_dtypes = true - # cols = some(@["a", "c"]) - # page_size = 2 - - let pyTable = importFile(pid, taskname, path_csv, encoding, dialect, cols, first_row_has_headers, header_row_index, page_size, guess_dtypes, skipEmpty, start, limit) - - discard pyTable.show(dtype=true) diff --git a/nimlite/numpy.nim b/nimlite/numpy.nim index 532f000a..015d7c34 100644 --- a/nimlite/numpy.nim +++ b/nimlite/numpy.nim @@ -1462,36 +1462,4 @@ proc index*(table: nimpy.PyObject, columnNames: openArray[string]): TableIndices d[row] = newSeq[int]() d[row].add(ix) inc ix - return d - -when isMainModule and appType != "lib": - let tabliteConfig = modules().tablite.modules.config.classes.Config - # let workdir = Path(modules().toStr(tabliteConfig.workdir)) - let pid = "nim" - # let pagedir = workdir / Path(pid) / Path("pages") - - echo readNumpy("tests/data/pages/scalar.npy").len - - # createDir(string pagedir) - - tabliteConfig.pid = pid - tabliteConfig.PAGE_SIZE = 2 - - let columns = modules().builtins.classes.DictClass!({"A": @["1", "22", "333", "4444", "55555", "666666", "7777777"]}.toTable) - let table = modules().tablite.classes.TableClass!(columns = columns) - let pages = collect: (for p in table["A"].pages: modules().toStr(p.path)) - - let newPages = repaginate(pages) - - echo newPages - - for i in toSeq(iterateColumn[string](table["A"])): - echo i - - echo newNDArray[DateNDArray](@[now().utc]) - echo newNDArray[DateTimeNDArray](@[now().utc]) - echo newNDArray(@[false, false, true]) - echo newNDArray(@[1, 2, 3]) - echo newNDArray(@[1.0, 2.0, 3.0]) - echo newNDArray(@["a", "bb", "ccc"]) - echo newNDArray(@[newPY_Object()]) + return d \ No newline at end of file diff --git a/nimlite/pymodules.nim b/nimlite/pymodules.nim index f7b95807..c4296030 100644 --- a/nimlite/pymodules.nim +++ b/nimlite/pymodules.nim @@ -1,4 +1,13 @@ -from std/os import getEnv +import dotenv +import nimpy/py_lib +from std/os import getEnv, existsEnv, fileExists + +if fileExists("./.env"): + load() + +if existsEnv("LIB_PYTHON"): + pyInitLibPath(getEnv("LIB_PYTHON")) + from std/strutils import split from std/sugar import collect from nimpyext import `!` @@ -151,7 +160,7 @@ proc getLen*(inst: PyModule[PyBuiltins], obj: PyObject): int {.inline.} = inst.m proc fromFile*(inst: PyModule[PyTablite], path: string): PyObject {.inline.} = inst.classes.TableClass.from_file(path) proc collectPages*(inst: PyModule[PyTabliteBase], column: PyObject): seq[string] {.inline.} = let builtins = modules().builtins - + if not builtins.isinstance(column, inst.classes.ColumnClass): raise newException(ValueError, "not a column") @@ -168,7 +177,6 @@ proc toStr*(self: PyModules, obj: PyObject): string {.inline.} = self.builtins.t proc toRepr*(self: PyModules, obj: PyObject): string {.inline.} = self.builtins.toRepr(obj) proc getLen*(self: PyModules, obj: PyObject): int {.inline.} = self.builtins.getLen(obj) - proc isNone*(obj: PyObject): bool {.inline.} = modules().builtins.isinstance(obj, py.get.builtins.classes.NoneTypeClass) proc `in`*[T](a: T, b: nimpy.PyObject): bool {.inline.} = let m = modules() @@ -184,4 +192,4 @@ proc contains*[T](a: T, b: nimpy.PyObject): bool {.inline.} = return isIn.to(bool) -proc `notin`*[T](a: T, b: nimpy.PyObject): bool {.inline.} = return not (a in b) \ No newline at end of file +proc `notin`*[T](a: T, b: nimpy.PyObject): bool {.inline.} = return not (a in b) diff --git a/tablite/version.py b/tablite/version.py index 729223e0..6538d607 100644 --- a/tablite/version.py +++ b/tablite/version.py @@ -1,3 +1,3 @@ -major, minor, patch = 2023, 10, 15 +major, minor, patch = 2023, 10, 16 __version_info__ = (major, minor, patch) __version__ = ".".join(str(i) for i in __version_info__) diff --git a/tests/test_filter.nim b/tests/test_filter.nim new file mode 100644 index 00000000..a8d37f4c --- /dev/null +++ b/tests/test_filter.nim @@ -0,0 +1,131 @@ +import nimpy +import std/[unittest, tables, sugar] +import ../nimlite/funcs/filter +import ../nimlite/[pymodules, nimpyext] + + +proc valueFilter1*(): auto = + let m = modules() + let table = m.tablite.classes.TableClass!({ + "a": @[1, 2, 3, 4], + "b": @[10, 20, 30, 40], + "c": @[4, 4, 4, 4] + }.toTable) + + let pyExpressions = @[ + m.builtins.classes.DictClass!(column1: "a", criteria: ">=", value2: 2), + ] + + return filter(table, pyExpressions, "all", nil) + +proc valueFilter2*(): auto = + let m = modules() + + let table = m.tablite.classes.TableClass!({ + "a": @[1, 2, 3, 4], + "b": @[10, 20, 30, 40], + "c": @[4, 4, 4, 4] + }.toTable) + let pyExpressions = @[ + m.builtins.classes.DictClass!(column1: "a", criteria: ">=", value2: 2), + m.builtins.classes.DictClass!(column1: "a", criteria: "==", column2: "c"), + ] + + return filter(table, pyExpressions, "all", nil) + +proc valueFilter3*(): auto = + let m = modules() + + let table = m.tablite.classes.TableClass!({ + "a": @[1, 2, 3, 4], + "b": @[10, 20, 30, 40], + "c": @[4, 4, 4, 4] + }.toTable) + let pyExpressions = @[ + m.builtins.classes.DictClass!(column1: "a", criteria: ">=", value2: 2), + m.builtins.classes.DictClass!(column1: "a", criteria: "==", column2: "c"), + ] + + return filter(table, pyExpressions, "all", nil) + +proc valueFragmentation*(inpPagesSize: int, outPagesSize: int): auto = + let m = modules() + let Config = m.tablite.modules.config.classes.Config + + Config.PAGE_SIZE = inpPagesSize + + let table = m.tablite.classes.TableClass!({ + "a": @[1, 2, 3, 4], + "b": @[10, 20, 30, 40], + "c": @[4, 4, 4, 4] + }.toTable) + + Config.PAGE_SIZE = outPagesSize + + let pyExpressions = @[ + m.builtins.classes.DictClass!(column1: "a", criteria: ">=", value2: 2), + ] + + return filter(table, pyExpressions, "all", nil) + +proc valueFragmentationCheck(inpPagesSize: int, outPagesSize: int): void = + let m = modules() + let (tblPass, tblFail) = valueFragmentation(inpPagesSize, outPagesSize) + + check m.builtins.getLen(tblPass) == 3 + check m.builtins.getLen(tblFail) == 1 + + let vaPass = collect: (for v in tblPass["a"]: v.to(int)) + let vbPass = collect: (for v in tblPass["b"]: v.to(int)) + let vcPass = collect: (for v in tblPass["c"]: v.to(int)) + + let vaFail = collect: (for v in tblFail["a"]: v.to(int)) + let vbFail = collect: (for v in tblFail["b"]: v.to(int)) + let vcFail = collect: (for v in tblFail["c"]: v.to(int)) + + check vaPass == @[2, 3, 4] + check vbPass == @[20, 30, 40] + check vcPass == @[4, 4, 4] + + check vaFail == @[1] + check vbFail == @[10] + check vcFail == @[4] + +when not defined(DEV_BUILD): + test "value filter": + let m = modules() + let (tblPass, tblFail) = valueFilter1() + + check m.builtins.getLen(tblPass) == 3 + check m.builtins.getLen(tblFail) == 1 + + let vaPass = collect: (for v in tblPass["a"]: v.to(int)) + let vbPass = collect: (for v in tblPass["b"]: v.to(int)) + let vcPass = collect: (for v in tblPass["c"]: v.to(int)) + + let vaFail = collect: (for v in tblFail["a"]: v.to(int)) + let vbFail = collect: (for v in tblFail["b"]: v.to(int)) + let vcFail = collect: (for v in tblFail["c"]: v.to(int)) + + check vaPass == @[2, 3, 4] + check vbPass == @[20, 30, 40] + check vcPass == @[4, 4, 4] + + check vaFail == @[1] + check vbFail == @[10] + check vcFail == @[4] + + test "value filter fragmented 2->2": + valueFragmentationCheck(2, 2) + + test "value filter fragmented 2->1M": + valueFragmentationCheck(2, 1_000_000) + + test "value filter fragmented 1M->2": + valueFragmentationCheck(1_000_000, 2) + + test "value filter fragmented 3->2": + valueFragmentationCheck(3, 2) + + test "value filter fragmented 3->1M": + valueFragmentationCheck(3, 1_000_000) diff --git a/tests/test_imputation.nim b/tests/test_imputation.nim new file mode 100644 index 00000000..66fb2c3d --- /dev/null +++ b/tests/test_imputation.nim @@ -0,0 +1,115 @@ +import nimpy +import std/[unittest, sugar] +import ../nimlite/funcs/imputation +import ../nimlite/[pymodules, nimpyext, pytypes] + +test "imp1": + let columns = modules().builtins.classes.DictClass!() + + columns["A"] = @[nimValueToPy(0), nimValueToPy(1), nimValueToPy(nil), nimValueToPy(3), nimValueToPy(0)] + columns["B"] = @[nimValueToPy(4), nimValueToPy(5), nimValueToPy(6), nimValueToPy(7), nimValueToPy(4)] + + let table = modules().tablite.classes.TableClass!(columns) + let r = nearestNeighbourImputation(table, @["A", "B"], @[PY_ObjectND(PY_None)], @["A"]) + + let impA = collect: (for v in r["A"]: v.to(int)) + let impB = collect: (for v in r["B"]: v.to(int)) + + check len(impA) == 5 + check len(impB) == 5 + + check @[0, 1, 1, 3, 0] == impA + check @[4, 5, 6, 7, 4] == impB + +test "imp1 - multipage": + modules().tablite.modules.config.classes.Config.PAGE_SIZE = 1 + let columns = modules().builtins.classes.DictClass!() + + columns["A"] = @[nimValueToPy(0), nimValueToPy(1), nimValueToPy(nil), nimValueToPy(3), nimValueToPy(0)] + columns["B"] = @[nimValueToPy(4), nimValueToPy(5), nimValueToPy(6), nimValueToPy(7), nimValueToPy(4)] + + let table = modules().tablite.classes.TableClass!(columns) + let r = nearestNeighbourImputation(table, @["A", "B"], @[PY_ObjectND(PY_None)], @["A"]) + + let impA = collect: (for v in r["A"]: v.to(int)) + let impB = collect: (for v in r["B"]: v.to(int)) + + check len(impA) == 5 + check len(impB) == 5 + + check @[0, 1, 1, 3, 0] == impA + check @[4, 5, 6, 7, 4] == impB + +test "imp2": + let columns = modules().builtins.classes.DictClass!() + + columns["A"] = @[nimValueToPy(0), nimValueToPy(1), nimValueToPy(nil), nimValueToPy(3), nimValueToPy(0)] + columns["B"] = @[nimValueToPy("4"), nimValueToPy(5), nimValueToPy(6), nimValueToPy(7), nimValueToPy(4)] + + let table = modules().tablite.classes.TableClass!(columns) + let r = nearestNeighbourImputation(table, @["A", "B"], @[PY_ObjectND(PY_None)], @["A"]) + + let impA = collect: (for v in r["A"]: v.to(int)) + let impB = collect: (for v in r["B"]: v) + + check len(impA) == 5 + check len(impB) == 5 + + check @[0, 1, 3, 3, 0] == impA + check impB[0].to(string) == "4" + check impB[1].to(int) == 5 + check impB[2].to(int) == 6 + check impB[3].to(int) == 7 + check impB[4].to(int) == 4 + +test "imp3": + let columns = modules().builtins.classes.DictClass!() + + columns["a"] = @[1, 1, 5, 5, 6, 6] + columns["b"] = @[2, 2, 5, 5, 6, -1] + columns["c"] = @[nimValueToPy(3), nimValueToPy(nil), nimValueToPy(5), nimValueToPy("NULL"), nimValueToPy(6), nimValueToPy(6)] + + let table = modules().tablite.classes.TableClass!(columns) + let r = nearestNeighbourImputation(table, @["a", "b", "c"], @[PY_ObjectND(PY_None), newPY_Object("NULL"), newPY_Object(-1)], @["b", "c"]) + + let impA = collect: (for v in r["a"]: v.to(int)) + let impB = collect: (for v in r["b"]: v.to(int)) + let impC = collect: (for v in r["c"]: v.to(int)) + + check len(impA) == 6 + check len(impB) == 6 + check len(impC) == 6 + + check @[1, 1, 5, 5, 6, 6] == impA + check @[2, 2, 5, 5, 6, 6] == impB + check @[3, 3, 5, 5, 6, 6] == impC + +test "imp4": + let columns = modules().builtins.classes.DictClass!() + + columns["a"] = @[nimValueToPy(nil), nimValueToPy(1), nimValueToPy(2), nimValueToPy(3)] + columns["b"] = @[nimValueToPy(0), nimValueToPy(nil), nimValueToPy(2), nimValueToPy(3)] + columns["c"] = @[nimValueToPy(0), nimValueToPy(1), nimValueToPy(nil), nimValueToPy(3)] + columns["d"] = @[nimValueToPy(0), nimValueToPy(1), nimValueToPy(2), nimValueToPy(nil)] + + let table = modules().tablite.classes.TableClass!(columns) + let r = nearestNeighbourImputation(table, @["a", "b", "c", "d"], @[PY_ObjectND(PY_None)], @["a", "b", "c", "d"]) + + let impA = collect: (for v in r["a"]: v) + let impB = collect: (for v in r["b"]: v.to(int)) + let impC = collect: (for v in r["c"]: v.to(int)) + let impD = collect: (for v in r["d"]: v.to(int)) + + check len(impA) == 4 + check len(impB) == 4 + check len(impC) == 4 + check len(impD) == 4 + + check impA[0].isNone + check impA[1].to(int) == 1 + check impA[2].to(int) == 2 + check impA[3].to(int) == 3 + + check @[0, 0, 2, 3] == impB + check @[0, 1, 0, 3] == impC + check @[0, 1, 2, 0] == impD \ No newline at end of file diff --git a/tests/test_numpy.nim b/tests/test_numpy.nim new file mode 100644 index 00000000..96e4a24d --- /dev/null +++ b/tests/test_numpy.nim @@ -0,0 +1,59 @@ +import nimpy +import std/[unittest, tables, sugar, sequtils, times] +import ../nimlite/[pymodules, nimpyext, numpy, pytypes] + +test "scalar": + check readNumpy("tests/data/pages/scalar.npy").len == 110234 + +test "repaginate": + let m = modules() + let Config = m.tablite.modules.config.classes.Config + + Config.PAGE_SIZE = 2 + + let inpElems = @["1", "22", "333", "4444", "55555", "666666", "7777777"] + let columns = m.builtins.classes.DictClass!({"A": inpElems}.toTable) + let table = m.tablite.classes.TableClass!(columns = columns) + let pages = collect: (for p in table["A"].pages: m.toStr(p.path)) + + check pages.len == 4 + + Config.PAGE_SIZE = 1_000_000 + + let newPages = repaginate(pages) + + check newPages.len == 1 + + let page = UnicodeNDArray(readNumpy(m.toStr(newPages[0].path))) + let outElems = collect: (for i in 0..