diff --git a/playground.ipynb b/playground.ipynb deleted file mode 100644 index f398ba1..0000000 --- a/playground.ipynb +++ /dev/null @@ -1,215 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import duckdb\n", - "import pandas as pd\n", - "import pyarrow.parquet as pq\n", - "from itertools import islice, tee\n", - "\n", - "# from src.core import Data, Reader" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [], - "source": [ - "d = \"/home/anadirov/Documents/big.parquet\"\n", - "d2 = \"/home/anadirov/Documents/local_dwh/mp_all_offers_prod.parquet\"\n", - "# pd_df = pd.read_parquet(d)\n", - "\n", - "# q = f\"\"\"\n", - "# create table data as select * from \"{d}\"\n", - "# \"\"\".strip()" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [], - "source": [ - "df = duckdb.read_parquet(d)\n", - "df2 = duckdb.read_parquet(d2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# parquet_file = pq.ParquetFile(d)\n", - "# gen = parquet_file.iter_batches(batch_size=1000)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def nth_item(generator, n):\n", - " return next(islice(generator, n, n+1), None)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def count_items(generator):\n", - " return sum(1 for _ in generator)\n", - "\n", - "def count_and_copy(generator):\n", - " gen_copy, gen_count = tee(generator)\n", - " count = sum(1 for _ in gen_count)\n", - " return gen_copy, count\n", - "\n", - "\n", - "# count_and_copy(gen)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# r = df.query(\"data\", \"select * from data limit 10000\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# bats = r.to_arrow_table().to_batches(10)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# big = Data(path = d, \n", - "# virtual_table_name = \"data\", \n", - "# query = \"select item_id, __index_level_0__ from data order by __index_level_0__\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [], - "source": [ - "from io import StringIO\n", - "import sys, os\n", - "\n", - "def render_df_info(duckdf: duckdb.DuckDBPyRelation) -> str:\n", - " \"\"\" returns md like formatted df.info \"\"\"\n", - " shape = duckdf.shape\n", - "\n", - " output_buffer = StringIO()\n", - " sys.stdout = output_buffer\n", - " duckdf.describe().show(max_width=10**19)\n", - " sys.stdout = sys.__stdout__\n", - " descr = output_buffer.getvalue()\n", - "\n", - " h = f\"### Rows: {shape[0]}, Columns: {shape[1]}\\n{'-'*50}\\n\"\n", - " try:\n", - " lines = descr.strip().split(\"\\n\")\n", - " headers = lines[1].strip(\"│\").split(\"│\")\n", - " headers = [header.strip() for header in headers]\n", - " types = lines[2].strip(\"│\").split(\"│\")\n", - " types = [t.strip() for t in types]\n", - " data_lines = lines[4:-1]\n", - " data = []\n", - " for line in data_lines:\n", - " row = line.strip(\"│\").split(\"│\")\n", - " row = [item.strip() for item in row]\n", - " data.append(row)\n", - "\n", - " # Build the markdown table\n", - " markdown_table = \"| \" + \" | \".join(headers) + \" |\\n\"\n", - " markdown_table += \"|-\" + \"-|-\".join([\"-\" * len(header) for header in headers]) + \"-|\\n\"\n", - " markdown_table += \"| \" + \" | \".join(types) + \" |\\n\"\n", - " for row in data:\n", - " markdown_table += \"| \" + \" | \".join(row) + \" |\\n\"\n", - "\n", - " return h + markdown_table\n", - " \n", - " except Exception as e:\n", - " return h\n", - "\n", - "\n", - "r = render_df_info(df2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}