Use numpy.datetime64 dtypes (#16)
At the cost of some speed (~3x slower than pandas now)
gives a good boost to memory efficiency.
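The memory win comes from storing timestamps as fixed-width `numpy.datetime64` values instead of arrays of Python `datetime` objects. A minimal illustration of the difference, in plain numpy (independent of this codebase):

```python
import sys
import datetime as dt
import numpy as np

n = 1_000
stamps = [dt.datetime(2021, 11, 7) + dt.timedelta(seconds=i) for i in range(n)]

# Object array: an 8-byte pointer per element, plus a full Python datetime
# object on the heap for each value
obj_arr = np.array(stamps, dtype=object)
obj_total = obj_arr.nbytes + sum(sys.getsizeof(s) for s in stamps)

# datetime64[us]: a flat buffer of 8-byte integers, no per-element Python objects
np_arr = np.array(stamps, dtype="datetime64[us]")

print(obj_total, np_arr.nbytes)  # the object representation is several times larger
```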
milesgranger authored Nov 7, 2021
1 parent 0f00054 commit ada1e1e
Showing 8 changed files with 172 additions and 323 deletions.
8 changes: 8 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions Cargo.toml
@@ -9,11 +9,13 @@ license = "Unlicense/MIT"
crate-type = ["staticlib"]

[dependencies]
bumpalo = "3.8.0"
uuid = "0.8.2"
serde_json = "1.0.68"
rust_decimal = { version = "1.16.0", features = ["db-postgres"] }
time = { version = "0.3.3", features = ["formatting"] }
postgres = { version = "0.19.1", features = ["with-serde_json-1", "with-time-0_3", "with-uuid-0_8"] }
postgres-protocol = "0.6.2"

[build-dependencies]
cbindgen = "^0.6.0"
36 changes: 20 additions & 16 deletions README.md
@@ -6,36 +6,37 @@
![PyPI - Wheel](https://img.shields.io/pypi/wheel/flaco)
[![Downloads](https://pepy.tech/badge/flaco/month)](https://pepy.tech/project/flaco)

-Perhaps the fastest and most memory efficient way to
+Perhaps the fastest* and most memory efficient way to
pull data from PostgreSQL into [pandas](https://pandas.pydata.org/)
and [numpy](https://numpy.org/doc/stable/index.html). 🚀

Have a gander at the initial [benchmarks](./benchmarks) 🏋

flaco tends to use nearly ~3-6x less memory than standard `pandas.read_sql`
-and about ~3x faster. However, it's probably 50x less stable at the moment. 😜
+and about ~2-3x faster. However, it's probably 50x less stable at the moment. 😜

To whet your appetite, here's a memory profile between flaco, [connectorx](https://github.com/sfu-db/connector-x)
and `pandas.read_sql` on a table with 1M rows with columns of various types.
(see [test_benchmarks.py](benchmarks/test_benchmarks.py)) *If the data,
-specifically integer types, has null values, you can expect a bit lower savings than the ~4x
-you see here; therefore (hot tip 🔥), supply fill values in your queries where possible via `coalesce`.
+specifically integer types, has null values, you can expect a bit lower savings than
+what you see here; therefore (hot tip 🔥), supply fill values in your queries
+where possible via `coalesce`.

```bash
Line #    Mem usage    Increment  Occurences   Line Contents
============================================================
-  118     98.3 MiB     98.3 MiB           1   @profile
+  118     97.9 MiB     97.9 MiB           1   @profile
   119                                         def memory_profile():
-  120     98.3 MiB      0.0 MiB           1       stmt = "select * from test_table"
-  121                                             # connectorx is a _very_ good alternative with better source support
-  122    363.0 MiB    264.6 MiB           1       _cx_df = cx.read_sql(DB_URI, stmt, return_type="pandas")
+  120     97.9 MiB      0.0 MiB           1       stmt = "select * from test_table"
+  121
+  122    354.9 MiB    257.0 MiB           1       _cx_df = cx.read_sql(DB_URI, stmt, return_type="pandas")
   123
-  124    363.0 MiB      0.0 MiB           1       with Database(DB_URI) as con:
-  125    622.6 MiB    259.6 MiB           1           data = read_sql(stmt, con, n_rows=1_000_000)
-  126    638.1 MiB     15.5 MiB           1       _flaco_df = pd.DataFrame(data, copy=False)
+  124    354.9 MiB      0.0 MiB           1       with Database(DB_URI) as con:
+  125    533.9 MiB    178.9 MiB           1           data = read_sql(stmt, con)
+  126    541.2 MiB      7.3 MiB           1       _flaco_df = pd.DataFrame(data, copy=False)
   127
-  128    642.8 MiB      4.7 MiB           1       engine = create_engine(DB_URI)
-  129   1798.3 MiB   1155.5 MiB           1       _pandas_df = pd.read_sql(stmt, engine)
+  128    545.3 MiB      4.1 MiB           1       engine = create_engine(DB_URI)
+  129   1680.9 MiB   1135.5 MiB           1       _pandas_df = pd.read_sql(stmt, engine)
```
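The `coalesce` hot tip above can be seen in plain pandas: NULLs force an integer column out of its compact representation, while a fill value keeps it int64. (Illustration only, with made-up values; this snippet doesn't use flaco itself.)

```python
import pandas as pd

# NULLs in an integer column: pandas falls back to float64, using NaN for NULL
with_nulls = pd.Series([1, None, 3])
print(with_nulls.dtype)   # float64

# The same column with NULLs filled on the SQL side,
# e.g. SELECT coalesce(some_int, 0) FROM test_table
filled = pd.Series([1, 0, 3])
print(filled.dtype)       # int64
```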

---
@@ -86,9 +87,9 @@
Connectorx is an _exceptionally_ impressive library, and more mature than flaco.
They have much wider support for a range of data sources, while flaco only
supports postgres for now.

-Performance wise, benchmarking seems to indicate flaco is more performant in most (all?)
-datatypes aside from temporal types (datetime, date, time), where Connectorx seems to
-perform better.
+Performance-wise, benchmarking seems to indicate flaco is generally more performant
+in terms of memory, but connectorx is faster when temporal data types (time, timestamp, etc.)
+are used. For pure numeric dtypes, flaco is faster and more memory efficient.

Connectorx [will make precheck queries](https://github.com/sfu-db/connector-x#how-does-connectorx-download-the-data)
to the source before starting to download data. Depending on your application,
@@ -100,6 +101,9 @@
Flaco will not run _any_ precheck queries. _However_, you can supply either
`n_rows` or `size_hint` to `flaco.io.read_sql` to give either an exact size or a
hint, reducing the number of allocations/resizings of arrays during data loading.

+**When in doubt, it's likely you should choose connectorx as it's more mature and
+offers great performance.**
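The effect of `n_rows`/`size_hint` can be modeled with a toy loader: without a hint, the output array has to be grown repeatedly as rows stream in. This is only an illustration of amortized array growth, not flaco's actual allocator.

```python
import numpy as np

def load(n_incoming, size_hint=0):
    """Toy loader: grow the buffer geometrically unless a size hint avoids it."""
    arr = np.empty(max(size_hint, 1), dtype=np.int64)
    resizes = 0
    for i in range(n_incoming):
        if i >= arr.shape[0]:                # out of room: reallocate at ~2x
            arr = np.resize(arr, arr.shape[0] * 2)
            resizes += 1
        arr[i] = i
    return arr[:n_incoming], resizes

_, blind = load(100_000)                      # no hint: repeated reallocations
_, hinted = load(100_000, size_hint=100_000)  # exact hint: no reallocations
print(blind, hinted)
```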

# Words of caution

While it's pretty neat this lib can allow faster and less resource
2 changes: 1 addition & 1 deletion benchmarks/test_benchmarks.py
@@ -122,7 +122,7 @@ def memory_profile():
    _cx_df = cx.read_sql(DB_URI, stmt, return_type="pandas")

    with Database(DB_URI) as con:
-        data = read_sql(stmt, con, n_rows=1_000_000)
+        data = read_sql(stmt, con)
        _flaco_df = pd.DataFrame(data, copy=False)

    engine = create_engine(DB_URI)
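The benchmark's final step wraps flaco's result — a `{column_name: ndarray}` mapping — in a DataFrame without eagerly copying every buffer. A sketch of that step with stand-in data (the column names here are hypothetical):

```python
import numpy as np
import pandas as pd

# read_sql returns a dict of column name -> numpy array;
# pd.DataFrame(..., copy=False) wraps the arrays rather than duplicating them up front
data = {
    "id": np.arange(3, dtype=np.int64),
    "created": np.array(["2021-11-05", "2021-11-06", "2021-11-07"], dtype="datetime64[D]"),
}
df = pd.DataFrame(data, copy=False)
print(df.dtypes)
```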
38 changes: 11 additions & 27 deletions flaco/flaco.h
@@ -15,36 +15,26 @@ typedef struct {
} BytesPtr;

typedef struct {
-  int32_t year;
-  uint8_t month;
-  uint8_t day;
+  /**
+   * The value represents the number of days since January 1st, 2000.
+   */
+  int32_t offset;
} DateInfo;

+typedef struct {
+  /**
+   * The value represents the number of microseconds since midnight, January 1st, 2000.
+   */
+  int64_t offset;
+} DateTimeInfo;

typedef struct {
  uint8_t hour;
  uint8_t minute;
  uint8_t second;
  uint32_t usecond;
} TimeInfo;

-typedef struct {
-  DateInfo date;
-  TimeInfo time;
-} DateTimeInfo;

-typedef struct {
-  int8_t hours;
-  int8_t minutes;
-  int8_t seconds;
-  bool is_positive;
-} TzInfo;

-typedef struct {
-  DateInfo date;
-  TimeInfo time;
-  TzInfo tz;
-} DateTimeTzInfo;

typedef struct {
char *ptr;
uint32_t len;
@@ -54,7 +44,6 @@ typedef enum {
  Bytes,
  Date,
  DateTime,
-  DateTimeTz,
  Time,
  Boolean,
  Decimal,
@@ -81,10 +70,6 @@ typedef struct {
  DateTimeInfo _0;
} DateTime_Body;

-typedef struct {
-  DateTimeTzInfo _0;
-} DateTimeTz_Body;

typedef struct {
  TimeInfo _0;
} Time_Body;
@@ -135,7 +120,6 @@ typedef struct {
  Bytes_Body bytes;
  Date_Body date;
  DateTime_Body date_time;
-  DateTimeTz_Body date_time_tz;
  Time_Body time;
  Boolean_Body boolean;
  Decimal_Body decimal;
24 changes: 3 additions & 21 deletions flaco/includes.pxd
@@ -16,9 +16,7 @@ cdef extern from "./flaco.h":
        np.uint32_t len

    ctypedef struct DateInfo:
-        np.int8_t year
-        np.uint8_t month
-        np.uint8_t day
+        np.int32_t offset

    ctypedef struct TimeInfo:
        np.uint8_t hour
@@ -27,26 +25,13 @@ cdef extern from "./flaco.h":
        np.uint32_t usecond

    ctypedef struct DateTimeInfo:
-        DateInfo date
-        TimeInfo time
-
-    ctypedef struct TzInfo:
-        np.int8_t hours
-        np.int8_t minutes
-        np.int8_t seconds
-        bool is_positive
-
-    ctypedef struct DateTimeTzInfo:
-        DateInfo date
-        TimeInfo time
-        TzInfo tz
+        np.int64_t offset

    ctypedef enum Data_Tag:
        Bytes
        Boolean
        Date
        DateTime
-        DateTimeTz
        Time
        Decimal
        Int8
@@ -73,9 +58,6 @@ cdef extern from "./flaco.h":
    ctypedef struct DateTime_Body:
        DateTimeInfo _0

-    ctypedef struct DateTimeTz_Body:
-        DateTimeTzInfo _0

    ctypedef struct Time_Body:
        TimeInfo _0

@@ -113,7 +95,6 @@ cdef extern from "./flaco.h":
        Boolean_Body boolean
        Date_Body date
        DateTime_Body date_time
-        DateTimeTz_Body date_time_tz
        Time_Body time
        Decimal_Body decimal
        Int8_Body int8
@@ -129,6 +110,7 @@ cdef extern from "./flaco.h":
    ctypedef np.uint32_t *RowIteratorPtr
    ctypedef char **RowColumnNamesArrayPtr
    ctypedef np.uint32_t *RowDataArrayPtr
+    ctypedef np.uint32_t *SessionPtr

    DatabasePtr db_create(char *uri_ptr)
    void db_connect(DatabasePtr ptr, Exception *exc)
74 changes: 29 additions & 45 deletions flaco/io.pyx
@@ -17,6 +17,10 @@ cdef extern from "Python.h":
    object PyUnicode_InternFromString(char *v)


+cdef:
+    np.int64_t DATE_01_JAN_2000 = np.datetime64('2000-01-01', 'D').astype(np.int64)
+    np.int64_t DATETIME_MID_NIGHT_01_JAN_2000 = np.datetime64('2000-01-01T00:00:00', 'us').astype(np.int64)

cpdef dict read_sql(str stmt, Database db, int n_rows=-1, int size_hint=-1):
    cdef:
        bytes stmt_bytes = stmt.encode("utf-8")
@@ -33,7 +37,13 @@ cpdef dict read_sql(str stmt, Database db, int n_rows=-1, int size_hint=-1):
        raise FlacoException(exc.decode())

    # Read first row
-    lib.next_row(&row_iterator, &row_data_array_ptr, &n_columns, &column_names, &exc)
+    lib.next_row(
+        &row_iterator,
+        &row_data_array_ptr,
+        &n_columns,
+        &column_names,
+        &exc
+    )
    if exc != NULL:
        raise FlacoException(exc.decode())

@@ -82,15 +92,20 @@

        row_idx += one

-        lib.next_row(&row_iterator, &row_data_array_ptr, &n_columns, &column_names, &exc)
+        lib.next_row(
+            &row_iterator,
+            &row_data_array_ptr,
+            &n_columns,
+            &column_names,
+            &exc
+        )
        if exc != NULL:
            raise FlacoException(exc.decode())

    # Ensure arrays are correct size; only if n_rows not set
    if _n_rows == -1 and current_array_len != row_idx:
        for i in range(0, n_columns):
            resize(output[i], row_idx)

    return {columns[i]: output[i] for i in range(n_columns)}

cdef int resize(np.ndarray arr, int len) except -1:
@@ -135,11 +150,9 @@ cdef np.ndarray array_init(lib.Data data, int len):
    elif data.tag == lib.Data_Tag.Null:
        array = np.empty(shape=len, dtype=object)
    elif data.tag == lib.Data_Tag.Date:
-        array = np.empty(shape=len, dtype=dt.date)
+        array = np.empty(shape=len, dtype="datetime64[D]")
    elif data.tag == lib.Data_Tag.DateTime:
-        array = np.empty(shape=len, dtype=dt.datetime)
-    elif data.tag == lib.Data_Tag.DateTimeTz:
-        array = np.empty(shape=len, dtype=dt.datetime)
+        array = np.empty(shape=len, dtype="datetime64[us]")
    elif data.tag == lib.Data_Tag.Time:
        array = np.empty(shape=len, dtype=dt.time)
    else:
@@ -148,10 +161,9 @@


cdef np.ndarray insert_data_into_array(lib.Data data, np.ndarray arr, int idx):
-    cdef np.ndarray[np.uint8_t, ndim=1] arr_bytes
-    cdef np.npy_intp intp
-    cdef dt.timedelta delta
-    cdef object tzinfo
+    cdef:
+        np.ndarray[np.uint8_t, ndim=1] arr_bytes
+        np.npy_intp intp

    if data.tag == lib.Data_Tag.Boolean:
        arr[idx] = data.boolean._0
@@ -186,43 +198,15 @@ cdef np.ndarray insert_data_into_array(lib.Data data, np.ndarray arr, int idx):
        free(data.string._0.ptr)

    elif data.tag == lib.Data_Tag.Date:
-        arr[idx] = dt.date_new(
-            data.date._0.year,
-            data.date._0.month,
-            data.date._0.day
+        arr[idx] = np.timedelta64(
+            DATE_01_JAN_2000 + data.date._0.offset,
+            'D'
        )

    elif data.tag == lib.Data_Tag.DateTime:
-        arr[idx] = dt.datetime_new(
-            data.date_time._0.date.year,
-            data.date_time._0.date.month,
-            data.date_time._0.date.day,
-            data.date_time._0.time.hour,
-            data.date_time._0.time.minute,
-            data.date_time._0.time.second,
-            data.date_time._0.time.usecond,
-            None
-        )
-
-    elif data.tag == lib.Data_Tag.DateTimeTz:
-        delta = dt.timedelta_new(
-            data.date_time_tz._0.tz.hours,
-            data.date_time_tz._0.tz.minutes,
-            data.date_time_tz._0.tz.seconds
-        )
-        if data.date_time_tz._0.tz.is_positive:
-            tzinfo = dt.timezone(delta)
-        else:
-            tzinfo = dt.timezone(-delta)
-        arr[idx] = dt.datetime_new(
-            data.date_time_tz._0.date.year,
-            data.date_time_tz._0.date.month,
-            data.date_time_tz._0.date.day,
-            data.date_time_tz._0.time.hour,
-            data.date_time_tz._0.time.minute,
-            data.date_time_tz._0.time.second,
-            data.date_time_tz._0.time.usecond,
-            tzinfo
+        arr[idx] = np.timedelta64(
+            DATETIME_MID_NIGHT_01_JAN_2000 + data.date_time._0.offset,
+            'us'
        )

    elif data.tag == lib.Data_Tag.Time:
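The arithmetic in the new `Date`/`DateTime` branches can be checked with plain numpy: the offsets count days (dates) or microseconds (timestamps) from 2000-01-01, while numpy's `datetime64` counts from the Unix epoch, so the code shifts each offset by a 2000-01-01 anchor. A standalone sketch of that conversion (using `np.datetime64` for clarity):

```python
import numpy as np

# Anchors used by the new code: 2000-01-01 expressed in numpy's internal
# units (days / microseconds since 1970-01-01)
DATE_01_JAN_2000 = np.datetime64("2000-01-01", "D").astype(np.int64)
DATETIME_MIDNIGHT_01_JAN_2000 = np.datetime64("2000-01-01T00:00:00", "us").astype(np.int64)

def date_from_offset(days_since_2000: int) -> np.datetime64:
    # Shift the offset onto the 1970 epoch, then reinterpret as a date
    return np.datetime64(int(DATE_01_JAN_2000) + days_since_2000, "D")

def datetime_from_offset(us_since_2000: int) -> np.datetime64:
    return np.datetime64(int(DATETIME_MIDNIGHT_01_JAN_2000) + us_since_2000, "us")

print(date_from_offset(0))                    # 2000-01-01
print(datetime_from_offset(86_400_000_000))   # one day of microseconds later
```

Note that, unlike the removed `DateTimeTz` branch, a bare `datetime64[us]` carries no timezone information.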