From 3ee46952cd4b8481bda7503c628dbbfe89baaa2c Mon Sep 17 00:00:00 2001
From: Ariel Eizenberg
Date: Mon, 10 Jun 2024 23:45:56 +0300
Subject: [PATCH] Improve DMatrix creation performance in Python

The xgboost Python package hands numpy arrays to the native library
through a JSON-serialized array interface. This has non-trivial overhead
for small datasets. This patch optimizes the specific case where the
numpy array is already in C-contiguous 32-bit floating-point format and
has rows * cols <= 32768, loading such arrays directly without the JSON
layer. An illustrative usage sketch follows the diff below.

Benchmark results from tests/python/microbench_numpy.py (Ratio is
optimized / current, so lower is better):

Threads |  Rows | Cols | Current (sec) | Optimized (sec) |  Ratio
      1 |     1 | 1000 |     0.0001921 |       0.0001703 |  88.6%
      1 |     4 | 1000 |     0.0001689 |       0.0001437 |  85.1%
      1 |    16 | 1000 |     0.0002639 |       0.0002457 |  93.1%
      1 |    64 | 1000 |     0.0006843 |       0.0006719 |  98.2%
      1 |   256 | 1000 |      0.002611 |        0.002655 | 101.7%
      1 |  1024 | 1000 |         0.013 |          0.0126 |  97.0%
      1 |  4096 | 1000 |       0.06081 |          0.0593 |  97.5%
      1 | 16384 | 1000 |        0.2981 |          0.2974 |  99.8%
      2 |     1 | 1000 |     0.0001415 |       0.0001196 |  84.6%
      2 |     4 | 1000 |     0.0002155 |       0.0002003 |  93.0%
      2 |    16 | 1000 |     0.0002137 |        0.000196 |  91.7%
      2 |    64 | 1000 |     0.0005054 |       0.0004855 |  96.1%
      2 |   256 | 1000 |      0.001613 |        0.001687 | 104.6%
      2 |  1024 | 1000 |      0.007743 |        0.008194 | 105.8%
      2 |  4096 | 1000 |       0.03791 |         0.03783 |  99.8%
      2 | 16384 | 1000 |        0.2077 |          0.2037 |  98.1%
      4 |     1 | 1000 |     0.0001374 |       0.0001237 |  90.0%
      4 |     4 | 1000 |     0.0001985 |       0.0001621 |  81.7%
      4 |    16 | 1000 |     0.0002266 |       0.0001988 |  87.7%
      4 |    64 | 1000 |     0.0005175 |       0.0004775 |  92.3%
      4 |   256 | 1000 |       0.00166 |        0.001594 |  96.0%
      4 |  1024 | 1000 |      0.008257 |        0.008097 |  98.1%
      4 |  4096 | 1000 |       0.03492 |          0.0354 | 101.4%
      4 | 16384 | 1000 |        0.1896 |          0.1897 | 100.0%
      8 |     1 | 1000 |     0.0001471 |       0.0001254 |  85.3%
      8 |     4 | 1000 |     0.0003609 |        0.000326 |  90.4%
      8 |    16 | 1000 |     0.0002651 |       0.0002217 |  83.6%
      8 |    64 | 1000 |     0.0003504 |       0.0003064 |  87.5%
      8 |   256 | 1000 |     0.0008264 |       0.0008729 | 105.6%
      8 |  1024 | 1000 |      0.003367 |        0.003127 |  92.9%
      8 |  4096 | 1000 |       0.01932 |         0.01799 |  93.1%
      8 | 16384 | 1000 |        0.1245 |          0.1208 |  97.0%
---
 python-package/xgboost/data.py   | 33 +++++++++++++------
 tests/python/microbench_numpy.py | 56 ++++++++++++++++++++++++++++++++
 tests/python/test_basic.py       | 41 +++++++++++++++++++++++
 3 files changed, 120 insertions(+), 10 deletions(-)
 create mode 100755 tests/python/microbench_numpy.py

diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py
index 7e0ae793ba6e..1865ddbb17af 100644
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@@ -252,17 +252,30 @@ def _from_numpy_array(
     _check_data_shape(data)
     data, _ = _ensure_np_dtype(data, data.dtype)
     handle = ctypes.c_void_p()
-    _check_call(
-        _LIB.XGDMatrixCreateFromDense(
-            _array_interface(data),
-            make_jcargs(
-                missing=float(missing),
-                nthread=int(nthread),
-                data_split_mode=int(data_split_mode),
-            ),
-            ctypes.byref(handle),
+    if isinstance(data, np.ndarray) and data.dtype == np.float32 and data.flags['C_CONTIGUOUS'] and data.size <= 32768:
+        _check_call(
+            _LIB.XGDMatrixCreateFromMat_omp(
+                data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
+                c_bst_ulong(data.shape[0]),
+                c_bst_ulong(data.shape[1]),
+                ctypes.c_float(missing),
+                ctypes.byref(handle),
+                ctypes.c_int(nthread),
+                ctypes.c_int(data_split_mode),
+            )
+        )
+    else:
+        _check_call(
+            _LIB.XGDMatrixCreateFromDense(
+                _array_interface(data),
+                make_jcargs(
+                    missing=float(missing),
+                    nthread=int(nthread),
+                    data_split_mode=int(data_split_mode),
+                ),
+                ctypes.byref(handle),
+            )
         )
-    )
     return handle, feature_names, feature_types
diff --git a/tests/python/microbench_numpy.py b/tests/python/microbench_numpy.py
new file mode 100755
index 000000000000..9e302b581fdc
--- /dev/null
+++ b/tests/python/microbench_numpy.py
@@ -0,0 +1,56 @@
+import numpy as np
+import xgboost as xgb
+from collections import defaultdict
+import timeit
+import ctypes
+from xgboost.core import _LIB, DataSplitMode
+from xgboost.data import _check_call, _array_interface, c_bst_ulong, make_jcargs
+
+def measure_create_dmatrix(rows, cols, nthread, use_optimization):
+    data = np.random.randn(rows, cols).astype(np.float32)
+    data = np.ascontiguousarray(data)
+
+    handle = ctypes.c_void_p()
+    missing = np.nan
+
+    start = timeit.default_timer()
+    if use_optimization:
+        _LIB.XGDMatrixCreateFromMat_omp(
+            data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
+            c_bst_ulong(data.shape[0]),
+            c_bst_ulong(data.shape[1]),
+            ctypes.c_float(missing),
+            ctypes.byref(handle),
+            ctypes.c_int(nthread),
+            ctypes.c_int(DataSplitMode.ROW),
+        )
+    else:
+        _LIB.XGDMatrixCreateFromDense(
+            _array_interface(data),
+            make_jcargs(
+                missing=float(missing),
+                nthread=int(nthread),
+                data_split_mode=int(DataSplitMode.ROW),
+            ),
+            ctypes.byref(handle),
+        )
+    end = timeit.default_timer()
+    return end - start
+
+COLS = 1000
+
+print(f"{'Threads':8} | {'Rows':8} | {'Cols':8} | {'Current (sec)':15} | {'Optimized (sec)':15} | {'Ratio':12}")
+
+for nthread in [1, 2, 4, 8]:
+    for rows in [1, 4, 16, 64, 256, 1024, 4096, 16384]:
+        repeats = 65536 // rows
+
+        current = 0
+        for i in range(repeats):
+            current += measure_create_dmatrix(rows=rows, cols=COLS, nthread=nthread, use_optimization=False)
+
+        optimized = 0
+        for i in range(repeats):
+            optimized += measure_create_dmatrix(rows=rows, cols=COLS, nthread=nthread, use_optimization=True)
+
+        print(f"{nthread:8} | {rows:8} | {COLS:8} | {current/repeats:15.4g} | {optimized/repeats:15.4g} | {optimized / current:12.1%}")
diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py
index cdc571a916df..7d2ab9c57b8d 100644
--- a/tests/python/test_basic.py
+++ b/tests/python/test_basic.py
@@ -210,6 +210,47 @@ def test_dmatrix_numpy_init_omp(self):
             assert dm.num_row() == row
             assert dm.num_col() == cols
 
+    def _test_dmatrix_numpy_init_omp_contiguous(self, test_contiguous: bool):
+        rows = [1000, 11326, 15000]
+        cols = 50
+        for row in rows:
+            X = np.random.randn(row, cols)
+            y = np.random.randn(row).astype("f")
+
+            # Ensure data is contiguous
+            if test_contiguous:
+                X = np.ascontiguousarray(X).astype(np.float32)
+                y = np.ascontiguousarray(y).astype(np.float32)
+                assert X.flags['C_CONTIGUOUS']
+            else:
+                X = np.asfortranarray(X)
+                y = np.asfortranarray(y)
+                assert not X.flags['C_CONTIGUOUS']
+
+            dm = xgb.DMatrix(X, y, nthread=0)
+            np.testing.assert_allclose(dm.get_data().toarray(), X, rtol=1e-7)
+            np.testing.assert_array_equal(dm.get_label(), y)
+            assert dm.num_row() == row
+            assert dm.num_col() == cols
+
+            dm = xgb.DMatrix(X, y, nthread=1)
+            np.testing.assert_allclose(dm.get_data().toarray(), X, rtol=1e-7)
+            np.testing.assert_array_equal(dm.get_label(), y)
+            assert dm.num_row() == row
+            assert dm.num_col() == cols
+
+            dm = xgb.DMatrix(X, y, nthread=10)
+            np.testing.assert_allclose(dm.get_data().toarray(), X, rtol=1e-7)
+            np.testing.assert_array_equal(dm.get_label(), y)
+            assert dm.num_row() == row
+            assert dm.num_col() == cols
+
+    def test_dmatrix_numpy_init_omp_contiguous(self):
+        return self._test_dmatrix_numpy_init_omp_contiguous(True)
+
+    def test_dmatrix_numpy_init_omp_not_contiguous(self):
+        return self._test_dmatrix_numpy_init_omp_contiguous(False)
+
     def test_cv(self):
         dm, _ = tm.load_agaricus(__file__)
         params = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
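
Illustrative usage sketch: a minimal example of which inputs would take
the new direct path versus the existing JSON-based path under this
patch. The dispatch happens inside _from_numpy_array(), so user code
does not change; the shapes and variable names below are arbitrary.

    import numpy as np
    import xgboost as xgb

    # Small, C-contiguous float32 matrix: 16 * 100 = 1600 <= 32768 elements,
    # so with this patch DMatrix creation calls XGDMatrixCreateFromMat_omp
    # directly instead of the JSON-based XGDMatrixCreateFromDense.
    X_small = np.ascontiguousarray(np.random.randn(16, 100).astype(np.float32))
    dm_fast = xgb.DMatrix(X_small, label=np.random.randn(16).astype(np.float32))

    # float64 input with more than 32768 elements: falls back to the
    # existing JSON-based path, so behaviour there is unchanged.
    X_large = np.random.randn(1024, 100)
    dm_slow = xgb.DMatrix(X_large, label=np.random.randn(1024))

    print(dm_fast.num_row(), dm_fast.num_col())
    print(dm_slow.num_row(), dm_slow.num_col())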