Improve DMatrix creation performance in python

The xgboost Python package serializes numpy arrays as JSON. This has non-trivial overhead for small datasets. This patch optimizes the specific case where the numpy array is already in "C" contiguous 32-bit floating point format and has rows*cols<=32768, and loads it directly without the JSON layer.

xgboost/tests/python/microbench_numpy.py:

Threads | Rows | Cols | Current (sec) | Optimized (sec) | Ratio
1 | 1 | 1000 | 0.0001921 | 0.0001703 | 88.6%
1 | 4 | 1000 | 0.0001689 | 0.0001437 | 85.1%
1 | 16 | 1000 | 0.0002639 | 0.0002457 | 93.1%
1 | 64 | 1000 | 0.0006843 | 0.0006719 | 98.2%
1 | 256 | 1000 | 0.002611 | 0.002655 | 101.7%
1 | 1024 | 1000 | 0.013 | 0.0126 | 97.0%
1 | 4096 | 1000 | 0.06081 | 0.0593 | 97.5%
1 | 16384 | 1000 | 0.2981 | 0.2974 | 99.8%
2 | 1 | 1000 | 0.0001415 | 0.0001196 | 84.6%
2 | 4 | 1000 | 0.0002155 | 0.0002003 | 93.0%
2 | 16 | 1000 | 0.0002137 | 0.000196 | 91.7%
2 | 64 | 1000 | 0.0005054 | 0.0004855 | 96.1%
2 | 256 | 1000 | 0.001613 | 0.001687 | 104.6%
2 | 1024 | 1000 | 0.007743 | 0.008194 | 105.8%
2 | 4096 | 1000 | 0.03791 | 0.03783 | 99.8%
2 | 16384 | 1000 | 0.2077 | 0.2037 | 98.1%
4 | 1 | 1000 | 0.0001374 | 0.0001237 | 90.0%
4 | 4 | 1000 | 0.0001985 | 0.0001621 | 81.7%
4 | 16 | 1000 | 0.0002266 | 0.0001988 | 87.7%
4 | 64 | 1000 | 0.0005175 | 0.0004775 | 92.3%
4 | 256 | 1000 | 0.00166 | 0.001594 | 96.0%
4 | 1024 | 1000 | 0.008257 | 0.008097 | 98.1%
4 | 4096 | 1000 | 0.03492 | 0.0354 | 101.4%
4 | 16384 | 1000 | 0.1896 | 0.1897 | 100.0%
8 | 1 | 1000 | 0.0001471 | 0.0001254 | 85.3%
8 | 4 | 1000 | 0.0003609 | 0.000326 | 90.4%
8 | 16 | 1000 | 0.0002651 | 0.0002217 | 83.6%
8 | 64 | 1000 | 0.0003504 | 0.0003064 | 87.5%
8 | 256 | 1000 | 0.0008264 | 0.0008729 | 105.6%
8 | 1024 | 1000 | 0.003367 | 0.003127 | 92.9%
8 | 4096 | 1000 | 0.01932 | 0.01799 | 93.1%
8 | 16384 | 1000 | 0.1245 | 0.1208 | 97.0%
dmlc · Jun 11, 2024 · cdac476 · cdac476
1 parent 0c44067
commit cdac476
Show file tree

Hide file tree

Showing 3 changed files with 120 additions and 10 deletions.
diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py
@@ -252,17 +252,30 @@ def _from_numpy_array(
     _check_data_shape(data)
     data, _ = _ensure_np_dtype(data, data.dtype)
     handle = ctypes.c_void_p()
-    _check_call(
-        _LIB.XGDMatrixCreateFromDense(
-            _array_interface(data),
-            make_jcargs(
-                missing=float(missing),
-                nthread=int(nthread),
-                data_split_mode=int(data_split_mode),
-            ),
-            ctypes.byref(handle),
+    if isinstance(data, np.ndarray) and data.dtype == np.float32 and data.flags['C_CONTIGUOUS'] and data.size <= 32768:
+        _check_call(
+            _LIB.XGDMatrixCreateFromMat_omp(
+                data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
+                c_bst_ulong(data.shape[0]),
+                c_bst_ulong(data.shape[1]),
+                ctypes.c_float(float(missing)),
+                ctypes.byref(handle),
+                ctypes.c_int(nthread),
+                ctypes.c_int(data_split_mode),
+            )
+        )
+    else:
+        _check_call(
+            _LIB.XGDMatrixCreateFromDense(
+                _array_interface(data),
+                make_jcargs(
+                    missing=float(missing),
+                    nthread=int(nthread),
+                    data_split_mode=int(data_split_mode),
+                ),
+                ctypes.byref(handle),
+            )
         )
-    )
     return handle, feature_names, feature_types
 
 

diff --git a/tests/python/microbench_numpy.py b/tests/python/microbench_numpy.py
@@ -0,0 +1,56 @@
+import numpy as np
+import xgboost as xgb
+from collections import defaultdict
+import timeit
+import ctypes
+from xgboost.core import _LIB, DataSplitMode
+from xgboost.data import _check_call, _array_interface, c_bst_ulong, make_jcargs
+
+def measure_create_dmatrix(rows, cols, nthread, use_optimization):
+    data =  np.random.randn(rows, cols).astype(np.float32)
+    data = np.ascontiguousarray(data)
+
+    handle = ctypes.c_void_p()
+    missing = np.nan
+
+    start = timeit.default_timer()
+    if use_optimization:
+        _LIB.XGDMatrixCreateFromMat_omp(
+            data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
+            c_bst_ulong(data.shape[0]),
+            c_bst_ulong(data.shape[1]),
+            ctypes.c_float(missing),
+            ctypes.byref(handle),
+            ctypes.c_int(nthread),
+            ctypes.c_int(DataSplitMode.ROW),
+        )
+    else:
+        _LIB.XGDMatrixCreateFromDense(
+            _array_interface(data),
+            make_jcargs(
+                missing=float(missing),
+                nthread=int(nthread),
+                data_split_mode=int(DataSplitMode.ROW),
+            ),
+            ctypes.byref(handle),
+        )
+    end = timeit.default_timer()
+    return end - start
+
+COLS = 1000
+
+print(f"{'Threads':8} | {'Rows':8} | {'Cols':8} | {'Current (sec)':15} | {'Optimized (sec)':15} | {'Ratio':12}")
+
+for nthread in [1, 2, 4, 8]:
+    for rows in [1, 4, 16, 64, 256, 1024, 4096, 16384]:
+        repeats = 65536 // rows
+
+        current = 0
+        for i in range(repeats):
+            current += measure_create_dmatrix(rows=rows, cols=COLS, nthread=nthread, use_optimization=False)
+
+        optimized = 0
+        for i in range(repeats):
+            optimized += measure_create_dmatrix(rows=rows, cols=COLS, nthread=nthread, use_optimization=True)
+
+        print(f"{nthread:8} | {rows:8} | {COLS:8} | {current/repeats:15.4g} | {optimized/repeats:15.4g} | {optimized / current:12.1%}")
diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py
@@ -210,6 +210,47 @@ def test_dmatrix_numpy_init_omp(self):
             assert dm.num_row() == row
             assert dm.num_col() == cols
 
+    def _test_dmatrix_numpy_init_omp_contiguous(self, test_contiguous: bool):
+        rows = [1000, 11326, 15000]
+        cols = 50
+        for row in rows:
+            X = np.random.randn(row, cols)
+            y = np.random.randn(row).astype("f")
+
+            # Ensure data is contiguous
+            if test_contiguous:
+                X = np.ascontiguousarray(X).astype(np.float32)
+                y = np.ascontiguousarray(y).astype(np.float32)
+                assert X.flags['C_CONTIGUOUS']
+            else:
+                X = np.asfortranarray(X)
+                y = np.asfortranarray(y)
+                assert not X.flags['C_CONTIGUOUS']
+
+            dm = xgb.DMatrix(X, y, nthread=0)
+            np.testing.assert_allclose(dm.get_data().toarray(), X, rtol=1e-7)
+            np.testing.assert_array_equal(dm.get_label(), y)
+            assert dm.num_row() == row
+            assert dm.num_col() == cols
+
+            dm = xgb.DMatrix(X, y, nthread=1)
+            np.testing.assert_allclose(dm.get_data().toarray(), X, rtol=1e-7)
+            np.testing.assert_array_equal(dm.get_label(), y)
+            assert dm.num_row() == row
+            assert dm.num_col() == cols
+
+            dm = xgb.DMatrix(X, y, nthread=10)
+            np.testing.assert_allclose(dm.get_data().toarray(), X, rtol=1e-7)
+            np.testing.assert_array_equal(dm.get_label(), y)
+            assert dm.num_row() == row
+            assert dm.num_col() == cols
+
+    def test_dmatrix_numpy_init_omp_contiguous(self):
+        return self._test_dmatrix_numpy_init_omp_contiguous(True)
+
+    def test_dmatrix_numpy_init_omp_not_contiguous(self):
+        return self._test_dmatrix_numpy_init_omp_contiguous(False)
+
     def test_cv(self):
         dm, _ = tm.load_agaricus(__file__)
         params = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}