-
-
Notifications
You must be signed in to change notification settings - Fork 8.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Improve DMatrix creation performance in python
The xgboost python package serializes numpy arrays as JSON. This can take up a considerable amount of time in production workloads. This patch optimizes the specific case where the numpy array is already in "C" contiguous 32-bit floating point format and can be loaded directly without the JSON layer. This can improve performance by up to 35% in some cases, as can be seen in the microbenchmark added in xgboost/tests/python/microbench_numpy.py:

Rows     | Cols     | Threads      | Contiguous      | Non-contiguous  | Ratio
---------+----------+--------------+-----------------+-----------------+--------------
   15000 |      100 |            0 |         0.01686 |         0.01988 |        84.8%
   15000 |      100 |            1 |         0.02897 |         0.04424 |        65.5%
   15000 |      100 |            2 |         0.02579 |          0.0392 |        65.8%
   15000 |      100 |           10 |         0.01581 |         0.02058 |        76.8%
---------+----------+--------------+-----------------+-----------------+--------------
       2 |     2000 |            0 |        0.001055 |        0.001205 |        87.6%
       2 |     2000 |            1 |       0.0004465 |       0.0005689 |        78.5%
       2 |     2000 |            2 |       0.0004609 |        0.000615 |        74.9%
       2 |     2000 |           10 |       0.0005087 |       0.0005623 |        90.5%
---------+----------+--------------+-----------------+-----------------+--------------
- Loading branch information
1 parent
0c44067
commit 2e3adc3
Showing
3 changed files
with
130 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
import numpy as np | ||
import xgboost as xgb | ||
from collections import defaultdict | ||
import timeit | ||
|
||
|
||
def create_dmatrix(X, y, test_contiguous, nthread):
    """Build an ``xgb.DMatrix`` from ``X`` and ``y`` in the requested layout.

    When ``test_contiguous`` is true, both arrays are made C-contiguous and
    cast to ``float32``. Otherwise both are passed through
    ``np.asfortranarray`` with no dtype cast. In either case an assertion
    checks that ``X`` ended up in the expected layout.

    Note: NumPy reports arrays with a single row or a single column as both
    C- and F-contiguous, so the non-contiguous assertion would fail for such
    shapes.

    Returns the object produced by ``xgb.DMatrix(X, y, nthread=nthread)``.
    """
    if not test_contiguous:
        # Column-major copy: exercises the path that is not C-contiguous.
        X = np.asfortranarray(X)
        y = np.asfortranarray(y)
        assert not X.flags.c_contiguous
    else:
        # Row-major float32: the layout the commit describes as the fast path.
        X = np.ascontiguousarray(X).astype(np.float32)
        y = np.ascontiguousarray(y).astype(np.float32)
        assert X.flags.c_contiguous

    return xgb.DMatrix(X, y, nthread=nthread)
|
||
|
||
def benchmark_dmatrix_creation(test_contiguous, nthread, rows, cols):
    """Time a single ``create_dmatrix`` call on freshly generated random data.

    A ``rows x cols`` feature matrix and a length-``rows`` ``float32`` label
    vector are drawn from ``np.random.randn``. Only the ``create_dmatrix``
    call sits between the two timer readings; the round-trip checks on the
    resulting matrix run afterwards and are not part of the measurement.

    Returns the elapsed time as the difference of two
    ``timeit.default_timer()`` readings.
    """
    X = np.random.randn(rows, cols)
    y = np.random.randn(rows).astype(np.float32)

    t0 = timeit.default_timer()
    dm = create_dmatrix(X, y, test_contiguous, nthread)
    elapsed = timeit.default_timer() - t0

    # Verify that the DMatrix holds the data and labels it was built from.
    np.testing.assert_allclose(dm.get_data().toarray(), X, rtol=1e-7)
    np.testing.assert_array_equal(dm.get_label(), y)
    assert dm.num_row() == rows
    assert dm.num_col() == cols

    return elapsed
|
||
|
||
# NOTE: not used by the loop below, where each configuration carries its own
# repeat count. Kept so the module-level name stays available.
REPEATS = 10

print(
    f"{'Rows':8} | {'Cols':8} | {'Threads':12} | "
    f"{'Contiguous':15} | {'Non-contiguous':15} | {'Ratio':12}"
)

# Each entry is (rows, cols, repeats): the matrix shape to benchmark and the
# number of timed runs averaged for that shape.
for rows, cols, repeats in ((15000, 100, 10), (2, 2000, 200)):
    # Start from fresh accumulators for every configuration. Reusing the
    # averaged dicts from the previous configuration would add its averages
    # into this configuration's totals and skew the reported numbers.
    contiguous = defaultdict(float)
    noncontiguous = defaultdict(float)

    for nthread in (0, 1, 2, 10):
        for _ in range(repeats):
            contiguous[nthread] += benchmark_dmatrix_creation(
                test_contiguous=True, nthread=nthread, rows=rows, cols=cols
            )
            noncontiguous[nthread] += benchmark_dmatrix_creation(
                test_contiguous=False, nthread=nthread, rows=rows, cols=cols
            )

    # Convert the summed timings into per-run averages, keyed by thread count.
    contiguous = {k: v / repeats for k, v in contiguous.items()}
    noncontiguous = {k: v / repeats for k, v in noncontiguous.items()}

    for k in contiguous:
        print(
            f"{rows:8} | {cols:8} | {k:12} | "
            f"{contiguous[k]:15.4g} | {noncontiguous[k]:15.4g} | "
            f"{contiguous[k] / noncontiguous[k]:12.1%}"
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters