diff --git a/Makefile b/Makefile
index 43ab37031..49df0691e 100644
--- a/Makefile
+++ b/Makefile
@@ -4,7 +4,7 @@ REGRESSION_SIZE = 1000000x50
 KMEANS_SAMPLES = 1000000
 KMEANS_FEATURES = 50
 KMEANS_SIZE = $(KMEANS_SAMPLES)x$(KMEANS_FEATURES)
-SVM_SAMPLES = 100000
+SVM_SAMPLES = 10000
 SVM_FEATURES = 100
 SVM_SIZE = $(SVM_SAMPLES)x$(SVM_FEATURES)
 LOGREG_SAMPLES = 100000
@@ -26,10 +26,16 @@ HOST = $(shell hostname)
 # This makes the makefile exit on failed benchmarks. We pipe the
 # benchmark outputs to "tee", which results in unexpected successes.
 SHELL = bash -o pipefail
+export USE_DAAL4PY_SKLEARN = yes
 
 # Other options
 NUM_THREADS = -1
 SVM_NUM_THREADS = 0
+
+ifeq ($(NUM_THREADS),1)
+  OMP_NUM_THREADS = $(NUM_THREADS)
+  export OMP_NUM_THREADS
+endif
 LOGREG_NUM_THREADS = $(SVM_NUM_THREADS)
 DFCLF_NUM_THREADS = $(SVM_NUM_THREADS)
 DFREG_NUM_THREADS = $(SVM_NUM_THREADS)
@@ -80,10 +86,12 @@ ARGS_NATIVE_kmeans = --num-threads "$(NUM_THREADS)" --header \
 	--filei data/clustering/kmeans_$(KMEANS_SIZE).init.npy
 ARGS_NATIVE_svm2 = --fileX data/two/X-$(SVM_SIZE).npy \
 	--fileY data/two/y-$(SVM_SIZE).npy \
-	--num-threads $(SVM_NUM_THREADS) --header
+	--num-threads $(SVM_NUM_THREADS) --header \
+	--kernel rbf
 ARGS_NATIVE_svm5 = --fileX data/multi/X-$(SVM_SIZE).npy \
 	--fileY data/multi/y-$(SVM_SIZE).npy \
-	--num-threads $(SVM_NUM_THREADS) --header
+	--num-threads $(SVM_NUM_THREADS) --header \
+	--kernel rbf
 ARGS_NATIVE_logreg2 = --fileX data/two/X-$(LOGREG_SIZE).npy \
 	--fileY data/two/y-$(LOGREG_SIZE).npy \
 	--num-threads $(LOGREG_NUM_THREADS) --header
@@ -106,6 +114,7 @@ SKLEARN_linear = linear
 SKLEARN_pca_full = pca
 SKLEARN_pca_daal = pca
 SKLEARN_kmeans = kmeans
+SKLEARN_dbscan = dbscan
 SKLEARN_svm2 = svm
 SKLEARN_svm5 = svm
 SKLEARN_logreg2 = log_reg
@@ -122,10 +131,13 @@ ARGS_SKLEARN_pca_full = --size "$(REGRESSION_SIZE)" --svd-solver full
 ARGS_SKLEARN_kmeans = --data-multiplier "$(MULTIPLIER)" \
 	--filex data/clustering/kmeans_$(KMEANS_SIZE).npy \
 	--filei data/clustering/kmeans_$(KMEANS_SIZE).init.npy
+ARGS_SKLEARN_dbscan = --filex data/clustering/kmeans_$(KMEANS_SIZE).npy
 ARGS_SKLEARN_svm2 = --fileX data/two/X-$(SVM_SIZE).npy \
-	--fileY data/two/y-$(SVM_SIZE).npy
+	--fileY data/two/y-$(SVM_SIZE).npy \
+	--kernel rbf
 ARGS_SKLEARN_svm5 = --fileX data/multi/X-$(SVM_SIZE).npy \
-	--fileY data/multi/y-$(SVM_SIZE).npy
+	--fileY data/multi/y-$(SVM_SIZE).npy \
+	--kernel rbf
 ARGS_SKLEARN_logreg2 = --fileX data/two/X-$(LOGREG_SIZE).npy \
 	--fileY data/two/y-$(LOGREG_SIZE).npy
 ARGS_SKLEARN_logreg5 = --fileX data/multi/X-$(LOGREG_SIZE).npy \
@@ -196,7 +208,7 @@ output/native/%.out: | DATA_% output/native/
 	native/bin/$(NATIVE_$*) $(ARGS_NATIVE_$*) | tee $@
 
 output/sklearn/%.out: | DATA_% output/sklearn/
-	python sklearn/$(SKLEARN_$*).py $(COMMON_ARGS) $(ARGS_SKLEARN_$*) | tee $@
+	taskset -c 28-56 python sklearn/$(SKLEARN_$*).py $(COMMON_ARGS) $(ARGS_SKLEARN_$*) | tee $@
 
 output/daal4py/%.out: | DATA_% output/daal4py/
 	python daal4py/$(DAAL4PY_$*).py $(COMMON_ARGS) $(ARGS_DAAL4PY_$*) | tee $@
diff --git a/sklearn/df_clsf.py b/sklearn/df_clsf.py
index 0521a06d5..feeb0e9a8 100644
--- a/sklearn/df_clsf.py
+++ b/sklearn/df_clsf.py
@@ -29,7 +29,8 @@
                          'sklearn.ensemble.RandomForestClassifier')
 parser.add_argument('--seed', type=int, default=12345,
                     help='Seed to pass as random_state to the class')
-params = parse_args(parser, loop_types=('fit', 'predict'))
+params = parse_args(parser, loop_types=('fit', 'predict'),
+                    n_jobs_supported=True)
 
 # Get some RandomForestClassifier
 if params.use_sklearn_class:
diff --git a/sklearn/df_regr.py b/sklearn/df_regr.py
index 2e27b9844..facc11236 100644
--- a/sklearn/df_regr.py
+++ b/sklearn/df_regr.py
@@ -27,7 +27,8 @@
                          'sklearn.ensemble.RandomForestRegressor')
 parser.add_argument('--seed', type=int, default=12345,
                     help='Seed to pass as random_state to the class')
-params = parse_args(parser, loop_types=('fit', 'predict'))
+params = parse_args(parser, loop_types=('fit', 'predict'),
+                    n_jobs_supported=True)
 
 # Get some RandomForestRegressor
 if params.use_sklearn_class:
diff --git a/sklearn/distances.py b/sklearn/distances.py
index bcba78719..2d99a38ee 100644
--- a/sklearn/distances.py
+++ b/sklearn/distances.py
@@ -11,7 +11,8 @@
                          'benchmark')
 parser.add_argument('--metrics', nargs='*', default=['cosine', 'correlation'],
                     help='Metrics to test for pairwise_distances')
-params = parse_args(parser, size=(1000, 150000), dtypes=('f8', 'f4'))
+params = parse_args(parser, size=(1000, 150000), dtypes=('f8', 'f4'),
+                    n_jobs_supported=True)
 
 # Generate random data
 X = np.random.rand(*params.shape).astype(params.dtype)
diff --git a/sklearn/kmeans.py b/sklearn/kmeans.py
index 4776f86fa..e71acc0ed 100644
--- a/sklearn/kmeans.py
+++ b/sklearn/kmeans.py
@@ -18,7 +18,8 @@
                     type=int, help='Data multiplier')
 parser.add_argument('--maxiter', type=int, default=100,
                     help='Maximum number of iterations')
-params = parse_args(parser, loop_types=('fit', 'predict'))
+params = parse_args(parser, loop_types=('fit', 'predict'),
+                    n_jobs_supported=True)
 
 # Load generated data
 X = np.load(params.filex)
diff --git a/sklearn/linear.py b/sklearn/linear.py
index f84ec4481..682a87607 100644
--- a/sklearn/linear.py
+++ b/sklearn/linear.py
@@ -13,7 +13,7 @@
                     action='store_false',
                     help="Don't fit intercept (assume data already centered)")
 params = parse_args(parser, size=(1000000, 50), dtypes=('f8', 'f4'),
-                    loop_types=('fit', 'predict'))
+                    loop_types=('fit', 'predict'), n_jobs_supported=True)
 
 # Generate random data
 X = np.random.rand(*params.shape).astype(params.dtype)
diff --git a/sklearn/log_reg.py b/sklearn/log_reg.py
index 80dfa64be..4c002d0ba 100644
--- a/sklearn/log_reg.py
+++ b/sklearn/log_reg.py
@@ -35,7 +35,10 @@
                     help="Tolerance for solver. If solver == 'newton-cg', "
                          "then the default is 1e-3. Otherwise, the default "
                          "is 1e-10.")
-params = parse_args(parser, loop_types=('fit', 'predict'))
+parser.add_argument('--tile', type=int, default=100,
+                    help="Create (tile)x size dataset")
+params = parse_args(parser, loop_types=('fit', 'predict'),
+                    n_jobs_supported=True)
 
 # Load generated data
 X = np.load(params.filex.name)
@@ -71,6 +74,10 @@
                              verbose=params.verbose)
 print_row(columns, params, function='LogReg.fit', time=fit_time)
 
+X = np.tile(X, (params.tile, 1))
+y = np.tile(y, params.tile)
+params.size = size_str(X.shape)
+
 predict_time, y_pred = time_mean_min(clf.predict, X,
                                      outer_loops=params.predict_outer_loops,
                                      inner_loops=params.predict_inner_loops,
diff --git a/sklearn/svm.py b/sklearn/svm.py
index f5fbbe8ab..6cb9f55b1 100644
--- a/sklearn/svm.py
+++ b/sklearn/svm.py
@@ -5,7 +5,7 @@
 import argparse
 from bench import parse_args, time_mean_min, print_header, print_row, size_str
 import numpy as np
-from sklearn.svm import SVC
+from sklearn.svm import SVC, LinearSVC
 from sklearn.metrics import accuracy_score
 
 
@@ -71,9 +71,15 @@ def get_optimal_cache_size(n_features, dtype=np.double, max_cache=64):
 params.n_classes = len(np.unique(y))
 
 # Create our C-SVM classifier
-clf = SVC(C=params.C, kernel=params.kernel, max_iter=params.maxiter,
-          cache_size=params.cache_size_mb, tol=params.tol,
-          shrinking=params.shrinking, gamma=params.gamma)
+if params.using_daal or params.kernel != 'linear':
+    clf = SVC(C=params.C, kernel=params.kernel, max_iter=params.maxiter,
+              cache_size=params.cache_size_mb, tol=params.tol,
+              shrinking=params.shrinking, gamma=params.gamma)
+else:
+    clf = LinearSVC(C=params.C, max_iter=params.maxiter, tol=params.tol)
+
+if params.verbose:
+    print(clf)
 
 columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size',
            'kernel', 'cache_size_mb', 'C', 'sv_len', 'n_classes', 'accuracy',
@@ -90,7 +96,10 @@ def get_optimal_cache_size(n_features, dtype=np.double, max_cache=64):
                             goal_outer_loops=params.fit_goal,
                             time_limit=params.fit_time_limit,
                             verbose=params.verbose)
-params.sv_len = clf.support_.shape[0]
+if hasattr(clf, 'support_'):
+    params.sv_len = clf.support_.shape[0]
+else:
+    params.sv_len = '?'
 print_row(columns, params, function='SVM.fit', time=fit_time)
 
 predict_time, y_pred = time_mean_min(clf.predict, X,