Support n_jobs, other changes #21

Status: Closed · wants to merge 2 commits
24 changes: 18 additions & 6 deletions Makefile
@@ -4,7 +4,7 @@ REGRESSION_SIZE = 1000000x50
 KMEANS_SAMPLES = 1000000
 KMEANS_FEATURES = 50
 KMEANS_SIZE = $(KMEANS_SAMPLES)x$(KMEANS_FEATURES)
-SVM_SAMPLES = 100000
+SVM_SAMPLES = 10000
 SVM_FEATURES = 100
 SVM_SIZE = $(SVM_SAMPLES)x$(SVM_FEATURES)
 LOGREG_SAMPLES = 100000
@@ -26,10 +26,16 @@ HOST = $(shell hostname)
 # This makes the makefile exit on failed benchmarks. We pipe the
 # benchmark outputs to "tee", which results in unexpected successes.
 SHELL = bash -o pipefail
+export USE_DAAL4PY_SKLEARN = yes
 
 # Other options
 NUM_THREADS = -1
 SVM_NUM_THREADS = 0
+
+ifeq ($(NUM_THREADS),1)
+OMP_NUM_THREADS = $(NUM_THREADS)
+export OMP_NUM_THREADS
+endif
 LOGREG_NUM_THREADS = $(SVM_NUM_THREADS)
 DFCLF_NUM_THREADS = $(SVM_NUM_THREADS)
 DFREG_NUM_THREADS = $(SVM_NUM_THREADS)
@@ -80,10 +86,12 @@ ARGS_NATIVE_kmeans = --num-threads "$(NUM_THREADS)" --header \
 	--filei data/clustering/kmeans_$(KMEANS_SIZE).init.npy
 ARGS_NATIVE_svm2 = --fileX data/two/X-$(SVM_SIZE).npy \
 	--fileY data/two/y-$(SVM_SIZE).npy \
-	--num-threads $(SVM_NUM_THREADS) --header
+	--num-threads $(SVM_NUM_THREADS) --header \
+	--kernel rbf
 ARGS_NATIVE_svm5 = --fileX data/multi/X-$(SVM_SIZE).npy \
 	--fileY data/multi/y-$(SVM_SIZE).npy \
-	--num-threads $(SVM_NUM_THREADS) --header
+	--num-threads $(SVM_NUM_THREADS) --header \
+	--kernel rbf
 ARGS_NATIVE_logreg2 = --fileX data/two/X-$(LOGREG_SIZE).npy \
 	--fileY data/two/y-$(LOGREG_SIZE).npy \
 	--num-threads $(LOGREG_NUM_THREADS) --header
@@ -106,6 +114,7 @@ SKLEARN_linear = linear
 SKLEARN_pca_full = pca
 SKLEARN_pca_daal = pca
 SKLEARN_kmeans = kmeans
+SKLEARN_dbscan = dbscan
 SKLEARN_svm2 = svm
 SKLEARN_svm5 = svm
 SKLEARN_logreg2 = log_reg
@@ -122,10 +131,13 @@ ARGS_SKLEARN_pca_full = --size "$(REGRESSION_SIZE)" --svd-solver full
 ARGS_SKLEARN_kmeans = --data-multiplier "$(MULTIPLIER)" \
 	--filex data/clustering/kmeans_$(KMEANS_SIZE).npy \
 	--filei data/clustering/kmeans_$(KMEANS_SIZE).init.npy
+ARGS_SKLEARN_dbscan = --filex data/clustering/kmeans_$(KMEANS_SIZE).npy
 ARGS_SKLEARN_svm2 = --fileX data/two/X-$(SVM_SIZE).npy \
-	--fileY data/two/y-$(SVM_SIZE).npy
+	--fileY data/two/y-$(SVM_SIZE).npy \
+	--kernel rbf
 ARGS_SKLEARN_svm5 = --fileX data/multi/X-$(SVM_SIZE).npy \
-	--fileY data/multi/y-$(SVM_SIZE).npy
+	--fileY data/multi/y-$(SVM_SIZE).npy \
+	--kernel rbf
 ARGS_SKLEARN_logreg2 = --fileX data/two/X-$(LOGREG_SIZE).npy \
 	--fileY data/two/y-$(LOGREG_SIZE).npy
 ARGS_SKLEARN_logreg5 = --fileX data/multi/X-$(LOGREG_SIZE).npy \
@@ -196,7 +208,7 @@ output/native/%.out: | DATA_% output/native/
 	native/bin/$(NATIVE_$*) $(ARGS_NATIVE_$*) | tee $@
 
 output/sklearn/%.out: | DATA_% output/sklearn/
-	python sklearn/$(SKLEARN_$*).py $(COMMON_ARGS) $(ARGS_SKLEARN_$*) | tee $@
+	taskset -c 28-56 python sklearn/$(SKLEARN_$*).py $(COMMON_ARGS) $(ARGS_SKLEARN_$*) | tee $@
 
 output/daal4py/%.out: | DATA_% output/daal4py/
 	python daal4py/$(DAAL4PY_$*).py $(COMMON_ARGS) $(ARGS_DAAL4PY_$*) | tee $@
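Three threading knobs interact in this Makefile: USE_DAAL4PY_SKLEARN=yes turns on the daal4py-patched scikit-learn, OMP_NUM_THREADS is exported only when NUM_THREADS is 1, and taskset -c 28-56 pins the scikit-learn runs to a fixed CPU range (presumably one socket of the benchmark host; the range is machine-specific). A minimal sketch for checking, from inside a benchmark process, what environment it actually received; this assumes a Linux host, since os.sched_getaffinity is Linux-only:

import os

# taskset -c 28-56 limits which logical CPUs the process may run on.
print('allowed CPUs:', sorted(os.sched_getaffinity(0)))

# Exported by the Makefile only when NUM_THREADS=1.
print('OMP_NUM_THREADS:', os.environ.get('OMP_NUM_THREADS', '<unset>'))

# Enables the daal4py-patched scikit-learn, per the export above.
print('USE_DAAL4PY_SKLEARN:', os.environ.get('USE_DAAL4PY_SKLEARN', '<unset>'))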
3 changes: 2 additions & 1 deletion sklearn/df_clsf.py
@@ -29,7 +29,8 @@
                     'sklearn.ensemble.RandomForestClassifier')
 parser.add_argument('--seed', type=int, default=12345,
                     help='Seed to pass as random_state to the class')
-params = parse_args(parser, loop_types=('fit', 'predict'))
+params = parse_args(parser, loop_types=('fit', 'predict'),
+                    n_jobs_supported=True)
 
 # Get some RandomForestClassifier
 if params.use_sklearn_class:
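All of the sklearn benchmark scripts in this PR gain the same n_jobs_supported=True argument to parse_args. The flag is handled in bench.py, which this PR does not touch, so the sketch below is a guess at the mechanics, not the repo's actual code: the idea is to register a thread-count option and normalize its value before the estimator is built.

# Hypothetical sketch of n_jobs_supported; the real logic lives in
# bench.py (not shown in this diff), so names here are illustrative.
import argparse
import multiprocessing


def parse_args(parser, loop_types=(), n_jobs_supported=False):
    # loop_types is accepted only to mirror the call sites above;
    # this sketch ignores it.
    if n_jobs_supported:
        # scikit-learn convention: n_jobs=-1 means "use all cores".
        parser.add_argument('--num-threads', dest='n_jobs', type=int,
                            default=-1, help='Number of threads/jobs to use')
    params = parser.parse_args()
    if n_jobs_supported and params.n_jobs < 1:
        params.n_jobs = multiprocessing.cpu_count()
    return params


parser = argparse.ArgumentParser(description='demo')
params = parse_args(parser, loop_types=('fit', 'predict'),
                    n_jobs_supported=True)
print(params.n_jobs)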
3 changes: 2 additions & 1 deletion sklearn/df_regr.py
@@ -27,7 +27,8 @@
                     'sklearn.ensemble.RandomForestRegressor')
 parser.add_argument('--seed', type=int, default=12345,
                     help='Seed to pass as random_state to the class')
-params = parse_args(parser, loop_types=('fit', 'predict'))
+params = parse_args(parser, loop_types=('fit', 'predict'),
+                    n_jobs_supported=True)
 
 # Get some RandomForestRegressor
 if params.use_sklearn_class:
3 changes: 2 additions & 1 deletion sklearn/distances.py
@@ -11,7 +11,8 @@
                     'benchmark')
 parser.add_argument('--metrics', nargs='*', default=['cosine', 'correlation'],
                     help='Metrics to test for pairwise_distances')
-params = parse_args(parser, size=(1000, 150000), dtypes=('f8', 'f4'))
+params = parse_args(parser, size=(1000, 150000), dtypes=('f8', 'f4'),
+                    n_jobs_supported=True)
 
 # Generate random data
 X = np.random.rand(*params.shape).astype(params.dtype)
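sklearn.metrics.pairwise_distances accepts n_jobs directly, so this benchmark can forward the parsed value straight into the measured call. A small self-contained example (shapes shrunk from the benchmark's 1000x150000; whether the script forwards n_jobs exactly like this depends on bench.py):

import numpy as np
from sklearn.metrics import pairwise_distances

X = np.random.rand(100, 500).astype('f8')

# n_jobs=-1 spreads the row-wise distance computation over all cores.
D = pairwise_distances(X, metric='cosine', n_jobs=-1)
print(D.shape)  # (100, 100)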
3 changes: 2 additions & 1 deletion sklearn/kmeans.py
@@ -18,7 +18,8 @@
                     type=int, help='Data multiplier')
 parser.add_argument('--maxiter', type=int, default=100,
                     help='Maximum number of iterations')
-params = parse_args(parser, loop_types=('fit', 'predict'))
+params = parse_args(parser, loop_types=('fit', 'predict'),
+                    n_jobs_supported=True)
 
 # Load generated data
 X = np.load(params.filex)
2 changes: 1 addition & 1 deletion sklearn/linear.py
@@ -13,7 +13,7 @@
                     action='store_false',
                     help="Don't fit intercept (assume data already centered)")
 params = parse_args(parser, size=(1000000, 50), dtypes=('f8', 'f4'),
-                    loop_types=('fit', 'predict'))
+                    loop_types=('fit', 'predict'), n_jobs_supported=True)
 
 # Generate random data
 X = np.random.rand(*params.shape).astype(params.dtype)
9 changes: 8 additions & 1 deletion sklearn/log_reg.py
@@ -35,7 +35,10 @@
                     help="Tolerance for solver. If solver == 'newton-cg', "
                          "then the default is 1e-3. Otherwise, the default "
                          "is 1e-10.")
-params = parse_args(parser, loop_types=('fit', 'predict'))
+parser.add_argument('--tile', type=int, default=100,
+                    help="Create (tile)x size dataset")
+params = parse_args(parser, loop_types=('fit', 'predict'),
+                    n_jobs_supported=True)
 
 # Load generated data
 X = np.load(params.filex.name)
@@ -71,6 +74,10 @@
                          verbose=params.verbose)
 print_row(columns, params, function='LogReg.fit', time=fit_time)
 
+X = np.tile(X, (params.tile, 1))
+y = np.tile(y, params.tile)
+params.size = size_str(X.shape)
+
 predict_time, y_pred = time_mean_min(clf.predict, X,
                                      outer_loops=params.predict_outer_loops,
                                      inner_loops=params.predict_inner_loops,
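The new --tile option enlarges the data after fitting but before the predict timing loop, so training runs at the original size while prediction is measured on a dataset tile times larger (100x by default). What np.tile does to the shapes:

import numpy as np

X = np.arange(6).reshape(3, 2)  # 3 samples, 2 features
y = np.array([0, 1, 0])

# (tile, 1) repeats the rows `tile` times and leaves columns untouched,
# matching the X = np.tile(X, (params.tile, 1)) call in the diff.
X_big = np.tile(X, (100, 1))
y_big = np.tile(y, 100)
print(X_big.shape, y_big.shape)  # (300, 2) (300,)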
19 changes: 14 additions & 5 deletions sklearn/svm.py
@@ -5,7 +5,7 @@
 import argparse
 from bench import parse_args, time_mean_min, print_header, print_row, size_str
 import numpy as np
-from sklearn.svm import SVC
+from sklearn.svm import SVC, LinearSVC
 from sklearn.metrics import accuracy_score
 
 
@@ -71,9 +71,15 @@ def get_optimal_cache_size(n_features, dtype=np.double, max_cache=64):
 params.n_classes = len(np.unique(y))
 
 # Create our C-SVM classifier
-clf = SVC(C=params.C, kernel=params.kernel, max_iter=params.maxiter,
-          cache_size=params.cache_size_mb, tol=params.tol,
-          shrinking=params.shrinking, gamma=params.gamma)
+if params.using_daal or params.kernel != 'linear':
+    clf = SVC(C=params.C, kernel=params.kernel, max_iter=params.maxiter,
+              cache_size=params.cache_size_mb, tol=params.tol,
+              shrinking=params.shrinking, gamma=params.gamma)
+else:
+    clf = LinearSVC(C=params.C, max_iter=params.maxiter, tol=params.tol)
+
+if params.verbose:
+    print(clf)
 
 columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size',
            'kernel', 'cache_size_mb', 'C', 'sv_len', 'n_classes', 'accuracy',
@@ -90,7 +96,10 @@ def get_optimal_cache_size(n_features, dtype=np.double, max_cache=64):
                              goal_outer_loops=params.fit_goal,
                              time_limit=params.fit_time_limit,
                              verbose=params.verbose)
-params.sv_len = clf.support_.shape[0]
+if hasattr(clf, 'support_'):
+    params.sv_len = clf.support_.shape[0]
+else:
+    params.sv_len = '?'
 print_row(columns, params, function='SVM.fit', time=fit_time)
 
 predict_time, y_pred = time_mean_min(clf.predict, X,
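The hasattr guard is needed because the two classifier branches expose different attributes: SVC (libsvm) records support-vector indices in support_, while LinearSVC (liblinear, used here for the non-DAAL linear-kernel case) solves the primal problem and has no support_ attribute at all. A quick illustration:

import numpy as np
from sklearn.svm import SVC, LinearSVC

rng = np.random.RandomState(0)
X = rng.rand(50, 4)
y = rng.randint(0, 2, 50)

for clf in (SVC(kernel='rbf'), LinearSVC(max_iter=10000)):
    clf.fit(X, y)
    # Mirrors the guard in the diff: report '?' when the estimator
    # does not track support vectors.
    sv_len = clf.support_.shape[0] if hasattr(clf, 'support_') else '?'
    print(type(clf).__name__, sv_len)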