From d681dc401ac6254e61ddf9829e5f392030b15156 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Monlla=C3=B3?= Date: Thu, 27 Jun 2019 06:22:15 +0200 Subject: [PATCH 1/5] FIX: Error control for really small datasets We need samples labelled with all the different classes available for each of the evaluation iterations we have. Really small datasets (e.g. less than 10 samples) may not be able to provide all the different classes available when iterating through the multiple evaluation runs we have. We should skip those iterations as otherwise the provided data do not match the tensor size. --- moodlemlbackend/processor/estimator.py | 23 ++++++++++++++++++----- moodlemlbackend/version.py | 1 + 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/moodlemlbackend/processor/estimator.py b/moodlemlbackend/processor/estimator.py index cab2d12..55289e8 100644 --- a/moodlemlbackend/processor/estimator.py +++ b/moodlemlbackend/processor/estimator.py @@ -161,11 +161,13 @@ def get_metadata(filepath): for row in file_iterator: row_count += 1 if row_count == 1: - data_header = [x for x in csv.reader(row, delimiter=',', quotechar='"')][0] + data_header = [x for x in csv.reader( + row, delimiter=',', quotechar='"')][0] classes_index = data_header.index("targetclasses") features_index = data_header.index("nfeatures") if row_count == 2: - info_row = [x for x in csv.reader(row, delimiter=',', quotechar='"')][0] + info_row = [x for x in csv.reader( + row, delimiter=',', quotechar='"')][0] target_classes = json.loads(info_row[classes_index]) return { "n_classes": len(target_classes), @@ -223,7 +225,6 @@ def __init__(self, modelid, directory, dataset=None): raise OSError('Directory ' + self.tensor_logdir + ' can not be created.') - def get_classifier(self, X, y, initial_weights=False): """Gets the classifier""" @@ -357,6 +358,10 @@ def evaluate_dataset(self, filepath, min_score=0.6, X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=0.2) + if 
len(np.unique(y_train)) < self.n_classes: + # We need the input data to match the expected size of the + # tensor. + continue classifier = self.train(X_train, y_train) @@ -375,7 +380,8 @@ def evaluate_dataset(self, filepath, min_score=0.6, if self.is_binary: logging.info("AUC: %.2f%%", result['auc']) - logging.info("AUC standard deviation: %.4f", result['auc_deviation']) + logging.info("AUC standard deviation: %.4f", + result['auc_deviation']) logging.info("Accuracy: %.2f%%", result['accuracy'] * 100) logging.info("Precision (predicted elements that are real): %.2f%%", result['precision'] * 100) @@ -490,12 +496,19 @@ def get_evaluation_results(self, min_score, accepted_deviation): avg_precision = np.mean(self.precisions) avg_recall = np.mean(self.recalls) avg_mcc = np.mean(self.mccs) + if len(self.aucs) > 0: + avg_aucs = np.mean(self.aucs) + else: + avg_aucs = 0 # MCC goes from -1 to 1 we need to transform it to a value between # 0 and 1 to compare it with the minimum score provided. score = (avg_mcc + 1) / 2 - acc_deviation = np.std(self.mccs) + if len(self.mccs) > 0: + acc_deviation = np.std(self.mccs) + else: + acc_deviation = 1 result = dict() if self.is_binary: result['auc'] = np.mean(self.aucs) diff --git a/moodlemlbackend/version.py b/moodlemlbackend/version.py index f404d2a..4d92bb4 100644 --- a/moodlemlbackend/version.py +++ b/moodlemlbackend/version.py @@ -2,6 +2,7 @@ import os + def print_version(): """Prints moodlemlbackend package version""" From 0c8752df2add1e141ce69da5690db53c0edc11ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Monlla=C3=B3?= Date: Mon, 8 Jul 2019 15:47:04 +0200 Subject: [PATCH 2/5] MDL-66004 Web front-end --- moodlemlbackend/processor/estimator.py | 4 - setup.py | 1 + webapp.py | 270 +++++++++++++++++++++++++ 3 files changed, 271 insertions(+), 4 deletions(-) create mode 100644 webapp.py diff --git a/moodlemlbackend/processor/estimator.py b/moodlemlbackend/processor/estimator.py index 55289e8..e2d65f7 100644 --- 
a/moodlemlbackend/processor/estimator.py +++ b/moodlemlbackend/processor/estimator.py @@ -546,10 +546,6 @@ def get_evaluation_results(self, min_score, accepted_deviation): if acc_deviation > accepted_deviation and score < min_score: result['status'] = LOW_SCORE + NOT_ENOUGH_DATA - result['info'].append('Launch TensorBoard from command line by ' + - 'typing: tensorboard --logdir=\'' + - self.get_tensor_logdir() + '\'') - return result def store_classifier(self, trained_classifier): diff --git a/setup.py b/setup.py index 3a198fe..fc45924 100644 --- a/setup.py +++ b/setup.py @@ -55,5 +55,6 @@ 'scikit-learn>=0.21,<0.22', 'joblib>=0.13.0,<0.14', 'tensorflow>=1.14.0,<1.15', + 'flask>=1.0.2,<1.1', ], ) diff --git a/webapp.py b/webapp.py new file mode 100644 index 0000000..a671d69 --- /dev/null +++ b/webapp.py @@ -0,0 +1,270 @@ +import os +import re +import json +import tempfile +import zipfile +import shutil + +from flask import Flask, request, send_file, Response +from werkzeug.utils import secure_filename + +from moodlemlbackend.processor import estimator + +app = Flask(__name__) + + +def get_request_value(key, pattern=False, exception=True): + + if pattern is False: + pattern = '[^A-Za-z0-9_\-$]' + + value = request.values.get(key) + if value is None: + + if exception is True: + raise Exception('The requested key ' + key + ' is not available.') + return False + + return re.sub(pattern, '', value) + + +def get_model_dir(hashkey=False): + + basedir = os.environ["MOODLE_MLBACKEND_PYTHON_DIR"] + + if os.path.exists(basedir) is False: + raise IOError( + 'The base dir does not exist. ' + + 'Set env MOODLE_MLBACKEND_PYTHON_DIR to an existing dir') + + os.access(basedir, os.W_OK) + + uniquemodelid = get_request_value('uniqueid') + + # The dir in the server is namespaced by uniquemodelid and the + # dirhash (if present) which determines where the results should be stored. 
+ modeldir = os.path.join(basedir, uniquemodelid) + + if hashkey is not False: + dirhash = get_request_value(hashkey) + modeldir = os.path.join(modeldir, dirhash) + + return modeldir + + +def check_access(): + + envvarname = "MOODLE_MLBACKEND_PYTHON_USERS" + if envvarname not in os.environ: + raise Exception( + envvarname + ' environment var is not set in the server.') + + if re.match(os.environ[envvarname], '[^A-Za-z0-9_\-,$]'): + raise Exception( + 'The value of ' + envvarname + ' environment var does not ' + + ' adhere to [^A-Za-z0-9_\-,$]') + + users = os.environ[envvarname].split(',') + + if (request.authorization is None or + request.authorization.username is None or + request.authorization.password is None): + return 'No user and/or password included in the request.' + + for user in users: + userdata = user.split(':') + if len(userdata) != 2: + raise Exception('Incorrect format for ' + + envvarname + ' environment var. It should ' + + 'contain a comma-separated list of ' + + 'username:password.') + + if (userdata[0] == request.authorization.username and + userdata[1] == request.authorization.password): + return True + + return 'Incorrect user and/or password provided by Moodle.' + + +def get_file_path(filekey): + + file = request.files[filekey] + + # We can use a temp directory for the input files. 
+ tempdir = tempfile.mkdtemp() + filepath = os.path.join(tempdir, secure_filename(file.filename)) + file.save(filepath) + + return filepath + + +def zipdir(dirpath, zipfilepath): + + ziph = zipfile.ZipFile(zipfilepath, 'w', zipfile.ZIP_DEFLATED) + + for root, dirs, files in os.walk(dirpath): + for file in files: + abspath = os.path.join(root, file) + ziph.write(abspath, os.path.relpath(abspath, root)) + ziph.close() + return ziph + + +@app.route('/version', methods=['GET']) +def version(): + here = os.path.abspath(os.path.dirname(__file__)) + version_file = open(os.path.join(here, 'moodlemlbackend', 'VERSION')) + return version_file.read().strip() + + +@app.route('/training', methods=['POST']) +def training(): + + access = check_access() + if access is not True: + return access, 401 + + uniquemodelid = get_request_value('uniqueid') + outputdir = get_model_dir('outputdirhash') + + datasetpath = get_file_path('dataset') + + classifier = estimator.Classifier(uniquemodelid, outputdir) + result = classifier.train_dataset(datasetpath) + + return json.dumps(result) + + +@app.route('/prediction', methods=['POST']) +def prediction(): + + access = check_access() + if access is not True: + return access, 401 + + uniquemodelid = get_request_value('uniqueid') + outputdir = get_model_dir('outputdirhash') + + datasetpath = get_file_path('dataset') + + classifier = estimator.Classifier(uniquemodelid, outputdir) + result = classifier.predict_dataset(datasetpath) + + return json.dumps(result) + + +@app.route('/evaluation', methods=['POST']) +def evaluation(): + + access = check_access() + if access is not True: + return access, 401 + + uniquemodelid = get_request_value('uniqueid') + outputdir = get_model_dir('outputdirhash') + + minscore = get_request_value('minscore', pattern='[^0-9.$]') + maxdeviation = get_request_value('maxdeviation', pattern='[^0-9.$]') + niterations = get_request_value('niterations', pattern='[^0-9$]') + + datasetpath = get_file_path('dataset') + + 
trainedmodeldirhash = get_request_value( + 'trainedmodeldirhash', exception=False) + if trainedmodeldirhash is not False: + # The trained model dir in the server is namespaced by uniquemodelid + # and the trainedmodeldirhash which determines where should the results + # be stored. + trainedmodeldir = get_model_dir('trainedmodeldirhash') + else: + trainedmodeldir = False + + classifier = estimator.Classifier(uniquemodelid, outputdir) + result = classifier.evaluate_dataset(datasetpath, + float(minscore), + float(maxdeviation), + int(niterations), + trainedmodeldir) + + return json.dumps(result) + + +@app.route('/evaluationlog', methods=['GET']) +def evaluationlog(): + + access = check_access() + if access is not True: + return access, 401 + + outputdir = get_model_dir('outputdirhash') + runid = get_request_value('runid', '[^0-9$]') + logsdir = os.path.join(outputdir, 'logs', runid) + + zipfile = tempfile.NamedTemporaryFile() + zipdir(logsdir, zipfile) + return send_file(zipfile.name, mimetype='application/zip') + + +@app.route('/export', methods=['GET']) +def export(): + + access = check_access() + if access is not True: + return access, 401 + + uniquemodelid = get_request_value('uniqueid') + modeldir = get_model_dir('modeldirhash') + + # We can use a temp directory for the export data + # as we don't need to keep it forever. 
+ tempdir = tempfile.mkdtemp() + + classifier = estimator.Classifier(uniquemodelid, modeldir) + exportdir = classifier.export_classifier(tempdir) + if exportdir is False: + return Response('There is nothing to export.', 503) + + zipfile = tempfile.NamedTemporaryFile() + zipdir(exportdir, zipfile) + return send_file(zipfile.name, mimetype='application/zip') + + +@app.route('/import', methods=['POST']) +def import_model(): + + access = check_access() + if access is not True: + return access, 401 + + uniquemodelid = get_request_value('uniqueid', '') + modeldir = get_model_dir('modeldirhash') + + importzippath = get_file_path('importzip') + + with zipfile.ZipFile(importzippath, 'r') as zipobject: + importtempdir = tempfile.mkdtemp() + zipobject.extractall(importtempdir) + + classifier = estimator.Classifier(uniquemodelid, modeldir) + classifier.import_classifier(importtempdir) + + return 'Ok', 200 + + +@app.route('/deletemodel', methods=['POST']) +def deletemodel(): + + access = check_access() + if access is not True: + return access, 401 + + modeldir = get_model_dir() + + if os.path.exists(modeldir): + # The directory may not exist. 
+ shutil.rmtree(modeldir, False) + + return 'Ok', 200 + +if __name__ == '__main__': + app.run(debug=True, host='0.0.0.0') From b38e6dd94184db5f8d87d85a49aa9bf13f961c42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Monlla=C3=B3?= Date: Thu, 11 Jul 2019 08:49:09 +0200 Subject: [PATCH 3/5] MDL-66004: Support for S3 storage --- moodlemlbackend/import.py | 1 + moodlemlbackend/model/tensor.py | 1 + moodlemlbackend/processor/estimator.py | 4 +- moodlemlbackend/webapp/__init__.py | 0 moodlemlbackend/webapp/access.py | 52 ++++++ moodlemlbackend/webapp/localfs.py | 80 +++++++++ moodlemlbackend/webapp/s3.py | 164 ++++++++++++++++++ moodlemlbackend/webapp/util.py | 48 ++++++ setup.py | 1 + webapp.py | 230 +++++++------------------ 10 files changed, 413 insertions(+), 168 deletions(-) create mode 100644 moodlemlbackend/webapp/__init__.py create mode 100644 moodlemlbackend/webapp/access.py create mode 100644 moodlemlbackend/webapp/localfs.py create mode 100644 moodlemlbackend/webapp/s3.py create mode 100644 moodlemlbackend/webapp/util.py diff --git a/moodlemlbackend/import.py b/moodlemlbackend/import.py index 3b6c59d..6c49790 100644 --- a/moodlemlbackend/import.py +++ b/moodlemlbackend/import.py @@ -18,6 +18,7 @@ def import_classifier(): print('Ok') # An exception will be thrown before if it can be imported. + print('Ok') sys.exit(0) import_classifier() diff --git a/moodlemlbackend/model/tensor.py b/moodlemlbackend/model/tensor.py index b020982..e016772 100644 --- a/moodlemlbackend/model/tensor.py +++ b/moodlemlbackend/model/tensor.py @@ -83,6 +83,7 @@ def set_tensor_logdir(self, tensor_logdir): def build_graph(self, initial_weights=False): """Builds the computational graph without feeding any data in""" + # Placeholders for input values. 
with tf.name_scope('inputs'): self.x = tf.placeholder( diff --git a/moodlemlbackend/processor/estimator.py b/moodlemlbackend/processor/estimator.py index e2d65f7..5ee127a 100644 --- a/moodlemlbackend/processor/estimator.py +++ b/moodlemlbackend/processor/estimator.py @@ -552,7 +552,7 @@ def store_classifier(self, trained_classifier): """Stores the classifier and saves a checkpoint of the tensors state""" # Store the graph state. - saver = tf.train.Saver() + saver = tf.train.Saver(save_relative_paths=True) sess = trained_classifier.get_session() path = os.path.join(self.persistencedir, 'model.ckpt') @@ -576,7 +576,7 @@ def load_classifier(self, model_dir=False): classifier.set_tensor_logdir(self.get_tensor_logdir()) # Now restore the graph state. - saver = tf.train.Saver() + saver = tf.train.Saver(save_relative_paths=True) path = os.path.join(model_dir, 'model.ckpt') saver.restore(classifier.get_session(), path) return classifier diff --git a/moodlemlbackend/webapp/__init__.py b/moodlemlbackend/webapp/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/moodlemlbackend/webapp/access.py b/moodlemlbackend/webapp/access.py new file mode 100644 index 0000000..269c85a --- /dev/null +++ b/moodlemlbackend/webapp/access.py @@ -0,0 +1,52 @@ +import os +import re + +from functools import wraps + +from flask import request + + +def check_access(f): + '''Checks the access to the route.''' + + @wraps(f) + def access_wrapper(*args, **kwargs): + + # Check that the environment var is properly set. 
+ envvarname = "MOODLE_MLBACKEND_PYTHON_USERS" + if envvarname not in os.environ: + raise Exception( + envvarname + ' environment var is not set in the server.') + + if re.match(os.environ[envvarname], '[^A-Za-z0-9_\-,$]'): + raise Exception( + 'The value of ' + envvarname + ' environment var does not ' + + ' adhere to [^A-Za-z0-9_\-,$]') + + users = os.environ[envvarname].split(',') + + if (request.authorization is None or + request.authorization.username is None or + request.authorization.password is None): + # Response for the client. + return 'No user and/or password included in the request.', 401 + + for user in users: + userdata = user.split(':') + if len(userdata) != 2: + raise Exception('Incorrect format for ' + + envvarname + ' environment var. It should ' + + 'contain a comma-separated list of ' + + 'username:password.') + + if (userdata[0] == request.authorization.username and + userdata[1] == request.authorization.password): + + # If all good we return the return from 'f' passing the + # original list of params to it. + return f(*args, **kwargs) + + # Response for the client. 
+ return 'Incorrect user and/or password provided by Moodle.', 401 + + return access_wrapper diff --git a/moodlemlbackend/webapp/localfs.py b/moodlemlbackend/webapp/localfs.py new file mode 100644 index 0000000..88e52da --- /dev/null +++ b/moodlemlbackend/webapp/localfs.py @@ -0,0 +1,80 @@ +import shutil +import os + +from functools import wraps, update_wrapper + +from moodlemlbackend.webapp.util import get_request_value + + +# We can not set LocalFS_setup_base_dir as a nested class because they +# have problems to access the outer class.''' + + +class LocalFS(object): + + def get_localbasedir(self): + if self.localbasedir is None: + raise Exception('localbasedir is not set') + + return self.localbasedir + + def set_localbasedir(self, basedir): + self.localbasedir = basedir + + def get_model_dir(self, hashkey, fetch_model=False): + '''Returns the model dir in the local fs for the provided key. + + fetch_model param is ignored here.''' + + uniquemodelid = get_request_value('uniqueid') + dirhash = get_request_value(hashkey) + + # The dir in the local filesystem is namespaced by uniquemodelid and + # the dirhash which determines where the results should be stored. + modeldir = os.path.join(self.get_localbasedir(), + uniquemodelid, dirhash) + + return modeldir + + def delete_dir(self): + + uniquemodelid = get_request_value('uniqueid') + + # All files related to this version of the model in moodle are in + # /uniquemodelid. + modeldir = os.path.join(self.get_localbasedir(), uniquemodelid) + + if os.path.exists(modeldir): + # The directory may not exist. + shutil.rmtree(modeldir, True) + + +class LocalFS_setup_base_dir(object): + + def __init__(self, storage, fetch_model, push_model): + '''Checks that the local directory is set in ENV. 
+ + fetch_model and push_model are ignored in local_fs.''' + + self.storage = storage + + localbasedir = os.environ["MOODLE_MLBACKEND_PYTHON_DIR"] + + if os.path.exists(localbasedir) is False: + raise IOError( + 'The base dir does not exist. ' + + 'Set env MOODLE_MLBACKEND_PYTHON_DIR to an existing dir') + + os.access(localbasedir, os.W_OK) + + storage.set_localbasedir(localbasedir) + + def __call__(self, f): + + @wraps(f) + def wrapper(*args, **kwargs): + '''Execute the decorated function passing the call args.''' + + update_wrapper(self, f) + return f(*args, **kwargs) + return wrapper diff --git a/moodlemlbackend/webapp/s3.py b/moodlemlbackend/webapp/s3.py new file mode 100644 index 0000000..98f748f --- /dev/null +++ b/moodlemlbackend/webapp/s3.py @@ -0,0 +1,164 @@ +import logging +import shutil +import tempfile +import os +import zipfile + +from functools import wraps, update_wrapper + +import boto3 +from botocore.exceptions import ClientError + +from moodlemlbackend.webapp.util import get_request_value, zipdir + + +# This will be set overwritten below. +localbasedir = None + + +class S3(object): + + def get_localbasedir(self): + if self.localbasedir is None: + raise Exception('localbasedir is not set') + + return self.localbasedir + + def set_localbasedir(self, basedir): + self.localbasedir = basedir + + def get_model_dir(self, hashkey, fetch_model=False): + '''Returns the model dir in the local fs for the provided key. + + Syncs the model with S3 if required.''' + + # No need to include uniquemodelid here, because this is all lives in + # a temp dir in the local file system. + modeldir = os.path.join(self.get_localbasedir(), hashkey) + + if fetch_model: + + s3 = boto3.client('s3') + + # Download the files for the provided uniquemodelid + modelhash + bucketname = os.environ["MOODLE_MLBACKEND_PYTHON_S3_BUCKET_NAME"] + objectkey = self.object_key(hashkey) + + # TODO Check if we should be using TemporaryFile instead. 
+ classifierzip = tempfile.NamedTemporaryFile() + classifierdir = os.path.join(modeldir, 'classifier') + try: + s3.download_fileobj(bucketname, objectkey, classifierzip) + + if os.path.getsize(classifierzip.name) > 0: + with zipfile.ZipFile(classifierzip, 'r') as zipobject: + + # The classifier directory is automatically created in + # moodlemlbackend.estimator but we need to create it + # before that point as we want to copy the classifier + # from S3. + try: + os.makedirs(classifierdir) + except FileExistsError: + # It can exist in some cases. + pass + zipobject.extractall(classifierdir) + except ClientError: + # No worries, it may perfectly not exist. + pass + + return modeldir + + def delete_dir(self): + + s3 = boto3.client('s3') + + bucketname = os.environ["MOODLE_MLBACKEND_PYTHON_S3_BUCKET_NAME"] + + # Objectkey will equal uniquemodelid so we delete all files matching + # uniquemodelid/ namespace. + objectkey = self.object_key(False) + for key in s3.listobjects(Bucket=bucketname, Prefix=objectkey + '/'): + key.delete() + + def object_key(self, hashkey=False): + + uniquemodelid = get_request_value('uniqueid') + + if hashkey is False: + return uniquemodelid + + dirhash = get_request_value(hashkey) + return os.path.join(uniquemodelid, dirhash) + + +class S3_setup_base_dir(object): + '''Sets the localbasedir to /tmp''' + + def __init__(self, storage, fetch_model, push_model): + '''Sets the base dir to a temp directory. + + It fetches the requested model from s3 if required.''' + + self.storage = storage + self.fetch_model = fetch_model + self.push_model = push_model + + # It is our responsibility to delete this directory. However, we are + # relying on the OS to delete it if there is any exception during the + # course of the request. + self.storage.set_localbasedir(tempfile.mkdtemp()) + + def __call__(self, f): + + @wraps(f) + def wrapper(*args, **kwargs): + '''Execute the decorated function. 
+ + Upload the model to s3 if required.''' + + update_wrapper(self, f) + + self.modeldir = self.storage.get_model_dir( + 'dirhash', fetch_model=self.fetch_model) + + # Execute the requested action. + funcreturn = f(*args, **kwargs) + + if self.push_model is True: + # Push the model to s3. + + s3 = boto3.client('s3') + + classifierdir = os.path.join(self.modeldir, 'classifier') + + # Copy the classifier in the model dir to S3. + updatedclassifierzip = tempfile.NamedTemporaryFile() + zipdir(classifierdir, updatedclassifierzip) + + # We are only interested in the model we just trained. + bucketname = os.environ[ + "MOODLE_MLBACKEND_PYTHON_S3_BUCKET_NAME"] + objectkey = self.storage.object_key('dirhash') + + # Upload to S3. + try: + s3.upload_file( + updatedclassifierzip.name, bucketname, objectkey) + except ClientError as e: + # We don't want the error details in moodle as they could + # contain sensitive information. + logging.error('Error uploading the model to S3: ' + str(e)) + return 'Can\'t upload classifier to S3.', 500 + + # TODO Think about copying the new logs to S3. + + # It is our responsibility to delete tmp directories created with + # mkdtemp + shutil.rmtree(self.storage.get_localbasedir(), True) + + # Now that the files are copied back to S3 we can return f's + # Response. 
+ return funcreturn + + return wrapper diff --git a/moodlemlbackend/webapp/util.py b/moodlemlbackend/webapp/util.py new file mode 100644 index 0000000..3807dc0 --- /dev/null +++ b/moodlemlbackend/webapp/util.py @@ -0,0 +1,48 @@ +import re +import os +import zipfile +import tempfile +import shutil +import atexit + +from flask import request + + +def get_request_value(key, pattern=False, exception=True): + + if pattern is False: + pattern = '[^A-Za-z0-9_\-$]' + + value = request.values.get(key) + if value is None: + + if exception is True: + raise Exception('The requested key ' + key + ' is not available.') + return False + + return re.sub(pattern, '', value) + + +def get_file_path(localbasedir, filekey): + + file = request.files[filekey] + + tempdir = tempfile.mkdtemp() + tempfilepath = os.path.join(tempdir, filekey) + + atexit.register(shutil.rmtree, tempdir) + file.save(tempfilepath) + + return tempfilepath + + +def zipdir(dirpath, zipf): + + ziph = zipfile.ZipFile(zipf, 'w', zipfile.ZIP_DEFLATED) + + for root, dirs, files in os.walk(dirpath): + for file in files: + abspath = os.path.join(root, file) + ziph.write(abspath, os.path.relpath(abspath, root)) + ziph.close() + return ziph diff --git a/setup.py b/setup.py index fc45924..e7738e6 100644 --- a/setup.py +++ b/setup.py @@ -56,5 +56,6 @@ 'joblib>=0.13.0,<0.14', 'tensorflow>=1.14.0,<1.15', 'flask>=1.0.2,<1.1', + 'boto3>=1.9.0,<1.10', ], ) diff --git a/webapp.py b/webapp.py index a671d69..e9c97e9 100644 --- a/webapp.py +++ b/webapp.py @@ -1,113 +1,27 @@ import os -import re import json import tempfile import zipfile -import shutil -from flask import Flask, request, send_file, Response -from werkzeug.utils import secure_filename +from flask import Flask, send_file, Response from moodlemlbackend.processor import estimator -app = Flask(__name__) - - -def get_request_value(key, pattern=False, exception=True): - - if pattern is False: - pattern = '[^A-Za-z0-9_\-$]' - - value = request.values.get(key) - if value is 
None: - - if exception is True: - raise Exception('The requested key ' + key + ' is not available.') - return False - - return re.sub(pattern, '', value) - - -def get_model_dir(hashkey=False): - - basedir = os.environ["MOODLE_MLBACKEND_PYTHON_DIR"] - - if os.path.exists(basedir) is False: - raise IOError( - 'The base dir does not exist. ' + - 'Set env MOODLE_MLBACKEND_PYTHON_DIR to an existing dir') - - os.access(basedir, os.W_OK) - - uniquemodelid = get_request_value('uniqueid') - - # The dir in the server is namespaced by uniquemodelid and the - # dirhash (if present) which determines where the results should be stored. - modeldir = os.path.join(basedir, uniquemodelid) - - if hashkey is not False: - dirhash = get_request_value(hashkey) - modeldir = os.path.join(modeldir, dirhash) - - return modeldir - - -def check_access(): - - envvarname = "MOODLE_MLBACKEND_PYTHON_USERS" - if envvarname not in os.environ: - raise Exception( - envvarname + ' environment var is not set in the server.') - - if re.match(os.environ[envvarname], '[^A-Za-z0-9_\-,$]'): - raise Exception( - 'The value of ' + envvarname + ' environment var does not ' + - ' adhere to [^A-Za-z0-9_\-,$]') - - users = os.environ[envvarname].split(',') - - if (request.authorization is None or - request.authorization.username is None or - request.authorization.password is None): - return 'No user and/or password included in the request.' - - for user in users: - userdata = user.split(':') - if len(userdata) != 2: - raise Exception('Incorrect format for ' + - envvarname + ' environment var. 
It should ' + - 'contain a comma-separated list of ' + - 'username:password.') +from moodlemlbackend.webapp.localfs import LocalFS, LocalFS_setup_base_dir +from moodlemlbackend.webapp.s3 import S3, S3_setup_base_dir +from moodlemlbackend.webapp.access import check_access +from moodlemlbackend.webapp.util import get_request_value, get_file_path +from moodlemlbackend.webapp.util import zipdir - if (userdata[0] == request.authorization.username and - userdata[1] == request.authorization.password): - return True - - return 'Incorrect user and/or password provided by Moodle.' - - -def get_file_path(filekey): - - file = request.files[filekey] - - # We can use a temp directory for the input files. - tempdir = tempfile.mkdtemp() - filepath = os.path.join(tempdir, secure_filename(file.filename)) - file.save(filepath) - - return filepath - - -def zipdir(dirpath, zipfilepath): - - ziph = zipfile.ZipFile(zipfilepath, 'w', zipfile.ZIP_DEFLATED) +app = Flask(__name__) - for root, dirs, files in os.walk(dirpath): - for file in files: - abspath = os.path.join(root, file) - ziph.write(abspath, os.path.relpath(abspath, root)) - ziph.close() - return ziph +# S3 or the local file system depending on the presence of this ENV var. 
+if "MOODLE_MLBACKEND_PYTHON_S3_BUCKET_NAME" in os.environ: + storage = S3() + setup_base_dir = S3_setup_base_dir +else: + storage = LocalFS() + setup_base_dir = LocalFS_setup_base_dir @app.route('/version', methods=['GET']) @@ -118,56 +32,50 @@ def version(): @app.route('/training', methods=['POST']) +@check_access +@setup_base_dir(storage, True, True) def training(): - access = check_access() - if access is not True: - return access, 401 - uniquemodelid = get_request_value('uniqueid') - outputdir = get_model_dir('outputdirhash') + modeldir = storage.get_model_dir('dirhash') - datasetpath = get_file_path('dataset') + datasetpath = get_file_path(storage.get_localbasedir(), 'dataset') - classifier = estimator.Classifier(uniquemodelid, outputdir) + classifier = estimator.Classifier(uniquemodelid, modeldir, datasetpath) result = classifier.train_dataset(datasetpath) return json.dumps(result) @app.route('/prediction', methods=['POST']) +@check_access +@setup_base_dir(storage, True, True) def prediction(): - access = check_access() - if access is not True: - return access, 401 - uniquemodelid = get_request_value('uniqueid') - outputdir = get_model_dir('outputdirhash') + modeldir = storage.get_model_dir('dirhash') - datasetpath = get_file_path('dataset') + datasetpath = get_file_path(storage.get_localbasedir(), 'dataset') - classifier = estimator.Classifier(uniquemodelid, outputdir) + classifier = estimator.Classifier(uniquemodelid, modeldir, datasetpath) result = classifier.predict_dataset(datasetpath) return json.dumps(result) @app.route('/evaluation', methods=['POST']) +@check_access +@setup_base_dir(storage, False, False) def evaluation(): - access = check_access() - if access is not True: - return access, 401 - uniquemodelid = get_request_value('uniqueid') - outputdir = get_model_dir('outputdirhash') + modeldir = storage.get_model_dir('dirhash') minscore = get_request_value('minscore', pattern='[^0-9.$]') maxdeviation = get_request_value('maxdeviation', 
pattern='[^0-9.$]') niterations = get_request_value('niterations', pattern='[^0-9$]') - datasetpath = get_file_path('dataset') + datasetpath = get_file_path(storage.get_localbasedir(), 'dataset') trainedmodeldirhash = get_request_value( 'trainedmodeldirhash', exception=False) @@ -175,96 +83,86 @@ def evaluation(): # The trained model dir in the server is namespaced by uniquemodelid # and the trainedmodeldirhash which determines where should the results # be stored. - trainedmodeldir = get_model_dir('trainedmodeldirhash') + trainedmodeldir = storage.get_model_dir( + 'trainedmodeldirhash', fetch_model=True) else: trainedmodeldir = False - classifier = estimator.Classifier(uniquemodelid, outputdir) + classifier = estimator.Classifier(uniquemodelid, modeldir, datasetpath) result = classifier.evaluate_dataset(datasetpath, - float(minscore), - float(maxdeviation), - int(niterations), - trainedmodeldir) + float(minscore), + float(maxdeviation), + int(niterations), + trainedmodeldir) return json.dumps(result) @app.route('/evaluationlog', methods=['GET']) +@check_access +@setup_base_dir(storage, True, False) def evaluationlog(): - access = check_access() - if access is not True: - return access, 401 - - outputdir = get_model_dir('outputdirhash') + modeldir = storage.get_model_dir('dirhash') runid = get_request_value('runid', '[^0-9$]') - logsdir = os.path.join(outputdir, 'logs', runid) + logsdir = os.path.join(modeldir, 'logs', runid) - zipfile = tempfile.NamedTemporaryFile() - zipdir(logsdir, zipfile) - return send_file(zipfile.name, mimetype='application/zip') + zipf = tempfile.NamedTemporaryFile() + zipdir(logsdir, zipf) + return send_file(zipf.name, mimetype='application/zip') @app.route('/export', methods=['GET']) +@check_access +@setup_base_dir(storage, True, False) def export(): - access = check_access() - if access is not True: - return access, 401 - uniquemodelid = get_request_value('uniqueid') - modeldir = get_model_dir('modeldirhash') + modeldir = 
storage.get_model_dir('dirhash') # We can use a temp directory for the export data # as we don't need to keep it forever. - tempdir = tempfile.mkdtemp() + tempdir = tempfile.TemporaryDirectory() classifier = estimator.Classifier(uniquemodelid, modeldir) - exportdir = classifier.export_classifier(tempdir) + exportdir = classifier.export_classifier(tempdir.name) if exportdir is False: return Response('There is nothing to export.', 503) - zipfile = tempfile.NamedTemporaryFile() - zipdir(exportdir, zipfile) - return send_file(zipfile.name, mimetype='application/zip') + zipf = tempfile.NamedTemporaryFile() + zipdir(exportdir, zipf) + + return send_file(zipf.name, mimetype='application/zip') @app.route('/import', methods=['POST']) +@check_access +@setup_base_dir(storage, False, True) def import_model(): - access = check_access() - if access is not True: - return access, 401 - - uniquemodelid = get_request_value('uniqueid', '') - modeldir = get_model_dir('modeldirhash') + uniquemodelid = get_request_value('uniqueid') + modeldir = storage.get_model_dir('dirhash') - importzippath = get_file_path('importzip') + importzippath = get_file_path(storage.get_localbasedir(), 'importzip') with zipfile.ZipFile(importzippath, 'r') as zipobject: - importtempdir = tempfile.mkdtemp() - zipobject.extractall(importtempdir) + importtempdir = tempfile.TemporaryDirectory() + zipobject.extractall(importtempdir.name) classifier = estimator.Classifier(uniquemodelid, modeldir) - classifier.import_classifier(importtempdir) + classifier.import_classifier(importtempdir.name) return 'Ok', 200 @app.route('/deletemodel', methods=['POST']) +@check_access +@setup_base_dir(storage, False, False) def deletemodel(): - - access = check_access() - if access is not True: - return access, 401 - - modeldir = get_model_dir() - - if os.path.exists(modeldir): - # The directory may not exist. - shutil.rmtree(modeldir, False) - + # All processing is delegated to delete_dir as it is file system dependant. 
+ storage.delete_dir return 'Ok', 200 + if __name__ == '__main__': app.run(debug=True, host='0.0.0.0') From 531ca79564a8204eb50017439111de0792135893 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Monlla=C3=B3?= Date: Wed, 18 Sep 2019 14:07:31 +0800 Subject: [PATCH 4/5] MDL-66004 Bump the package version --- moodlemlbackend/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moodlemlbackend/VERSION b/moodlemlbackend/VERSION index 7ec1d6d..ccbccc3 100644 --- a/moodlemlbackend/VERSION +++ b/moodlemlbackend/VERSION @@ -1 +1 @@ -2.1.0 +2.2.0 From 38c4ef41cf8bbcdd04b1c1f5d80aa439ac458a24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Monlla=C3=B3?= Date: Thu, 3 Oct 2019 09:17:30 +0800 Subject: [PATCH 5/5] MDL-66004 Fix delete_dir for server-mode --- moodlemlbackend/VERSION | 2 +- moodlemlbackend/webapp/s3.py | 7 ++++--- webapp.py | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/moodlemlbackend/VERSION b/moodlemlbackend/VERSION index ccbccc3..c043eea 100644 --- a/moodlemlbackend/VERSION +++ b/moodlemlbackend/VERSION @@ -1 +1 @@ -2.2.0 +2.2.1 diff --git a/moodlemlbackend/webapp/s3.py b/moodlemlbackend/webapp/s3.py index 98f748f..55e8657 100644 --- a/moodlemlbackend/webapp/s3.py +++ b/moodlemlbackend/webapp/s3.py @@ -71,15 +71,16 @@ def get_model_dir(self, hashkey, fetch_model=False): def delete_dir(self): - s3 = boto3.client('s3') + s3 = boto3.resource('s3') bucketname = os.environ["MOODLE_MLBACKEND_PYTHON_S3_BUCKET_NAME"] + bucket = s3.Bucket(bucketname) # Objectkey will equal uniquemodelid so we delete all files matching # uniquemodelid/ namespace. 
objectkey = self.object_key(False) - for key in s3.listobjects(Bucket=bucketname, Prefix=objectkey + '/'): - key.delete() + + bucket.objects.filter(Prefix=objectkey + '/').delete() def object_key(self, hashkey=False): diff --git a/webapp.py b/webapp.py index e9c97e9..f6661b1 100644 --- a/webapp.py +++ b/webapp.py @@ -160,7 +160,7 @@ def import_model(): @setup_base_dir(storage, False, False) def deletemodel(): # All processing is delegated to delete_dir as it is file system dependant. - storage.delete_dir + storage.delete_dir() return 'Ok', 200