diff --git a/moodlemlbackend/VERSION b/moodlemlbackend/VERSION index 7ec1d6d..c043eea 100644 --- a/moodlemlbackend/VERSION +++ b/moodlemlbackend/VERSION @@ -1 +1 @@ -2.1.0 +2.2.1 diff --git a/moodlemlbackend/import.py b/moodlemlbackend/import.py index 3b6c59d..6c49790 100644 --- a/moodlemlbackend/import.py +++ b/moodlemlbackend/import.py @@ -18,6 +18,7 @@ def import_classifier(): print('Ok') # An exception will be thrown before if it can be imported. + print('Ok') sys.exit(0) import_classifier() diff --git a/moodlemlbackend/model/tensor.py b/moodlemlbackend/model/tensor.py index b020982..e016772 100644 --- a/moodlemlbackend/model/tensor.py +++ b/moodlemlbackend/model/tensor.py @@ -83,6 +83,7 @@ def set_tensor_logdir(self, tensor_logdir): def build_graph(self, initial_weights=False): """Builds the computational graph without feeding any data in""" + # Placeholders for input values. with tf.name_scope('inputs'): self.x = tf.placeholder( diff --git a/moodlemlbackend/processor/estimator.py b/moodlemlbackend/processor/estimator.py index cab2d12..5ee127a 100644 --- a/moodlemlbackend/processor/estimator.py +++ b/moodlemlbackend/processor/estimator.py @@ -161,11 +161,13 @@ def get_metadata(filepath): for row in file_iterator: row_count += 1 if row_count == 1: - data_header = [x for x in csv.reader(row, delimiter=',', quotechar='"')][0] + data_header = [x for x in csv.reader( + row, delimiter=',', quotechar='"')][0] classes_index = data_header.index("targetclasses") features_index = data_header.index("nfeatures") if row_count == 2: - info_row = [x for x in csv.reader(row, delimiter=',', quotechar='"')][0] + info_row = [x for x in csv.reader( + row, delimiter=',', quotechar='"')][0] target_classes = json.loads(info_row[classes_index]) return { "n_classes": len(target_classes), @@ -223,7 +225,6 @@ def __init__(self, modelid, directory, dataset=None): raise OSError('Directory ' + self.tensor_logdir + ' can not be created.') - def get_classifier(self, X, y, 
initial_weights=False): """Gets the classifier""" @@ -357,6 +358,10 @@ def evaluate_dataset(self, filepath, min_score=0.6, X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=0.2) + if len(np.unique(y_train)) < self.n_classes: + # We need the input data to match the expected size of the + # tensor. + continue classifier = self.train(X_train, y_train) @@ -375,7 +380,8 @@ def evaluate_dataset(self, filepath, min_score=0.6, if self.is_binary: logging.info("AUC: %.2f%%", result['auc']) - logging.info("AUC standard deviation: %.4f", result['auc_deviation']) + logging.info("AUC standard deviation: %.4f", + result['auc_deviation']) logging.info("Accuracy: %.2f%%", result['accuracy'] * 100) logging.info("Precision (predicted elements that are real): %.2f%%", result['precision'] * 100) @@ -490,12 +496,19 @@ def get_evaluation_results(self, min_score, accepted_deviation): avg_precision = np.mean(self.precisions) avg_recall = np.mean(self.recalls) avg_mcc = np.mean(self.mccs) + if len(self.aucs) > 0: + avg_aucs = np.mean(self.aucs) + else: + avg_aucs = 0 # MCC goes from -1 to 1 we need to transform it to a value between # 0 and 1 to compare it with the minimum score provided. score = (avg_mcc + 1) / 2 - acc_deviation = np.std(self.mccs) + if len(self.mccs) > 0: + acc_deviation = np.std(self.mccs) + else: + acc_deviation = 1 result = dict() if self.is_binary: result['auc'] = np.mean(self.aucs) @@ -533,17 +546,13 @@ def get_evaluation_results(self, min_score, accepted_deviation): if acc_deviation > accepted_deviation and score < min_score: result['status'] = LOW_SCORE + NOT_ENOUGH_DATA - result['info'].append('Launch TensorBoard from command line by ' + - 'typing: tensorboard --logdir=\'' + - self.get_tensor_logdir() + '\'') - return result def store_classifier(self, trained_classifier): """Stores the classifier and saves a checkpoint of the tensors state""" # Store the graph state. 
# ---------------------------------------------------------------------------
# moodlemlbackend/webapp/access.py
# Relies on: os, re, functools.wraps, flask.request
# ---------------------------------------------------------------------------


def check_access(f):
    '''Restricts the decorated route to the users configured in the server.

    The MOODLE_MLBACKEND_PYTHON_USERS environment var has to contain a
    comma-separated list of username:password pairs. The request must carry
    HTTP basic auth credentials that match one of these pairs.

    The returned wrapper:
    - raises Exception if the environment var is missing or malformed,
    - returns a (message, 401) tuple if the credentials are missing or wrong,
    - returns whatever 'f' returns otherwise.'''

    # Local import so this block does not depend on a new file-level import.
    import hmac

    envvarname = "MOODLE_MLBACKEND_PYTHON_USERS"

    # Anything outside of this set is rejected. ':' has to be allowed as it
    # separates the username from the password.
    forbiddenchars = r'[^A-Za-z0-9_\-,$:]'

    def matches(expected, provided):
        '''Constant-time comparison, to not leak how much of it matched.'''
        return hmac.compare_digest(
            expected.encode('utf-8'), provided.encode('utf-8'))

    @wraps(f)
    def access_wrapper(*args, **kwargs):

        # Check that the environment var is properly set.
        if envvarname not in os.environ:
            raise Exception(
                envvarname + ' environment var is not set in the server.')

        # re.search(pattern, string): the pattern goes first and the whole
        # value has to be scanned, not just its first character.
        if re.search(forbiddenchars, os.environ[envvarname]):
            raise Exception(
                'The value of ' + envvarname + ' environment var may only ' +
                'contain letters, digits and the characters _ - , $ :')

        users = os.environ[envvarname].split(',')

        authorization = request.authorization
        if (authorization is None or
                authorization.username is None or
                authorization.password is None):
            # Response for the client.
            return 'No user and/or password included in the request.', 401

        for user in users:
            userdata = user.split(':')
            if len(userdata) != 2:
                raise Exception('Incorrect format for ' +
                                envvarname + ' environment var. It should ' +
                                'contain a comma-separated list of ' +
                                'username:password.')

            if (matches(userdata[0], authorization.username) and
                    matches(userdata[1], authorization.password)):

                # If all good we return the return from 'f' passing the
                # original list of params to it.
                return f(*args, **kwargs)

        # Response for the client.
        return 'Incorrect user and/or password provided by Moodle.', 401

    return access_wrapper


# ---------------------------------------------------------------------------
# moodlemlbackend/webapp/localfs.py
# Relies on: os, shutil, functools.wraps, functools.update_wrapper,
#            moodlemlbackend.webapp.util.get_request_value
# ---------------------------------------------------------------------------

# We can not set LocalFS_setup_base_dir as a nested class because they
# have problems to access the outer class.


class LocalFS(object):
    '''Storage backend that keeps the models in the local file system.'''

    # Base directory for all the models. None until set_localbasedir() is
    # called, so get_localbasedir() can report it instead of failing with an
    # AttributeError.
    localbasedir = None

    def get_localbasedir(self):
        '''Returns the base dir, raises Exception if it has not been set.'''
        if self.localbasedir is None:
            raise Exception('localbasedir is not set')

        return self.localbasedir

    def set_localbasedir(self, basedir):
        '''Sets the directory where all the models are stored.'''
        self.localbasedir = basedir

    def get_model_dir(self, hashkey, fetch_model=False):
        '''Returns the model dir in the local fs for the provided key.

        hashkey is the name of the request param that contains the dir hash.
        fetch_model param is ignored here.'''

        uniquemodelid = get_request_value('uniqueid')
        dirhash = get_request_value(hashkey)

        # The dir in the local filesystem is namespaced by uniquemodelid and
        # the dirhash which determines where the results should be stored.
        return os.path.join(self.get_localbasedir(), uniquemodelid, dirhash)

    def delete_dir(self):
        '''Deletes all the files of the model referenced by the request.

        Raises Exception if the model id is empty once sanitised.'''

        uniquemodelid = get_request_value('uniqueid')

        # get_request_value strips the characters that are not allowed, so
        # an id like '../' ends up empty. Joining an empty id returns the
        # base dir itself and we would wipe every single model.
        if not uniquemodelid:
            raise Exception('The provided uniqueid is not valid.')

        # All files related to this version of the model in moodle are in
        # /uniquemodelid.
        modeldir = os.path.join(self.get_localbasedir(), uniquemodelid)

        if os.path.exists(modeldir):
            # The directory may not exist.
            shutil.rmtree(modeldir, True)


class LocalFS_setup_base_dir(object):
    '''Route decorator that points the storage to the configured base dir.'''

    def __init__(self, storage, fetch_model, push_model):
        '''Checks that the local directory is set in ENV.

        fetch_model and push_model are ignored in local_fs.

        Raises KeyError if MOODLE_MLBACKEND_PYTHON_DIR is not set and
        IOError if the directory does not exist or is not writable.'''

        self.storage = storage

        localbasedir = os.environ["MOODLE_MLBACKEND_PYTHON_DIR"]

        if os.path.exists(localbasedir) is False:
            raise IOError(
                'The base dir does not exist. ' +
                'Set env MOODLE_MLBACKEND_PYTHON_DIR to an existing dir')

        # os.access only returns a bool, the result has to be checked.
        if os.access(localbasedir, os.W_OK) is False:
            raise IOError(
                'The base dir is not writable. ' +
                'Set env MOODLE_MLBACKEND_PYTHON_DIR to a writable dir')

        storage.set_localbasedir(localbasedir)

    def __call__(self, f):
        '''Returns a wrapper that just executes the decorated function.'''

        # Done once here, there is no need to repeat it on every request.
        update_wrapper(self, f)

        @wraps(f)
        def wrapper(*args, **kwargs):
            '''Execute the decorated function passing the call args.'''
            return f(*args, **kwargs)
        return wrapper
# ---------------------------------------------------------------------------
# moodlemlbackend/webapp/s3.py
# Relies on: logging, os, shutil, tempfile, zipfile, functools.wraps,
#            functools.update_wrapper, boto3, botocore.exceptions.ClientError,
#            get_request_value and zipdir (moodlemlbackend.webapp.util)
# ---------------------------------------------------------------------------

# Kept for backwards compatibility, the storage instance holds the real value.
localbasedir = None


class S3(object):
    '''Storage backend that keeps the trained classifiers in a S3 bucket.

    The local base dir is just a scratch area used during a request.'''

    # None until set_localbasedir() is called, so get_localbasedir() can
    # report it instead of failing with an AttributeError.
    localbasedir = None

    def get_localbasedir(self):
        '''Returns the base dir, raises Exception if it has not been set.'''
        if self.localbasedir is None:
            raise Exception('localbasedir is not set')

        return self.localbasedir

    def set_localbasedir(self, basedir):
        '''Sets the local directory used as scratch area.'''
        self.localbasedir = basedir

    def get_model_dir(self, hashkey, fetch_model=False):
        '''Returns the model dir in the local fs for the provided key.

        Syncs the model with S3 if required: when fetch_model is True the
        classifier stored in S3 (if any) is extracted into the 'classifier'
        subdirectory of the returned dir.'''

        # No need to include uniquemodelid here, because this is all lives in
        # a temp dir in the local file system.
        modeldir = os.path.join(self.get_localbasedir(), hashkey)

        if not fetch_model:
            return modeldir

        s3 = boto3.client('s3')

        # Download the files for the provided uniquemodelid + modelhash
        bucketname = os.environ["MOODLE_MLBACKEND_PYTHON_S3_BUCKET_NAME"]
        objectkey = self.object_key(hashkey)
        classifierdir = os.path.join(modeldir, 'classifier')

        # The context manager closes (and deletes) the temp file for us.
        with tempfile.NamedTemporaryFile() as classifierzip:
            try:
                s3.download_fileobj(bucketname, objectkey, classifierzip)
            except ClientError:
                # No worries, it may perfectly not exist.
                return modeldir

            # The size is read from the file system, so the buffered data
            # has to be written out first or small files look empty.
            classifierzip.flush()

            if os.path.getsize(classifierzip.name) > 0:
                # The classifier directory is automatically created in
                # moodlemlbackend.estimator but we need to create it
                # before that point as we want to copy the classifier
                # from S3. It can already exist in some cases.
                os.makedirs(classifierdir, exist_ok=True)
                with zipfile.ZipFile(classifierzip, 'r') as zipobject:
                    zipobject.extractall(classifierdir)

        return modeldir

    def delete_dir(self):
        '''Deletes from S3 all the objects of the model in the request.

        Raises Exception if the model id is empty once sanitised.'''

        # Objectkey will equal uniquemodelid so we delete all files matching
        # uniquemodelid/ namespace.
        objectkey = self.object_key(False)
        if not objectkey:
            # An empty id would turn the prefix into a bare '/'.
            raise Exception('The provided uniqueid is not valid.')

        s3 = boto3.resource('s3')

        bucketname = os.environ["MOODLE_MLBACKEND_PYTHON_S3_BUCKET_NAME"]
        bucket = s3.Bucket(bucketname)

        bucket.objects.filter(Prefix=objectkey + '/').delete()

    def object_key(self, hashkey=False):
        '''Returns the S3 object key for the model in the request.

        Without hashkey it returns the model namespace (uniquemodelid),
        otherwise uniquemodelid/<value of the hashkey request param>.'''

        # Local import so this block does not depend on a new file-level
        # import. S3 keys always use '/', regardless of the server OS.
        import posixpath

        uniquemodelid = get_request_value('uniqueid')

        if hashkey is False:
            return uniquemodelid

        dirhash = get_request_value(hashkey)
        return posixpath.join(uniquemodelid, dirhash)


class S3_setup_base_dir(object):
    '''Route decorator that runs each request in its own temp directory.'''

    def __init__(self, storage, fetch_model, push_model):
        '''Stores what has to be synced with S3 for the decorated route.

        fetch_model: download the classifier from S3 before the route runs.
        push_model: upload the classifier to S3 after the route runs.'''

        self.storage = storage
        self.fetch_model = fetch_model
        self.push_model = push_model

        # The temp directory is created per request in the wrapper. This
        # constructor only runs once, when the route is declared, and the
        # directory is deleted when a request finishes, so a directory
        # created here would be gone after the very first request.

    def __call__(self, f):
        '''Returns the wrapper that surrounds 'f' with the S3 syncing.'''

        update_wrapper(self, f)

        @wraps(f)
        def wrapper(*args, **kwargs):
            '''Execute the decorated function.

            Upload the model to s3 if required.'''

            # NOTE: the storage object is shared between requests, like the
            # rest of this module this assumes that requests do not run
            # concurrently in the same process.
            self.storage.set_localbasedir(tempfile.mkdtemp())
            try:
                self.modeldir = self.storage.get_model_dir(
                    'dirhash', fetch_model=self.fetch_model)

                # Execute the requested action.
                funcreturn = f(*args, **kwargs)

                if self.push_model is True:
                    error = self._push_model()
                    if error is not None:
                        return error

                # TODO Think about copying the new logs to S3.

                # Now that the files are copied back to S3 we can return
                # f's Response.
                return funcreturn
            finally:
                # It is our responsibility to delete tmp directories created
                # with mkdtemp, also when something failed on the way.
                shutil.rmtree(self.storage.get_localbasedir(), True)

        return wrapper

    def _push_model(self):
        '''Uploads the classifier in the model dir to S3.

        Returns None if all good or the (message, status) response for the
        client if the upload failed.'''

        s3 = boto3.client('s3')

        classifierdir = os.path.join(self.modeldir, 'classifier')

        # We are only interested in the model we just trained.
        bucketname = os.environ["MOODLE_MLBACKEND_PYTHON_S3_BUCKET_NAME"]
        objectkey = self.storage.object_key('dirhash')

        with tempfile.NamedTemporaryFile() as updatedclassifierzip:
            # Copy the classifier in the model dir to S3.
            zipdir(classifierdir, updatedclassifierzip)

            # upload_file reads the file by name, make sure it is complete.
            updatedclassifierzip.flush()

            try:
                s3.upload_file(
                    updatedclassifierzip.name, bucketname, objectkey)
            except ClientError as e:
                # We don't want the error details in moodle as they could
                # contain sensitive information.
                logging.error('Error uploading the model to S3: ' + str(e))
                return 'Can\'t upload classifier to S3.', 500

        return None


# ---------------------------------------------------------------------------
# moodlemlbackend/webapp/util.py
# Relies on: re, os, zipfile, tempfile, shutil, atexit, flask.request
# ---------------------------------------------------------------------------


def get_request_value(key, pattern=False, exception=True):
    '''Returns the sanitised value of a request param.

    key: name of the request param.
    pattern: regex matching the characters to REMOVE from the value. By
        default everything but letters, digits, '_', '-' and '$'.
    exception: what to do if the param is not part of the request, True
        raises an Exception and False returns False.'''

    if pattern is False:
        pattern = r'[^A-Za-z0-9_\-$]'

    value = request.values.get(key)
    if value is None:

        if exception is True:
            raise Exception('The requested key ' + key + ' is not available.')
        return False

    return re.sub(pattern, '', value)


def get_file_path(localbasedir, filekey):
    '''Saves the uploaded file to a temp dir and returns its path.

    localbasedir is currently not used. filekey is the name of the file in
    the request. The temp dir is deleted when the process exits.'''

    upload = request.files[filekey]

    tempdir = tempfile.mkdtemp()
    tempfilepath = os.path.join(tempdir, filekey)

    # Ignore errors, the directory may be gone by the time we exit.
    atexit.register(shutil.rmtree, tempdir, True)
    upload.save(tempfilepath)

    return tempfilepath


def zipdir(dirpath, zipf):
    '''Zips the contents of dirpath, recursively, into zipf.

    zipf can be a path or a file-like object. The names in the zip are
    relative to dirpath. Returns the (already closed) ZipFile.'''

    with zipfile.ZipFile(zipf, 'w', zipfile.ZIP_DEFLATED) as ziph:
        for root, _dirs, files in os.walk(dirpath):
            for filename in files:
                abspath = os.path.join(root, filename)
                # Relative to the zipped dir, not to the dir we are walking
                # through, otherwise the subdirectories are flattened and
                # files with the same name collide.
                ziph.write(abspath, os.path.relpath(abspath, dirpath))

    # Callers read the zip by its file name, so nothing can stay buffered.
    flush = getattr(zipf, 'flush', None)
    if callable(flush):
        flush()

    return ziph
# ---------------------------------------------------------------------------
# webapp.py
# ---------------------------------------------------------------------------
import os
import json
import tempfile
import zipfile

from flask import Flask, send_file, Response

from moodlemlbackend.processor import estimator

from moodlemlbackend.webapp.localfs import LocalFS, LocalFS_setup_base_dir
from moodlemlbackend.webapp.s3 import S3, S3_setup_base_dir
from moodlemlbackend.webapp.access import check_access
from moodlemlbackend.webapp.util import get_request_value, get_file_path
from moodlemlbackend.webapp.util import zipdir

app = Flask(__name__)

# S3 or the local file system depending on the presence of this ENV var.
if "MOODLE_MLBACKEND_PYTHON_S3_BUCKET_NAME" in os.environ:
    storage = S3()
    setup_base_dir = S3_setup_base_dir
else:
    storage = LocalFS()
    setup_base_dir = LocalFS_setup_base_dir


def _send_zip(dirpath):
    '''Zips dirpath into a temp file and returns the response that sends it.

    The open file is handed over to flask, so the temp file lives until the
    response has been sent and is deleted once it gets closed.'''

    zipf = tempfile.NamedTemporaryFile()
    zipdir(dirpath, zipf)

    # Flush and rewind: what stays in the buffer would not be sent.
    zipf.flush()
    zipf.seek(0)
    return send_file(zipf, mimetype='application/zip')


@app.route('/version', methods=['GET'])
def version():
    '''Returns the version of the moodlemlbackend package.'''
    here = os.path.abspath(os.path.dirname(__file__))
    versionpath = os.path.join(here, 'moodlemlbackend', 'VERSION')
    with open(versionpath) as version_file:
        return version_file.read().strip()


@app.route('/training', methods=['POST'])
@check_access
@setup_base_dir(storage, True, True)
def training():
    '''Trains the model with the dataset included in the request.

    Returns the training results as a JSON string.'''

    uniquemodelid = get_request_value('uniqueid')
    modeldir = storage.get_model_dir('dirhash')

    datasetpath = get_file_path(storage.get_localbasedir(), 'dataset')

    classifier = estimator.Classifier(uniquemodelid, modeldir, datasetpath)
    result = classifier.train_dataset(datasetpath)

    return json.dumps(result)


@app.route('/prediction', methods=['POST'])
@check_access
@setup_base_dir(storage, True, True)
def prediction():
    '''Gets predictions for the dataset included in the request.

    Returns the predictions as a JSON string.'''

    uniquemodelid = get_request_value('uniqueid')
    modeldir = storage.get_model_dir('dirhash')

    datasetpath = get_file_path(storage.get_localbasedir(), 'dataset')

    classifier = estimator.Classifier(uniquemodelid, modeldir, datasetpath)
    result = classifier.predict_dataset(datasetpath)

    return json.dumps(result)


@app.route('/evaluation', methods=['POST'])
@check_access
@setup_base_dir(storage, False, False)
def evaluation():
    '''Evaluates the model using the dataset included in the request.

    minscore, maxdeviation and niterations are read from the request. If
    trainedmodeldirhash is provided that trained model is passed on to the
    evaluation. Returns the evaluation results as a JSON string.'''

    uniquemodelid = get_request_value('uniqueid')
    modeldir = storage.get_model_dir('dirhash')

    minscore = get_request_value('minscore', pattern='[^0-9.$]')
    maxdeviation = get_request_value('maxdeviation', pattern='[^0-9.$]')
    niterations = get_request_value('niterations', pattern='[^0-9$]')

    datasetpath = get_file_path(storage.get_localbasedir(), 'dataset')

    trainedmodeldirhash = get_request_value(
        'trainedmodeldirhash', exception=False)
    if trainedmodeldirhash is not False:
        # The trained model dir in the server is namespaced by uniquemodelid
        # and the trainedmodeldirhash which determines where should the results
        # be stored.
        trainedmodeldir = storage.get_model_dir(
            'trainedmodeldirhash', fetch_model=True)
    else:
        trainedmodeldir = False

    classifier = estimator.Classifier(uniquemodelid, modeldir, datasetpath)
    result = classifier.evaluate_dataset(datasetpath,
                                         float(minscore),
                                         float(maxdeviation),
                                         int(niterations),
                                         trainedmodeldir)

    return json.dumps(result)


@app.route('/evaluationlog', methods=['GET'])
@check_access
@setup_base_dir(storage, True, False)
def evaluationlog():
    '''Returns a zip with the logs of the evaluation run in the request.'''

    modeldir = storage.get_model_dir('dirhash')
    runid = get_request_value('runid', '[^0-9$]')
    logsdir = os.path.join(modeldir, 'logs', runid)

    return _send_zip(logsdir)


@app.route('/export', methods=['GET'])
@check_access
@setup_base_dir(storage, True, False)
def export():
    '''Returns a zip with the exported classifier.

    Answers with a 503 if there is nothing to export.'''

    uniquemodelid = get_request_value('uniqueid')
    modeldir = storage.get_model_dir('dirhash')

    # We can use a temp directory for the export data
    # as we don't need to keep it forever. It is deleted as soon as the zip
    # has been generated.
    with tempfile.TemporaryDirectory() as tempdir:

        classifier = estimator.Classifier(uniquemodelid, modeldir)
        exportdir = classifier.export_classifier(tempdir)
        if exportdir is False:
            return Response('There is nothing to export.', 503)

        return _send_zip(exportdir)


@app.route('/import', methods=['POST'])
@check_access
@setup_base_dir(storage, False, True)
def import_model():
    '''Imports the classifier contained in the zip sent in the request.'''

    uniquemodelid = get_request_value('uniqueid')
    modeldir = storage.get_model_dir('dirhash')

    importzippath = get_file_path(storage.get_localbasedir(), 'importzip')

    # The extracted files are only needed while they are being imported.
    with tempfile.TemporaryDirectory() as importtempdir:
        with zipfile.ZipFile(importzippath, 'r') as zipobject:
            zipobject.extractall(importtempdir)

        classifier = estimator.Classifier(uniquemodelid, modeldir)
        classifier.import_classifier(importtempdir)

    return 'Ok', 200


@app.route('/deletemodel', methods=['POST'])
@check_access
@setup_base_dir(storage, False, False)
def deletemodel():
    '''Deletes all the data stored for the model in the request.'''
    # All processing is delegated to delete_dir as it is file system dependant.
    storage.delete_dir()
    return 'Ok', 200


if __name__ == '__main__':
    # The debugger allows the execution of arbitrary code, so it is not
    # forced on while listening on all interfaces. Debug mode can still be
    # enabled for development through flask's FLASK_DEBUG env var.
    app.run(host='0.0.0.0')