diff --git a/docs/conf.py b/docs/conf.py
index 2152b6f..2f28038 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -1,5 +1,27 @@
+import os
+import pathlib
+import sys
+
+import django
+
+import libcoveweb2.settings
+
 project = "LibCoveWeb2"
 master_doc = "index"
 html_theme = "odsc_default_sphinx_theme"
+
+extensions = [
+    "sphinx.ext.autodoc",
+]
+
+###### Make Sphinx able to document our Python code
+
+# We need to be able to see our code, so add this directory to the path
+sys.path.insert(0, str(pathlib.Path("..").resolve()))
+
+# We need to set up Django enough that it won't complain
+libcoveweb2.settings.INSTALLED_APPS += ("libcoveweb2",)
+os.environ["DJANGO_SETTINGS_MODULE"] = "libcoveweb2.settings"
+django.setup()
diff --git a/docs/django-settings.rst b/docs/django-settings.rst
new file mode 100644
index 0000000..eae6880
--- /dev/null
+++ b/docs/django-settings.rst
@@ -0,0 +1,48 @@
+Django Settings
+===============
+
+To use this app you'll need to define several settings.
+
+Process Tasks
+-------------
+
+You need to define a `PROCESS_TASKS` setting. This lists all the tasks that will be run for each piece of uploaded data, in order of processing.
+
+It should be a list of tuples, and every tuple should be `('Python module', 'Python class name')`.
+Each class should extend `libcoveweb2.process.base.ProcessDataTask`.
+
+Example:
+
+.. code-block:: python
+
+    PROCESS_TASKS = [
+        # Get data if not already on disk
+        ("libcoveweb2.process.common_tasks.download_data_task", "DownloadDataTask"),
+        ...
+    ]
+
+Celery Message Queue
+--------------------
+
+Any Celery settings needed must be set up.
+
+At a minimum this will include `CELERY_BROKER_URL`.
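+
+For example, a minimal sketch, assuming Redis is used as the broker (the URL below is illustrative, not a default provided by this library):
+
+.. code-block:: python
+
+    # Illustrative only: any Celery-supported broker URL works here
+    CELERY_BROKER_URL = "redis://localhost:6379/0"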
+
+Settings to copy from the library which have sensible defaults
+---------------------------------------------------------------
+
+This application also needs several configuration values that already have defaults set. In most cases you can just reuse these defaults.
+
+:doc:`For a list of these settings, see here. <python-api/settings>`
+
+To do so, you can do something like this in your Django project's main settings.py file:
+
+.. code-block:: python
+
+    from libcoveweb2 import settings
+
+    ALLOWED_JSON_CONTENT_TYPES = settings.ALLOWED_JSON_CONTENT_TYPES
+    ALLOWED_JSON_EXTENSIONS = settings.ALLOWED_JSON_EXTENSIONS
+    ...
diff --git a/docs/hosting/requirements.rst b/docs/hosting/requirements.rst
new file mode 100644
index 0000000..b3f9bd3
--- /dev/null
+++ b/docs/hosting/requirements.rst
@@ -0,0 +1,30 @@
+Hosting Requirements
+====================
+
+Python server for the Django app
+----------------------------------
+
+Any of the normal options for hosting a Django app (e.g. Gunicorn or uWSGI behind a web server).
+
+Database
+----------
+
+This is tested with PostgreSQL.
+
+Message queue compatible with Celery
+--------------------------------------
+
+Any of the normal options for a Celery broker (e.g. Redis or RabbitMQ).
+
+File Storage
+------------
+
+TODO
+
+Cron tasks
+----------
+
+Some Django management commands should be run as cron tasks.
+
+* `expire_files` should be run daily (e.g. `python manage.py expire_files`)
diff --git a/docs/index.rst b/docs/index.rst
index d23eedc..ae1e5af 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,3 +1,31 @@
 LibCoveWeb2
 ===========
+
+LibCoveWeb2 is a Django application to use as a library in your own Django apps.
+
+It lets you create CoVE apps. CoVE exists to help people:
+
+* Convert data between common formats (e.g. CSV to JSON)
+* Validate data against rules
+* Explore data that machines find easy to read, but humans find harder
+
+The application consists of:
+
+* Database models to save details of user-submitted data
+* File storage space to save the user-submitted data and cache the results of processing
+* A message queue
+* Workers to process the data according to tasks you provide (there is a library of common tasks in this application)
+* A view to show users output information from the cache of results
+
+
+.. toctree::
+   :maxdepth: 2
+
+   processing-pipeline.rst
+   django-settings.rst
+   python-api/index.rst
+   migration-from-lib-cove-web.rst
+   hosting/requirements.rst
+   used-by.rst
diff --git a/docs/migration-from-lib-cove-web.rst b/docs/migration-from-lib-cove-web.rst
new file mode 100644
index 0000000..ce96db7
--- /dev/null
+++ b/docs/migration-from-lib-cove-web.rst
@@ -0,0 +1,13 @@
+Migration from lib-cove-web
+===========================
+
+This library is an updated version of the previous library ( https://github.com/OpenDataServices/lib-cove-web ).
+
+However, the structure of the library and how it works have changed massively.
+For this reason the upgrade path isn't a simple upgrade; it requires rewriting parts of the software that uses this library.
+
+This is why this library is a separate git repository and a separate PyPI package,
+so that development of the previous library can also continue and users are not forced to upgrade before they are ready.
diff --git a/docs/processing-pipeline.rst b/docs/processing-pipeline.rst
new file mode 100644
index 0000000..3b212e0
--- /dev/null
+++ b/docs/processing-pipeline.rst
@@ -0,0 +1,84 @@
+Processing Pipeline
+===================
+
+Why?
+----
+
+The application lets you define a series of tasks that will be run for each piece of uploaded data, in order.
+
+Tasks need to be defined by each app, but there is a library of common tasks to make this easier.
+
+This allows for maximum flexibility: each app can define the tasks it needs, including non-standard tasks that are not used by other CoVEs.
+(For example, BODS CoVE has a sample mode. When the user uploads a large file, they can choose to run sample mode and only check some of it.
+This is accomplished by a special task towards the start of the pipeline that generates a smaller file from the uploaded file.)
+
+What happens when the user uploads data?
+----------------------------------------
+
+The background worker will start processing the data and the user will be redirected to the results page.
+
+What happens when the user looks at a results page?
+---------------------------------------------------
+
+Every time a user views a results page, the system will check the state of that data.
+
+If it's currently being processed, the user will see a progress page with a wait message.
+
+If it's not currently being processed, the system will call the `is_processing_applicable` and `is_processing_needed` methods on each task to see if any work is needed.
+
+If there is work to do, the system will start the work and the user will see a progress page with a wait message.
+This means that even after a task first finishes, it can change its mind and request to do more work.
+(The most common use case for this is if the software is upgraded and how the processing is done has changed.)
+
+If there is no work to do, the system will show a results page to the user.
+`get_context` will be called on every task, so the task can load results from its cache and present them to the user.
+
+Other pages that may be shown to the user include:
+
+* An error page, if a Python error occurred
+* An expired page, if the data is so old that it has been expired and removed from the system
+
+How is the data actually processed?
+-----------------------------------
+
+To process the data, the background worker will call `process` on each task.
+This can take as long as it needs, and the results should be cached for speedy loading later.
+
+Early tasks can also return data that will be passed to later tasks.
+This means any information or work that is needed by multiple tasks does not need to be done multiple times, but can be done once and then reused.
+
+How should I define my tasks?
+-----------------------------
+
+Each task should be defined by extending a class, as shown in the sketch below. :doc:`For more information on the base class, see here. <python-api/process/base>`
+
+And your tasks should then be defined in settings. :doc:`For more information on settings, see here. <django-settings>`
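+
+For illustration, here is a minimal sketch of a custom task. The class name, the cache path, and the `text` key in `process_data` are hypothetical; only the four overridden methods come from the base class:
+
+.. code-block:: python
+
+    import json
+    import os
+
+    from libcoveweb2.process.base import ProcessDataTask
+
+
+    class WordCountTask(ProcessDataTask):
+        """Hypothetical task: counts words in previously parsed text."""
+
+        # Hypothetical cache location for this sketch; a real task would
+        # store its results alongside the supplied data they belong to.
+        CACHE_FILENAME = "/tmp/word_count.json"
+
+        def is_processing_applicable(self) -> bool:
+            # This sketch applies to every upload
+            return True
+
+        def is_processing_needed(self) -> bool:
+            # Runs during a web request, so keep it fast: work is only
+            # needed if our cached result does not exist yet
+            return not os.path.exists(self.CACHE_FILENAME)
+
+        def process(self, process_data: dict) -> dict:
+            # Runs in the background worker, so it can be slow; cache
+            # the result for get_context() to read later
+            result = {"word_count": len(str(process_data.get("text", "")).split())}
+            with open(self.CACHE_FILENAME, "w") as fp:
+                json.dump(result, fp)
+            # Anything added to process_data here is passed to later tasks
+            return process_data
+
+        def get_context(self):
+            # Called when building the results page; load from the cache
+            with open(self.CACHE_FILENAME) as fp:
+                return json.load(fp)
+
+A task like this would then be listed in `PROCESS_TASKS`, like the pipeline example below.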
+
+An example task pipeline
+------------------------
+
+.. code-block:: python
+
+    PROCESS_TASKS = [
+        # Get data if not already on disk - if the user provided a URL
+        ("libcoveweb2.process.common_tasks.download_data_task", "DownloadDataTask"),
+        # BODS has a special sample mode.
+        # If that's activated, we'll make the sample data now for later tasks to use.
+        ("cove_bods.process", "Sample"),
+        # Make sure uploads are in the primary format - for BODS that is JSON -
+        # so any spreadsheets uploaded should be converted
+        ("cove_bods.process", "WasJSONUploaded"),
+        ("cove_bods.process", "ConvertSpreadsheetIntoJSON"),
+        # Some information is reused in multiple tasks to come,
+        # so we'll process it once now and later tasks can reuse it.
+        ("cove_bods.process", "GetDataReaderAndConfigAndSchema"),
+        # Convert from the primary JSON format into other output formats
+        ("cove_bods.process", "ConvertJSONIntoSpreadsheets"),
+        # Check and generate statistics from the JSON data
+        ("cove_bods.process", "AdditionalFieldsChecksTask"),
+        ("cove_bods.process", "PythonValidateTask"),
+        ("cove_bods.process", "JsonSchemaValidateTask"),
+    ]
diff --git a/docs/python-api/index.rst b/docs/python-api/index.rst
new file mode 100644
index 0000000..8c77df4
--- /dev/null
+++ b/docs/python-api/index.rst
@@ -0,0 +1,15 @@
+Python API
+==========
+
+This section documents the Python API that software using this library should consider.
+
+It does not document Python code that is not intended for reuse by others (you can read the source code for that).
+
+.. toctree::
+   :maxdepth: 2
+
+   settings.rst
+   process/base.rst
+   process/common_tasks/download_data_task.rst
+   process/common_tasks/task_with_state.rst
diff --git a/docs/python-api/process/base.rst b/docs/python-api/process/base.rst
new file mode 100644
index 0000000..7f0fe96
--- /dev/null
+++ b/docs/python-api/process/base.rst
@@ -0,0 +1,11 @@
+Process Task Base
+=================
+
+:doc:`For more information on how to use this class, see here. <../../processing-pipeline>`
+
+You probably want to process some data, get a result and cache it.
+There is another class, `TaskWithState`, that helps you do this, and so it may be more useful for you.
+:doc:`For more information on this helper class, see here. <common_tasks/task_with_state>`
+
+.. autoclass:: libcoveweb2.process.base.ProcessDataTask
+   :members:
diff --git a/docs/python-api/process/common_tasks/download_data_task.rst b/docs/python-api/process/common_tasks/download_data_task.rst
new file mode 100644
index 0000000..f33e39a
--- /dev/null
+++ b/docs/python-api/process/common_tasks/download_data_task.rst
@@ -0,0 +1,5 @@
+Common Process Task: Download Data Task
+=======================================
+
+.. autoclass:: libcoveweb2.process.common_tasks.download_data_task.DownloadDataTask
diff --git a/docs/python-api/process/common_tasks/task_with_state.rst b/docs/python-api/process/common_tasks/task_with_state.rst
new file mode 100644
index 0000000..ec27fe6
--- /dev/null
+++ b/docs/python-api/process/common_tasks/task_with_state.rst
@@ -0,0 +1,7 @@
+Common Process Task: Task With State
+====================================
+
+.. autoclass:: libcoveweb2.process.common_tasks.task_with_state.TaskWithState
+   :members: state_filename, process_get_state
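+
+For illustration, here is a minimal sketch of a subclass (the class name, the state filename, and the `items` key in `process_data` are hypothetical):
+
+.. code-block:: python
+
+    from libcoveweb2.process.common_tasks.task_with_state import TaskWithState
+
+
+    class ItemCountTask(TaskWithState):
+        # Unique per task; changing this forces the task to rerun
+        state_filename: str = "item_count.json"
+
+        def process_get_state(self, process_data: dict):
+            # First item of the returned tuple: results to save.
+            # Second item: process_data, passed on to later tasks.
+            state = {"number_of_items": len(process_data.get("items", []))}
+            return state, process_data
+
+Your subclass only needs to produce the state; the helper class deals with caching it for the results page.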
diff --git a/docs/python-api/settings.rst b/docs/python-api/settings.rst
new file mode 100644
index 0000000..e1b0ae4
--- /dev/null
+++ b/docs/python-api/settings.rst
@@ -0,0 +1,12 @@
+Settings
+========
+
+The application includes a settings module with several settings that have default values.
+If you don't need to change them, you can just take the defaults from this module.
+
+:doc:`For more information on setting up the settings you need, see here. <../django-settings>`
+
+.. automodule:: libcoveweb2.settings
+   :members:
diff --git a/docs/used-by.rst b/docs/used-by.rst
new file mode 100644
index 0000000..9e457ff
--- /dev/null
+++ b/docs/used-by.rst
@@ -0,0 +1,10 @@
+Used by
+=======
+
+This library is used by:
+
+* https://github.com/openownership/cove-bods
+* https://github.com/Open-Telecoms-Data/cove-ofds
+* https://github.com/GFDRR/rdls-cove
diff --git a/libcoveweb2/process/base.py b/libcoveweb2/process/base.py
index a7f8374..aeacc78 100644
--- a/libcoveweb2/process/base.py
+++ b/libcoveweb2/process/base.py
@@ -18,12 +18,18 @@ def is_processing_applicable(self) -> bool:
 
         eg. A task to convert a spreadsheet to JSON will never be applicable
         if JSON is uploaded in the first place.
+
+        eg. A task to check the data against JSON Schema will always be applicable.
+
+        This method is called during a user's web request, so it must not take a long time to finish.
         """
         return False
 
     def is_processing_needed(self) -> bool:
-        """Should return True if this task needs to do any processing"""
+        """Should return True if this task needs to do any processing.
+
+        This method is called during a user's web request, so it must not take a long time to finish.
+        """
         return False
 
     def process(self, process_data: dict) -> dict:
@@ -39,7 +45,10 @@ def process(self, process_data: dict) -> dict:
 
         You should do your own checks to make sure you are not doing unneeded work.
         This is so you can still add relevant info to process_data dict.
 
-        But it's not called if is_processing_applicable() is false."""
+        But it's not called if is_processing_applicable() is false.
+
+        This is only called by the background worker, so it can take as long as it needs to do its work.
+ """ return process_data def get_context(self): diff --git a/libcoveweb2/process/common_tasks/download_data_task.py b/libcoveweb2/process/common_tasks/download_data_task.py index 17e05bf..dd65aaa 100644 --- a/libcoveweb2/process/common_tasks/download_data_task.py +++ b/libcoveweb2/process/common_tasks/download_data_task.py @@ -2,7 +2,9 @@ class DownloadDataTask(ProcessDataTask): - """If user gave us a URL, we download it now.""" + """If user gave us a URL, we download it now. + + It is possible for apps to use this class with no further configuration.""" def is_processing_applicable(self) -> bool: for supplied_data_file in self.supplied_data_files: diff --git a/libcoveweb2/process/common_tasks/task_with_state.py b/libcoveweb2/process/common_tasks/task_with_state.py index 9ae068b..0a82ebd 100644 --- a/libcoveweb2/process/common_tasks/task_with_state.py +++ b/libcoveweb2/process/common_tasks/task_with_state.py @@ -13,17 +13,21 @@ class TaskWithState(ProcessDataTask): Extend and provide your own state_filename and process_get_state. """ - """Set state_filename to a unique name for each task. - If you change this name the task will be rerun, so this is is a good way to - make sure all underlying data changes if a new version of this bit of cove - is released.""" - state_filename: str = "task_with_state.py" + #: Set state_filename to a unique name for each task. + #: + #: If you change this name the task will be rerun, so this is is a good way to + #: make sure all underlying data changes if a new version of this bit of cove + #: is released. + state_filename: str = "task_with_state.json" def process_get_state(self, process_data: dict): - """Should return a dict that is the state to save, and process_data. - Is only called if there is work to do, so does not need to worry about - checking that. - """ + """Called to process data. + + Is only called if there is work to do, so does not need to worry about checking that. + + Should return a tuple. + The first item is the results to save, as a dictionary. + The second item is process_data, as a dictionary.""" return {}, process_data def process(self, process_data: dict) -> dict: diff --git a/libcoveweb2/settings.py b/libcoveweb2/settings.py index 8938158..f4a30c9 100644 --- a/libcoveweb2/settings.py +++ b/libcoveweb2/settings.py @@ -29,16 +29,20 @@ DELETE_FILES_AFTER_DAYS=(int, 7), ) +#: Details for Piwik/Matamo tracking, if wanted. Should be a dictionary with the following keys: url, site_id and dimension_map PIWIK = { "url": env("PIWIK_URL"), "site_id": env("PIWIK_SITE_ID"), "dimension_map": env("PIWIK_DIMENSION_MAP"), } +#: Details for Google Analytics tracking, if wanted. Should be a string of the id. GOOGLE_ANALYTICS_ID = env("GOOGLE_ANALYTICS_ID") VALIDATION_ERROR_LOCATIONS_LENGTH = env("VALIDATION_ERROR_LOCATIONS_LENGTH") VALIDATION_ERROR_LOCATIONS_SAMPLE = env("VALIDATION_ERROR_LOCATIONS_SAMPLE") +#: Uploaded data is deleted after a certain number of days to protect privacy, leaving only meta data for usage analysis. +#: Should be an integer of the number of days. DELETE_FILES_AFTER_DAYS = env("DELETE_FILES_AFTER_DAYS") MEDIA_ROOT = os.path.join(BASE_DIR, "media") @@ -173,37 +177,42 @@ }, } -# Sometimes uploads happen with a generic content type. -# In this case, we can't rely on content type to detect type. -# But the type is still allowed, so it's added to -# ALLOWED_*_CONTENT_TYPES when they are defined. +#: Sometimes uploads happen with a generic content type. +#: In this case, we can't rely on content type to detect type. 
+#: But the type is still allowed. Define these generic content types in a handy variable
+#: so we can also add them to all the ALLOWED_*_CONTENT_TYPES settings.
 ALLOWED_UNKNOWN_CONTENT_TYPES = ["application/octet-stream"]
 
-# JSON details
+#: Allowed content types for JSON files
 ALLOWED_JSON_CONTENT_TYPES = ["application/json"] + ALLOWED_UNKNOWN_CONTENT_TYPES
+#: Allowed file extensions for JSON files
 ALLOWED_JSON_EXTENSIONS = [".json"]
 
-# Excel details
+#: Allowed content types for Excel files
 ALLOWED_SPREADSHEET_EXCEL_CONTENT_TYPES = [
     "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
 ] + ALLOWED_UNKNOWN_CONTENT_TYPES
+#: Allowed file extensions for Excel files
 ALLOWED_SPREADSHEET_EXCEL_EXTENSIONS = [".xlsx"]
 
-# Open Document details
+#: Allowed content types for OpenDocument files
 ALLOWED_SPREADSHEET_OPENDOCUMENT_CONTENT_TYPES = [
     "application/vnd.oasis.opendocument.spreadsheet",
 ] + ALLOWED_UNKNOWN_CONTENT_TYPES
+#: Allowed file extensions for OpenDocument files
 ALLOWED_SPREADSHEET_OPENDOCUMENT_EXTENSIONS = [".ods"]
 
-# Spreadsheet details (sum of details above)
+#: Allowed content types for spreadsheet files (sum of Excel and OpenDocument)
 ALLOWED_SPREADSHEET_CONTENT_TYPES = (
     ALLOWED_SPREADSHEET_EXCEL_CONTENT_TYPES
     + ALLOWED_SPREADSHEET_OPENDOCUMENT_CONTENT_TYPES
 )
+#: Allowed file extensions for spreadsheet files (sum of Excel and OpenDocument)
 ALLOWED_SPREADSHEET_EXTENSIONS = (
     ALLOWED_SPREADSHEET_EXCEL_EXTENSIONS + ALLOWED_SPREADSHEET_OPENDOCUMENT_EXTENSIONS
 )
 
-# CSV Details
+#: Allowed content types for CSV files
 ALLOWED_CSV_CONTENT_TYPES = ["text/csv"] + ALLOWED_UNKNOWN_CONTENT_TYPES
+#: Allowed file extensions for CSV files
 ALLOWED_CSV_EXTENSIONS = [".csv"]