Merge pull request #150 from The-Strategy-Unit/1.0.1

1.0.1
The-Strategy-Unit · Feb 15, 2024 · fc6c41f · fc6c41f
2 parents 8c8f82f + 2417574
commit fc6c41f
Show file tree

Hide file tree

Showing 21 changed files with 2,179 additions and 1,940 deletions.
diff --git a/.github/workflows/test_package.yaml b/.github/workflows/test_package.yaml
@@ -15,7 +15,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.8", "3.9", "3.10"]
+        python-version: ["3.9", "3.10"]
 
     steps:
     - uses: actions/checkout@v3

diff --git a/Dockerfile b/Dockerfile
@@ -13,6 +13,6 @@ COPY current_best_model/final_svc/final_svc.sav /final_svc.sav
 COPY current_best_model/final_xgb/final_xgb.sav /final_xgb.sav
 COPY --chmod=755 docker_run.py docker_run.py
 
-LABEL org.opencontainers.image.source=https://github.com/cdu-data-science-team/pxtextmining
+LABEL org.opencontainers.image.source=https://github.com/the-strategy-unit/pxtextmining
 
 ENTRYPOINT ["python3", "docker_run.py"]
diff --git a/api/api.py b/api/api.py
@@ -3,9 +3,9 @@
 from typing import List
 
 import pandas as pd
+import schemas
 from fastapi import FastAPI
 
-from api import schemas
 from pxtextmining.factories.factory_predict_unlabelled_text import (
     predict_multilabel_sklearn,
 )
@@ -31,12 +31,12 @@
     version="1.0.0",
     contact={
         "name": "Patient Experience Qualitative Data Categorisation",
-        "url": "https://cdu-data-science-team.github.io/PatientExperience-QDC/",
+        "url": "https://the-strategy-unit.github.io/PatientExperience-QDC/",
         "email": "chris.beeley1@nhs.net",
     },
     license_info={
         "name": "MIT License",
-        "url": "https://github.com/CDU-data-science-team/pxtextmining/blob/main/LICENSE",
+        "url": "https://github.com/the-strategy-unit/pxtextmining/blob/main/LICENSE",
     },
     openapi_tags=tags_metadata,
 )

diff --git a/api/requirements.txt b/api/requirements.txt
@@ -1,100 +1,98 @@
-absl-py==2.0.0 ; python_version >= "3.8" and python_version < "3.11"
-anyio==4.0.0 ; python_version >= "3.8" and python_version < "3.11"
+absl-py==2.1.0 ; python_version >= "3.8" and python_version < "3.11"
+anyio==4.2.0 ; python_version >= "3.8" and python_version < "3.11"
 astunparse==1.6.3 ; python_version >= "3.8" and python_version < "3.11"
-cachetools==5.3.1 ; python_version >= "3.8" and python_version < "3.11"
-certifi==2023.7.22 ; python_version >= "3.8" and python_version < "3.11"
+cachetools==5.3.2 ; python_version >= "3.8" and python_version < "3.11"
+certifi==2024.2.2 ; python_version >= "3.8" and python_version < "3.11"
 cfgv==3.4.0 ; python_version >= "3.8" and python_version < "3.11"
-charset-normalizer==3.3.0 ; python_version >= "3.8" and python_version < "3.11"
+charset-normalizer==3.3.2 ; python_version >= "3.8" and python_version < "3.11"
 click==8.1.7 ; python_version >= "3.8" and python_version < "3.11"
-colorama==0.4.6 ; python_version >= "3.8" and python_version < "3.11" and sys_platform == "win32" or python_version >= "3.8" and python_version < "3.11" and platform_system == "Windows"
+colorama==0.4.6 ; python_version >= "3.8" and python_version < "3.11" and (sys_platform == "win32" or platform_system == "Windows")
 contourpy==1.1.1 ; python_version >= "3.8" and python_version < "3.11"
-coverage[toml]==7.3.2 ; python_version >= "3.8" and python_version < "3.11"
-cycler==0.12.0 ; python_version >= "3.8" and python_version < "3.11"
-distlib==0.3.7 ; python_version >= "3.8" and python_version < "3.11"
-exceptiongroup==1.1.3 ; python_version >= "3.8" and python_version < "3.11"
+coverage[toml]==7.4.1 ; python_version >= "3.8" and python_version < "3.11"
+cycler==0.12.1 ; python_version >= "3.8" and python_version < "3.11"
+distlib==0.3.8 ; python_version >= "3.8" and python_version < "3.11"
+exceptiongroup==1.2.0 ; python_version >= "3.8" and python_version < "3.11"
 fastapi==0.101.1 ; python_version >= "3.8" and python_version < "3.11"
-filelock==3.12.4 ; python_version >= "3.8" and python_version < "3.11"
+filelock==3.13.1 ; python_version >= "3.8" and python_version < "3.11"
 flatbuffers==23.5.26 ; python_version >= "3.8" and python_version < "3.11"
-fonttools==4.43.0 ; python_version >= "3.8" and python_version < "3.11"
-fsspec==2023.9.2 ; python_version >= "3.8" and python_version < "3.11"
+fonttools==4.48.1 ; python_version >= "3.8" and python_version < "3.11"
+fsspec==2024.2.0 ; python_version >= "3.8" and python_version < "3.11"
 gast==0.4.0 ; python_version >= "3.8" and python_version < "3.11"
 google-auth-oauthlib==1.0.0 ; python_version >= "3.8" and python_version < "3.11"
-google-auth==2.23.2 ; python_version >= "3.8" and python_version < "3.11"
+google-auth==2.27.0 ; python_version >= "3.8" and python_version < "3.11"
 google-pasta==0.2.0 ; python_version >= "3.8" and python_version < "3.11"
-grpcio==1.59.0 ; python_version >= "3.8" and python_version < "3.11"
+grpcio==1.60.1 ; python_version >= "3.8" and python_version < "3.11"
 h11==0.14.0 ; python_version >= "3.8" and python_version < "3.11"
-h5py==3.9.0 ; python_version >= "3.8" and python_version < "3.11"
+h5py==3.10.0 ; python_version >= "3.8" and python_version < "3.11"
 httpcore==0.16.3 ; python_version >= "3.8" and python_version < "3.11"
 httpx==0.23.3 ; python_version >= "3.8" and python_version < "3.11"
-huggingface-hub==0.16.4 ; python_version >= "3.8" and python_version < "3.11"
-identify==2.5.30 ; python_version >= "3.8" and python_version < "3.11"
-idna==3.4 ; python_version >= "3.8" and python_version < "3.11"
-importlib-metadata==6.8.0 ; python_version >= "3.8" and python_version < "3.10"
-importlib-resources==6.1.0 ; python_version >= "3.8" and python_version < "3.10"
+huggingface-hub==0.20.3 ; python_version >= "3.8" and python_version < "3.11"
+identify==2.5.34 ; python_version >= "3.8" and python_version < "3.11"
+idna==3.6 ; python_version >= "3.8" and python_version < "3.11"
+importlib-metadata==7.0.1 ; python_version >= "3.8" and python_version < "3.10"
+importlib-resources==6.1.1 ; python_version >= "3.8" and python_version < "3.10"
 iniconfig==2.0.0 ; python_version >= "3.8" and python_version < "3.11"
 jax==0.4.13 ; python_version >= "3.8" and python_version < "3.11"
 joblib==1.3.2 ; python_version >= "3.8" and python_version < "3.11"
 keras==2.12.0 ; python_version >= "3.8" and python_version < "3.11"
 kiwisolver==1.4.5 ; python_version >= "3.8" and python_version < "3.11"
 libclang==16.0.6 ; python_version >= "3.8" and python_version < "3.11"
-markdown==3.4.4 ; python_version >= "3.8" and python_version < "3.11"
-markupsafe==2.1.3 ; python_version >= "3.8" and python_version < "3.11"
-matplotlib==3.7.3 ; python_version >= "3.8" and python_version < "3.11"
+markdown==3.5.2 ; python_version >= "3.8" and python_version < "3.11"
+markupsafe==2.1.5 ; python_version >= "3.8" and python_version < "3.11"
+matplotlib==3.7.4 ; python_version >= "3.8" and python_version < "3.11"
 ml-dtypes==0.2.0 ; python_version >= "3.8" and python_version < "3.11"
 nodeenv==1.8.0 ; python_version >= "3.8" and python_version < "3.11"
-numpy==1.23.5 ; python_version < "3.11" and python_version >= "3.8"
+numpy==1.23.5 ; python_version >= "3.8" and python_version < "3.11"
 oauthlib==3.2.2 ; python_version >= "3.8" and python_version < "3.11"
 opt-einsum==3.3.0 ; python_version >= "3.8" and python_version < "3.11"
 packaging==23.2 ; python_version >= "3.8" and python_version < "3.11"
 pandas==1.5.3 ; python_version >= "3.8" and python_version < "3.11"
-pillow==10.0.1 ; python_version >= "3.8" and python_version < "3.11"
-platformdirs==3.11.0 ; python_version >= "3.8" and python_version < "3.11"
-pluggy==1.3.0 ; python_version >= "3.8" and python_version < "3.11"
-pre-commit==3.4.0 ; python_version >= "3.8" and python_version < "3.11"
-protobuf==4.24.4 ; python_version >= "3.8" and python_version < "3.11"
+pillow==10.2.0 ; python_version >= "3.8" and python_version < "3.11"
+platformdirs==4.2.0 ; python_version >= "3.8" and python_version < "3.11"
+pluggy==1.4.0 ; python_version >= "3.8" and python_version < "3.11"
+pre-commit==3.5.0 ; python_version >= "3.8" and python_version < "3.11"
+protobuf==4.25.2 ; python_version >= "3.8" and python_version < "3.11"
 pyasn1-modules==0.3.0 ; python_version >= "3.8" and python_version < "3.11"
-pyasn1==0.5.0 ; python_version >= "3.8" and python_version < "3.11"
-pydantic==1.10.13 ; python_version >= "3.8" and python_version < "3.11"
+pyasn1==0.5.1 ; python_version >= "3.8" and python_version < "3.11"
+pydantic==1.10.14 ; python_version >= "3.8" and python_version < "3.11"
 pyparsing==3.1.1 ; python_version >= "3.8" and python_version < "3.11"
 pytest-cov==4.1.0 ; python_version >= "3.8" and python_version < "3.11"
-pytest-mock==3.11.1 ; python_version >= "3.8" and python_version < "3.11"
-pytest==7.4.2 ; python_version >= "3.8" and python_version < "3.11"
+pytest-mock==3.12.0 ; python_version >= "3.8" and python_version < "3.11"
+pytest==7.4.4 ; python_version >= "3.8" and python_version < "3.11"
 python-dateutil==2.8.2 ; python_version >= "3.8" and python_version < "3.11"
-pytz==2023.3.post1 ; python_version >= "3.8" and python_version < "3.11"
+pytz==2024.1 ; python_version >= "3.8" and python_version < "3.11"
 pyyaml==6.0.1 ; python_version >= "3.8" and python_version < "3.11"
-regex==2023.10.3 ; python_version >= "3.8" and python_version < "3.11"
+regex==2023.12.25 ; python_version >= "3.8" and python_version < "3.11"
 requests-oauthlib==1.3.1 ; python_version >= "3.8" and python_version < "3.11"
 requests==2.31.0 ; python_version >= "3.8" and python_version < "3.11"
 rfc3986[idna2008]==1.5.0 ; python_version >= "3.8" and python_version < "3.11"
 rsa==4.9 ; python_version >= "3.8" and python_version < "3.11"
 ruff==0.0.272 ; python_version >= "3.8" and python_version < "3.11"
-safetensors==0.3.3 ; python_version >= "3.8" and python_version < "3.11"
+safetensors==0.4.2 ; python_version >= "3.8" and python_version < "3.11"
 scikit-learn==1.0.2 ; python_version >= "3.8" and python_version < "3.11"
 scipy==1.10.1 ; python_version >= "3.8" and python_version < "3.11"
-setuptools-scm==8.0.4 ; python_version >= "3.8" and python_version < "3.11"
-setuptools==68.2.2 ; python_version >= "3.8" and python_version < "3.11"
+setuptools==69.1.0 ; python_version >= "3.8" and python_version < "3.11"
 six==1.16.0 ; python_version >= "3.8" and python_version < "3.11"
 sniffio==1.3.0 ; python_version >= "3.8" and python_version < "3.11"
 starlette==0.27.0 ; python_version >= "3.8" and python_version < "3.11"
-tensorboard-data-server==0.7.1 ; python_version >= "3.8" and python_version < "3.11"
+tensorboard-data-server==0.7.2 ; python_version >= "3.8" and python_version < "3.11"
 tensorboard==2.12.3 ; python_version >= "3.8" and python_version < "3.11"
 tensorflow-estimator==2.12.0 ; python_version >= "3.8" and python_version < "3.11"
-tensorflow-io-gcs-filesystem==0.34.0 ; python_version >= "3.8" and python_version < "3.11" and platform_machine != "arm64" or python_version >= "3.8" and python_version < "3.11" and platform_system != "Darwin"
+tensorflow-io-gcs-filesystem==0.36.0 ; python_version >= "3.8" and python_version < "3.11" and platform_machine != "arm64" or python_version >= "3.8" and python_version < "3.11" and platform_system != "Darwin"
 tensorflow==2.12.0 ; python_version >= "3.8" and python_version < "3.11"
-termcolor==2.3.0 ; python_version >= "3.8" and python_version < "3.11"
-threadpoolctl==3.2.0 ; python_version >= "3.8" and python_version < "3.11"
-tokenizers==0.14.0 ; python_version >= "3.8" and python_version < "3.11"
+termcolor==2.4.0 ; python_version >= "3.8" and python_version < "3.11"
+threadpoolctl==3.3.0 ; python_version >= "3.8" and python_version < "3.11"
+tokenizers==0.15.2 ; python_version >= "3.8" and python_version < "3.11"
 tomli==2.0.1 ; python_version >= "3.8" and python_version < "3.11"
-tornado==6.3.3 ; python_version >= "3.8" and python_version < "3.11"
-tqdm==4.66.1 ; python_version >= "3.8" and python_version < "3.11"
-transformers==4.34.0 ; python_version >= "3.8" and python_version < "3.11"
-typing-extensions==4.8.0 ; python_version >= "3.8" and python_version < "3.11"
-urllib3==2.0.6 ; python_version >= "3.8" and python_version < "3.11"
+tornado==6.4 ; python_version >= "3.8" and python_version < "3.11"
+tqdm==4.66.2 ; python_version >= "3.8" and python_version < "3.11"
+transformers==4.37.2 ; python_version >= "3.8" and python_version < "3.11"
+typing-extensions==4.9.0 ; python_version >= "3.8" and python_version < "3.11"
+urllib3==2.2.0 ; python_version >= "3.8" and python_version < "3.11"
 uvicorn==0.20.0 ; python_version >= "3.8" and python_version < "3.11"
-virtualenv==20.24.5 ; python_version >= "3.8" and python_version < "3.11"
-werkzeug==3.0.0 ; python_version >= "3.8" and python_version < "3.11"
-wheel==0.41.2 ; python_version >= "3.8" and python_version < "3.11"
+virtualenv==20.25.0 ; python_version >= "3.8" and python_version < "3.11"
+werkzeug==3.0.1 ; python_version >= "3.8" and python_version < "3.11"
+wheel==0.42.0 ; python_version >= "3.8" and python_version < "3.11"
 wrapt==1.14.1 ; python_version >= "3.8" and python_version < "3.11"
 xgboost==1.7.6 ; python_version >= "3.8" and python_version < "3.11"
 zipp==3.17.0 ; python_version >= "3.8" and python_version < "3.10"
-pxtextmining==1.0.0
diff --git a/datasets/README.md b/datasets/README.md
@@ -29,4 +29,4 @@ Person identifiable info?: Whether or not the FFT answer contains any person ide
 
 Comment sentiment: The sentiment score applied to the FFT answer by the labeller. 1 is "very positive", 5 is "very negative". Mixed comments have been labelled as "3", neutral.
 
-All other columns are the qualitative framework labels, in one hot encoded format. The version of the framework being used is reflected in the filename. Full details of the framework are available on the [project documentation website](https://cdu-data-science-team.github.io/PatientExperience-QDC/framework/framework3.html).
+All other columns are the qualitative framework labels, in one hot encoded format. The version of the framework being used is reflected in the filename. Full details of the framework are available on the [project documentation website](https://the-strategy-unit.github.io/PatientExperience-QDC/framework/framework3.html).
diff --git a/docker_README.md b/docker_README.md
@@ -1,6 +1,6 @@
 # pxtextmining: Text Classification of Patient Experience feedback
 
-This Docker container contains the pxtextmining machine learning models trained as part of the [Patient Experience Qualitative Data Categorisation project](https://cdu-data-science-team.github.io/PatientExperience-QDC/).
+This Docker container contains the pxtextmining machine learning models trained as part of the [Patient Experience Qualitative Data Categorisation project](https://the-strategy-unit.github.io/PatientExperience-QDC/).
 
 To use this Docker container to predict your unlabelled text:
 
@@ -87,7 +87,7 @@ docker_data/
    - `--target` or `-t` to select the machine learning models used. Options are `m` for multilabel, `s` for `sentiment`, or `ms` for both. Defaults to `ms` if nothing is selected.
 
 A sample command would be:
-`docker run --rm -it -v /docker_data:/data ghcr.io/cdu-data-science-team/pxtextmining:latest file_01.json -l `
+`docker run --rm -it -v /docker_data:/data ghcr.io/the-strategy-unit/pxtextmining:latest file_01.json -l `
 
 6. The predictions will be outputted as a json file in the data_out folder, with the same filename. After running successfully, the final folder structure should be:
 

diff --git a/docs/about.md b/docs/about.md
@@ -1,7 +1,7 @@
 # Project background
 
-The `pxtextmining` package is part of [the Patient Experience Qualitative Data Categorisation project](https://cdu-data-science-team.github.io/PatientExperience-QDC/). This project is is hosted by Nottinghamshire Healthcare NHS Foundation Trust's Clinical Development Unit Data Science Team, and funded by NHS England's Insight and Feedback Team.
+The `pxtextmining` package is part of [the Patient Experience Qualitative Data Categorisation project](https://the-strategy-unit.github.io/PatientExperience-QDC/). This project is hosted by Nottinghamshire Healthcare NHS Foundation Trust's Clinical Development Unit Data Science Team, and funded by NHS England's Insight and Feedback Team.
 
 The primary objective of the `pxtextmining` element is to create a machine learning model capable of categorising the free text data obtained through the [NHS England Friends and Family Test](https://www.england.nhs.uk/fft/) (FFT). It is a multilabel classification problem, with one or more categories applied to each patient feedback comment. In this way, we hope to support better use of qualitative patient experience feedback by NHS provider organisations.
 
-This package works together with the [experiencesdashboard](https://github.com/CDU-data-science-team/experiencesdashboard), a frontend coded in R/Shiny.
+This package works together with the [experiencesdashboard](https://github.com/the-strategy-unit/experiencesdashboard), a frontend coded in R/Shiny.
diff --git a/docs/getting started/install.md b/docs/getting started/install.md
@@ -1,6 +1,6 @@
 ## Installation
 
-You can install `pxtextmining` from either [PyPI](https://pypi.org/project/pxtextmining/) or [GitHub](https://github.com/CDU-data-science-team/pxtextmining).
+You can install `pxtextmining` from either [PyPI](https://pypi.org/project/pxtextmining/) or [GitHub](https://github.com/the-strategy-unit/pxtextmining).
 
 The recommended method is to clone the repository from GitHub, as this will also include the models and datasets.
 

diff --git a/docs/getting started/package.md b/docs/getting started/package.md
@@ -11,7 +11,7 @@ This module contains vast majority of the code in the package. There are five di
 
       - `factory_pipeline`: Construction and training of different models/estimators/algorithms using the `sklearn`, `tensorflow.keras` and `transformers` libraries.
 
-      - `factory_model_performance`: Evaluation of a trained model, comparing predicted targets with real target values, to produce performance metrics. The decision-making process behind the peformance metrics chosen can be seen on the [project documentation website](https://cdu-data-science-team.github.io/PatientExperience-QDC/pxtextmining/performance_metrics.html). The performance metrics for the current best models utilised in the API can be found in the `current_best_multilabel` folder in the main repository.
+      - `factory_model_performance`: Evaluation of a trained model, comparing predicted targets with real target values, to produce performance metrics. The decision-making process behind the performance metrics chosen can be seen on the [project documentation website](https://the-strategy-unit.github.io/PatientExperience-QDC/pxtextmining/performance_metrics.html). The performance metrics for the current best models utilised in the API can be found in the `current_best_multilabel` folder in the main repository.
 
       - `factory_predict_unlabelled_text`: Prepares unlabelled text (with or without additional features such as question type) in a format suitable for each model type, and passes this through the selected models, to produce predicted labels.
 

diff --git a/docs/index.md b/docs/index.md
@@ -2,7 +2,7 @@
 
 This site contains the project documentation for the `pxtextmining` python package.
 This provides a technical overview of the package; for a non-technical overview and further information, visit the
-[Patient Experience Qualitative Data Categorisation website](https://cdu-data-science-team.github.io/PatientExperience-QDC/pxtextmining/).
+[Patient Experience Qualitative Data Categorisation website](https://the-strategy-unit.github.io/PatientExperience-QDC/pxtextmining/).
 
 ## Table Of Contents
 

diff --git a/docs/reference/API/API.md b/docs/reference/API/API.md
@@ -1,6 +1,6 @@
-# pxtextmining APIs
+# pxtextmining API overview
 
-We have created two different APIs for labelling patient experience feedback. Both APIs are free to use and completely open source. For help and support with using them, please contact (Chris Beeley)[chris.beeley1@nhs.net].
+We have created two different APIs for labelling patient experience feedback. Both APIs are free to use and completely open source. For help and support with using them, please contact [Chris Beeley](mailto:chris.beeley1@nhs.net).
 
 The "Quick API" is faster and simpler, as it uses an sklearn model which is quicker to make predictions. The performance of predictions from this API can be seen on our project documentation website. It is less accurate than the slow API. This API is a more 'traditional' style of API.