CW-2249-new-schemes

epi2me-labs · Jul 31, 2023 · 7a7279d · 7a7279d
1 parent 98641c5
commit 7a7279d
Show file tree

Hide file tree

Showing 12 changed files with 1,302 additions and 76 deletions.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -6,60 +6,36 @@ variables:
     NF_WORKFLOW_OPTS: "--fastq test_data/fastq --samples test_data/sample_sheet.csv"
     NF_IGNORE_PROCESSES: "checkSampleSheet,combineGenotypeSummaries,genotypeSummary,report_no_data"
     CI_FLAVOUR: "new"
-    # IMAGE_TAG: "NO_UPDATE"
 
-# check-versions:
-#   stage: pre-custom-builds
-#   image: conda/miniconda3:latest
-#   needs: []
-#   script:
-#     - apt-get update && apt-get -y install gcc
-#     - pip install cachetools==4.1.0 docker==4.2.0 PyGithub==1.54 requests==2.20.0 semver==2.13.0 packaging
-#     - python3 bin/check_aux_versions.py --docker_registry 'ontresearch/pangolin' --github_repository 'cov-lineages/pangolin' --token ${LABS_BOT_GH_TOKEN} --tool pangolin -p v > ./pangolin.txt
-#     - python3 bin/check_aux_versions.py --docker_registry 'ontresearch/nextclade' --github_repository 'nextstrain/nextclade' --token ${LABS_BOT_GH_TOKEN} --tool nextclade > ./nextclade.txt
-#   artifacts:
-#     paths:
-#       - pangolin.txt
-#       - nextclade.txt
-#     expire_in: 1 day
-#   rules:
-#       - if: '$CI_PIPELINE_SOURCE == "schedule"'
-#
-# .release:
-#     stage: custom-builds
-#     needs: ['check-versions']
-#     before_script:
-#         - export IMAGE_TAG=`cat ${FILE_NAME} | grep ACTION | cut -f2 -d":"`
-#         - if [ "${IMAGE_TAG}" == "NO_UPDATE" ]; then echo "${IMAGE_NAME} already at latest version"; exit 0; fi;
-#         - IMAGE_DOCKERFILE="${IMAGE_DOCKERFILE:-Dockerfile}"
-#         - if [ -z "${IMAGE_NAME}" ]; then "IMAGE_NAME was not set"; exit 1; fi;
-#         - if [ -z "${IMAGE_TAG}" ]; then "IMAGE_TAG was not set"; exit 1; fi;
-#         - echo "Building auxiliary container with:"
-#         - echo " - Dockerfile:'${IMAGE_DOCKERFILE}'"
-#         - echo " - Image name:'${IMAGE_NAME}'"
-#     script:
-#         - !reference [.install, glibc-alpine]
-#         - !reference [.install, awscli]
-#         - echo ${DOCKERHUB_TOKEN} | docker login --username epi2melabs --password-stdin
-#         - LATEST="${DOCKERHUB_NAMESPACE}/${IMAGE_NAME}:${IMAGE_TAG}"
-#         - BUILD_CMD="docker build --no-cache -t ${LATEST} -f ${IMAGE_DOCKERFILE} . --build-arg IMAGE_TAG=${IMAGE_TAG}"
-#         - echo "${BUILD_CMD}"
-#         - ${BUILD_CMD}
-#         - docker push ${LATEST}
-#     rules:
-#         - if: '$CI_PIPELINE_SOURCE == "schedule"'
-#           when: on_success
-#
-# release-pangolin:
-#   extends: .release
-#   variables:
-#     IMAGE_NAME: "pangolin"
-#     FILE_NAME: "pangolin.txt"
-#     IMAGE_DOCKERFILE: "Dockerfile_pangolin"
-#
-# release-nextclade:
-#   extends: .release
-#   variables:
-#       IMAGE_NAME: "nextclade"
-#       FILE_NAME: "nextclade.txt"
-#       IMAGE_DOCKERFILE: "Dockerfile_nextclade"
+
+docker-run:  # expanded into one job per MATRIX_NAME value listed under parallel.matrix
+    artifacts:
+        when: always  # upload artifacts whatever the job outcome
+        exclude:  # leave the workspace directory out of the uploaded artifacts
+          - ${CI_PROJECT_NAME}/workspace/**/*
+    parallel:
+        matrix:  # each value is a primer scheme version, passed as --scheme_version by the rules below
+            - MATRIX_NAME: ["Midnight-ONT/V1", "Midnight-ONT/V2", "ARTIC/V4.1", "ARTIC/V5.3.2", "Midnight-IDT/V2", "NEB-VarSkip/v2"]
+    rules:  # evaluated in order; the first matching rule decides the job
+        - if: ($CI_COMMIT_BRANCH == null || $CI_COMMIT_BRANCH == "dev-template")
+          when: never  # no job when the branch is unset or is dev-template
+        - if: $MATRIX_NAME == "Midnight-ONT/V1"
+          variables:  # override NF_WORKFLOW_OPTS so the workflow runs with this scheme version
+              NF_WORKFLOW_OPTS: " --fastq test_data/fastq --samples test_data/sample_sheet.csv --scheme_name SARS-CoV-2 --scheme_version Midnight-ONT/V1"
+        - if: $MATRIX_NAME == "Midnight-ONT/V2"
+          variables:
+              NF_WORKFLOW_OPTS: " --fastq test_data/fastq --samples test_data/sample_sheet.csv --scheme_name SARS-CoV-2 --scheme_version Midnight-ONT/V2"
+        - if: $MATRIX_NAME == "ARTIC/V4.1"
+          variables:
+              NF_WORKFLOW_OPTS: " --fastq test_data/fastq --samples test_data/sample_sheet.csv --scheme_name SARS-CoV-2 --scheme_version ARTIC/V4.1"
+        - if: $MATRIX_NAME == "ARTIC/V5.3.2"
+          variables:
+              NF_WORKFLOW_OPTS: " --fastq test_data/fastq --samples test_data/sample_sheet.csv --scheme_name SARS-CoV-2 --scheme_version ARTIC/V5.3.2"
+        - if: $MATRIX_NAME == "Midnight-IDT/V2"
+          variables:
+              NF_WORKFLOW_OPTS: " --fastq test_data/fastq --samples test_data/sample_sheet.csv --scheme_name SARS-CoV-2 --scheme_version Midnight-IDT/V2"
+        - if: $MATRIX_NAME == "NEB-VarSkip/v2"
+          variables:
+              NF_WORKFLOW_OPTS: " --fastq test_data/fastq --samples test_data/sample_sheet.csv --scheme_name SARS-CoV-2 --scheme_version NEB-VarSkip/v2"
+
+
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,13 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [v0.3.30]
+### Added
+- New primer schemes: ARTIC V5.3.2 and Midnight IDT V2
+### Updated
+- Nextclade & pangolin
+- For alt primers we take the largest region (Updated ARTIC code merges a canonical primer site with an alt site, producing an interval that encompasses both)
+
 ## [v0.3.29]
 ### Changed
 - Example command to use demo data.

diff --git a/bin/workflow_glue/__init__.py b/bin/workflow_glue/__init__.py
@@ -13,13 +13,23 @@
 
 def get_components():
     """Find a list of workflow command scripts."""
+    logger = get_main_logger(_package_name)
     path = os.path.dirname(os.path.abspath(__file__))
     components = list()
     for fname in glob.glob(os.path.join(path, "*.py")):
         name = os.path.splitext(os.path.basename(fname))[0]
         if name in ("__init__", "util"):
             continue
-        mod = importlib.import_module(f"{_package_name}.{name}")
+
+        # leniently attempt to import module
+        try:
+            mod = importlib.import_module(f"{_package_name}.{name}")
+        except ModuleNotFoundError as e:
+            # if imports cannot be satisfied, refuse to add the component
+            # rather than exploding
+            logger.warn(f"Could not load {name} due to missing module {e.name}")
+            continue
+
         # if theres a main() and and argparser() thats good enough for us.
         try:
             req = "main", "argparser"

diff --git a/bin/workflow_glue/check_sample_sheet.py b/bin/workflow_glue/check_sample_sheet.py
@@ -1,10 +1,35 @@
 """Check if a sample sheet is valid."""
+import codecs
 import csv
+import os
 import sys
 
 from .util import get_named_logger, wf_parser  # noqa: ABS101
 
 
+# Some Excel users save their CSV as UTF-8 (and occasionally for a reason beyond my
+# comprehension, UTF-16); Excel then adds a byte order mark (unnecessarily for UTF-8
+# I should add). If we do not handle this with the correct encoding, the mark will
+# appear in the parsed data, causing the header to be malformed.
+# See CW-2310
+def determine_codec(f):
+    """Peek at a file and return an appropriate reading codec.
+
+    :param f: path of the file to inspect.
+    :returns: name of the encoding matching the byte order mark found at the
+        start of the file, or ``None`` if no recognised mark is present.
+    """
+    with open(f, 'rb') as f_bytes:
+        # Could use chardet here if we need to expand codec support
+        initial_bytes = f_bytes.read(8)
+
+    # The UTF-32 marks must be tested before the UTF-16 ones: BOM_UTF32_LE
+    # (FF FE 00 00) begins with BOM_UTF16_LE (FF FE), so testing UTF-16 first
+    # would report UTF-32-LE files as UTF-16. The only ambiguity left is a
+    # UTF-16-LE file whose first character is NUL, which a CSV will not have.
+    for bom, encoding_name in (
+        (codecs.BOM_UTF8, "utf-8-sig"),  # use the -sig codec to drop the mark
+        (codecs.BOM_UTF32_BE, "utf-32"),  # don't specify LE or BE to drop mark
+        (codecs.BOM_UTF32_LE, "utf-32"),
+        (codecs.BOM_UTF16_BE, "utf-16"),  # again skip LE or BE to drop mark
+        (codecs.BOM_UTF16_LE, "utf-16"),
+    ):
+        if initial_bytes.startswith(bom):
+            return encoding_name
+    return None  # will cause file to be opened with default encoding
+
+
 def main(args):
     """Run the entry point."""
     logger = get_named_logger("checkSheet")
@@ -14,10 +39,15 @@ def main(args):
     sample_types = []
     allowed_sample_types = [
         "test_sample", "positive_control", "negative_control", "no_template_control"
-        ]
+    ]
+
+    if not os.path.exists(args.sample_sheet) or not os.path.isfile(args.sample_sheet):
+        sys.stdout.write(f"Could not open sample sheet '{args.sample_sheet}'.")
+        sys.exit()
 
     try:
-        with open(args.sample_sheet, "r") as f:
+        encoding = determine_codec(args.sample_sheet)
+        with open(args.sample_sheet, "r", encoding=encoding) as f:
             csv_reader = csv.DictReader(f)
             n_row = 0
             for row in csv_reader:

diff --git a/data/primer_schemes/SARS-CoV-2/ARTIC/V2/SARS-CoV-2.scheme.bed b/data/primer_schemes/SARS-CoV-2/ARTIC/V2/SARS-CoV-2.scheme.bed
@@ -32,7 +32,7 @@ MN908947.3	4636	4658	nCoV-2019_16_LEFT	2	+
 MN908947.3	4995	5017	nCoV-2019_16_RIGHT	2	-
 MN908947.3	4939	4966	nCoV-2019_17_LEFT	1	+
 MN908947.3	5296	5321	nCoV-2019_17_RIGHT	1	-
-MN908947.3	5257	5287	nCoV-2019_18_LEFT_alt2	2	+
+MN908947.3	5257	5287	nCoV-2019_18_LEFT	2	+
 MN908947.3	5620	5644	nCoV-2019_18_RIGHT	2	-
 MN908947.3	5563	5586	nCoV-2019_19_LEFT	1	+
 MN908947.3	5932	5957	nCoV-2019_19_RIGHT	1	-