diff --git a/.github/workflows/conventional-prs.yaml b/.github/workflows/conventional-prs.yaml
new file mode 100644
index 00000000..210988fa
--- /dev/null
+++ b/.github/workflows/conventional-prs.yaml
@@ -0,0 +1,18 @@
+name: "Lint PR for conventional commits: https://www.conventionalcommits.org"
+
+on:
+ pull_request_target:
+ types:
+ - opened
+ - reopened
+ - edited
+ - synchronize
+
+jobs:
+ main:
+ name: Validate PR title
+ runs-on: ubuntu-latest
+ steps:
+ - uses: amannn/action-semantic-pull-request@v5
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
new file mode 100644
index 00000000..e4614386
--- /dev/null
+++ b/.github/workflows/main.yaml
@@ -0,0 +1,58 @@
+name: Tests
+
+on:
+ push:
+ branches: [ main ]
+ pull_request:
+ branches-ignore: []
+
+
+jobs:
+ Formatting:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v3
+ with:
+ # Full git history is needed to get a proper
+ # list of changed files within `super-linter`
+ fetch-depth: 0
+ - name: Formatting
+ uses: github/super-linter@v4
+ env:
+ VALIDATE_ALL_CODEBASE: false
+ DEFAULT_BRANCH: main
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ VALIDATE_SNAKEMAKE_SNAKEFMT: true
+
+ Linting:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v3
+ - name: Lint workflow
+ uses: snakemake/snakemake-github-action@v1
+ with:
+ directory: .
+ snakefile: workflow/Snakefile
+ stagein: "mamba install -y -n snakemake --channel conda-forge pyarrow=6.0"
+ args: "--lint"
+
+ Testing:
+ runs-on: ubuntu-latest
+ needs:
+ - Linting
+ - Formatting
+ steps:
+ - uses: actions/checkout@v3
+ - name: Test workflow
+ uses: snakemake/snakemake-github-action@v1
+ with:
+ directory: .test
+ snakefile: workflow/Snakefile
+ args: "--configfile .test/config/config.yaml --use-conda --show-failed-logs -j 10 --conda-cleanup-pkgs cache"
+
+ - name: Test report
+ uses: snakemake/snakemake-github-action@v1
+ with:
+ directory: .test
+ snakefile: workflow/Snakefile
+ args: "--configfile .test/config/config.yaml --report report.zip"
diff --git a/.github/workflows/release-please.yaml b/.github/workflows/release-please.yaml
new file mode 100644
index 00000000..3dd2575f
--- /dev/null
+++ b/.github/workflows/release-please.yaml
@@ -0,0 +1,15 @@
+name: "release-please, see: https://github.com/marketplace/actions/release-please-action"
+
+on:
+ push:
+ branches:
+ - main
+
+jobs:
+ release-please:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: google-github-actions/release-please-action@v3
+ with:
+ release-type: simple
+ token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..c123a441
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,14 @@
+.test/benchmarks/**
+.test/logs/**
+.test/resources/*.fai
+.test/resources/*.gz
+.test/resources/*.mmi
+.test/results/**
+.test/.snakemake/**
+benchmarks/**
+logs/**
+resources/**
+!resources/datavzrd/**
+!resources/summary_plot.json
+results/**
+.snakemake/**
\ No newline at end of file
diff --git a/.test/config/config.yaml b/.test/config/config.yaml
new file mode 100644
index 00000000..3e72af79
--- /dev/null
+++ b/.test/config/config.yaml
@@ -0,0 +1,40 @@
+samples: config/samples.tsv
+units: config/units.tsv
+
+reference:
+ # NOTE: ANY CHANGE OF SPECIES, RELEASE OR BUILD IN THIS TESTING
+ # CONFIG.YAML REQUIRES ADJUSTING THE FILENAME OF RESOURCE/*.FASTA
+ # Ensembl species name
+ species: homo_sapiens
+ # Genome build
+ build: GRCh38
+ # Ensembl release
+ release: 107
+ # for available downloads, please browse either of these views:
+ # * http://repeatmasker.org/genomicDatasets/RMGenomicDatasets.html
+ # * http://repeatmasker.org/genomicDatasets/RMGenomicDatasetsAlt.html
+ repeat_masker_download_link: "http://www.repeatmasker.org/genomes/hg38/RepeatMasker-rm406-dfam2.0/hg38.fa.out.gz"
+
+
+cyrcular:
+ # minimum read depth (used during covered segment identification)
+ min_read_depth: 2
+ # minimum number of split reads (edges with less than this number will be removed from the graph)
+ min_split_reads: 5
+ # maximum number of plausible circular paths generated for each strongly connected component of the graph
+ max_paths_per_component: 15
+ # maximum deletion length which is encoded as an edge / a breakend event. Can be useful for unmapped regions in the reference.
+ max_deletion_length: 10000
+
+filter:
+ # varlociraptor fdr control
+ fdr-control:
+ threshold: 0.1
+ mode: local-smart
+ events:
+ circular:
+ varlociraptor:
+ - present
+ circles:
+ min-length: 100
+ max-length: 50_000_000
diff --git a/.test/config/samples.tsv b/.test/config/samples.tsv
new file mode 100644
index 00000000..675ffa0c
--- /dev/null
+++ b/.test/config/samples.tsv
@@ -0,0 +1,2 @@
+group sample_name platform
+example example nanopore
diff --git a/.test/config/units.tsv b/.test/config/units.tsv
new file mode 100644
index 00000000..0d68373b
--- /dev/null
+++ b/.test/config/units.tsv
@@ -0,0 +1,2 @@
+sample_name unit_name fq1 fq2
+example 0 data/example_reads.fastq.gz
diff --git a/.test/data/example_reads.fastq.gz b/.test/data/example_reads.fastq.gz
new file mode 100644
index 00000000..cc1f575a
Binary files /dev/null and b/.test/data/example_reads.fastq.gz differ
diff --git a/resources/example_ref.fasta b/.test/resources/homo_sapiens.GRCh38.107.fasta
similarity index 71%
rename from resources/example_ref.fasta
rename to .test/resources/homo_sapiens.GRCh38.107.fasta
index 6476a91a..d42c3915 100644
--- a/resources/example_ref.fasta
+++ b/.test/resources/homo_sapiens.GRCh38.107.fasta
@@ -1,4 +1,113 @@
->M
+>2
+AGGCTGTGACAGTCATCTGTCTGGACGCGCTGGGTGGATGCGGGGGGCTCCTGGGAACTG
+TGTTGGAGCCGAGCAAGCGCTAGCCAGGCGCAAGCGCGCACAGACTGTAGCCATCCGAGG
+ACACCCCCGCCCCCCCGGCCCACCCGGAGACACCCGCGCAGAATCGCCTCCGGATCCCCT
+GCAGTCGGCGGGAGGTAAGGAGCAGGGCTTGCAAACCGCCCGGCGCCCAGGGAAGCGACG
+AGCGCCGGGGCAAGGCAAGCCCTGGACGGGATTGCGACGTGCGCACCGGGCGCCCTAATA
+TGCCCGGGGGACTGTTTCTGCTTCCGAAACAAAACCATCTCTGGGTTTTCCCAGAAAAGC
+CAGTTCCAGCCCCGAAGGCATCCTGGCTAGAGGAGACCCGCCCTAATCCTTTTGCAGCCC
+TTACCGGGGGGAGTAATGGCTTCTGCGAAAAGAAATTCCCTCGGCTCTAGAAGATCTGTC
+TGTGTTTGAGCTGTCGGAGAGCCGGTGCGTCCCCACCCCAGGCTGGGGTTCTTCTCCAAA
+GGGTGCCCCTGGAGGAAGAAGAGGGGGGGATTAGGCAGGGCGAGGCCGCCGCGGTCGCAA
+TCTGGGTCACGGCTGCTCCAGCTTGGAGGAGAGGCGGCTCTCCCGGCGACCCTCCTCGCG
+CGGGCGCCCCTGCCATTCCCGGGAACAGGGGCTCAGCCTCTCCCTCCCTGGAAGAGGACG
+TTGTCGTGGGTTTGGAAGAGCAGGGGTGGGCTTAGAGAGCTTCCAATTAAGCTATTGGCA
+GGAGTATCCCTGCAGCGGGTGAATGCCGAGGGGCGTTTGCTCAAATTTGGGGAGGGGAAG
+GATTTGTGGATATGGGTGTCTGTTGTTGGTCTCTGTCTAGAGAAAGGCTTTTTTTTATTT
+GCAAAGTTTTCTAAATCCCCTGCTATCATTTGCACTCCTGAGGTTGCATTTTTACAAAGG
+GGGTAGAAGGTACTCCAAATACCATTCCCGGTAGCTGGGTCGGAGAGCCTGGGGCTTCCC
+CTGAGCAGCCGGCCCCACACCGCTGCGAGTGCGGTTGTCTGCGTGCTCGTGAGAGCTAGA
+ATTCTGCAGCCAGGAACAGCCCCCTCCCCCAGGCAGTGCCTTGTGTGAATGAAATGGCAG
+TTTCCAAAGTTGCGGAGCCTCGCCACCACCCCCTGCATCTGCATGCCCCCTCCCACCCCC
+TGTCGTAGACAGCTTGTACACAAAAGGAGGGCGGGAGGGAGGGAGCGAGAGGCACAACTT
+CCTCCACCTTCGGGAGCAGTGGGCAGAGTGGGGGGCTTGGAGGGAAGATTGGGGAACCTG
+GTTAGAGGGGGCGCCCATTGCCTATCCCCTCGGTCTGCCCCGTTTGCCCACCCTCTCCGG
+TGTGTCTGTCGGTTGCAGTGTTGGAGGTCGGCGCCGGCCCCCGCCTTCCGCGCCCCCCAC
+GGGAAGGAAGCACCCCCGGTATTAAAACGAACGGGGCGGAAAGAAGCCCTCAGTCGCCGG
+CCGGGAGGCGAGCCGATGCCGAGCTGCTCCACGTCCACCATGCCGGGCATGATCTGCAAG
+AACCCAGACCTCGAGTTTGACTCGCTACAGCCCTGCTTCTACCCGGACGAAGATGACTTC
+TACTTCGGCGGCCCCGACTCGACCCCCCCGGGGGAGGACATCTGGAAGAAGTTTGAGCTG
+CTGCCCACGCCCCCGCTGTCGCCCAGCCGTGGCTTCGCGGAGCACAGCTCCGAGCCCCCG
+AGCTGGGTCACGGAGATGCTGCTTGAGAACGAGCTGTGGGGCAGCCCGGCCGAGGAGGAC
+GCGTTCGGCCTGGGGGGACTGGGTGGCCTCACCCCCAACCCGGTCATCCTCCAGGACTGC
+ATGTGGAGCGGCTTCTCCGCCCGCGAGAAGCTGGAGCGCGCCGTGAGCGAGAAGCTGCAG
+CACGGCCGCGGGCCGCCAACCGCCGGTTCCACCGCCCAGTCCCCGGGAGCCGGCGCCGCC
+AGCCCTGCGGGTCGCGGGCACGGCGGGGCTGCGGGAGCCGGCCGCGCCGGGGCCGCCCTG
+CCCGCCGAGCTCGCCCACCCGGCCGCCGAGTGCGTGGATCCCGCCGTGGTCTTCCCCTTT
+CCCGTGAACAAGCGCGAGCCAGCGCCCGTGCCCGCAGCCCCGGCCAGTGCCCCGGCGGCG
+GGCCCTGCGGTCGCCTCGGGGGCGGGTATTGCCGCCCCAGCCGGGGCCCCGGGGGTCGCC
+CCTCCGCGCCCAGGCGGCCGCCAGACCAGCGGCGGCGACCACAAGGCCCTCAGTACCTCC
+GGAGAGGACACCCTGAGCGATTCAGGTAAAGACCGAACTCGGGTCCGGCTGCCTCCCTGG
+GGCACTGGACCCCGGGTCGCGTCCCCTTTGTTAGTGCTCGTATGTCTTGGCCTGGGGAGC
+ATTTTGGAGGCAGTGCTAGGGGCAGAGAGGTCCTGTTTCCCCCAAGTCTCTCCTCGGGGT
+AAAGAGAAGGGGCTGAGAGAATGCCGTTGCAAAAGGGGTGCTCTCCAATTCTCGCCTTCA
+CTAAAGTTCCTTCCACCCTCTCCTGGGGAGCCCTCCTCTAGGCCATCACGGGCCCTCACC
+CGGTCCCCCACCTCTCTTTTGCAGCGCAGTCTGAGGAATAAAATTGGAGAAAGTTGGTGG
+CTAAACCGGGTGGGGGTTTAGGGGGTTGCTGGGTGCACTGCCTGGACAGAAACCTGTTAG
+CGCAGGGGTGAAAGGGACTCTCTGGCCCAGGTCAGGGGAGGGAAAGACATCCCGAGAAGA
+TTCAAGGGCTGTGCAAAGCCCTGTTTAAGGCGCAGGAACTTATAGGAGGGTTGCACAGAT
+GGCTAGAGCCGATTTTCTATTCTTTTTCTTTTTCTTTTTTTTTTTTTTTCAAATGTCGGT
+ACCTTTCCCTTCCCCCATCCTCGGTGGGTGGTGGGCTATTTGCTCCTGGTGCGTGGCCAG
+CAGGCGGCGATATGCGAGGCCAGCAGGCGGGCCCGGGATCTGAAAGGCTGGGGGTGGTGG
+GGGCACCCTCCCTCCCTCCATTCAGCAGCTGGCTGCAAGTGCAACAGCAGTTGTGTACAT
+TCTCAGGGGGCCTCCTCTTTCCAGTGTGCAGTGGAAACTGGCTGTAGTTTTGTCTTCCAG
+CCTGAATTCCAGGCCTAATTTGAGATGTGAGTTGTATCTGTAACCCAGTGCCCTTGAAGG
+TGAGGGCAGGCACTCAGCAGCCTCTCCAGGAAGGCTCACATCCTGGGAGGACTCACTGAT
+TAGTTCTATTGTGTTCATTTGTCTGTGTCTTAAGCTGAAGGGAAGAGTTAAAACCAAGCC
+TTTCCCTGGGGGTCTGGATGAACAGAACTCAACCCAAAGAGTGGCATTGCCTTGTCCTTG
+GAGCAGGGAGCTGGGACCCCCCTTGGACTTTGAAAACCAGTGTTTTCAGAATGCAGGTGG
+ATAACAAGCCTAAATTTACTTCTGGGCTGAGGAGAGATCTTTGAGGCTCCTGGAAGGAAA
+CTTGGTGATAAGCCTCCAGTTTGAAACGGCTCTGTCCCTTTAATGTCTGTGCCTTGACAG
+CTTTTGGTGAGGAAGCACTTCCTTCCAACAGCTGTCTTCTTGGCAGAAAACCAAAACATT
+GGCTTAAAGGGACCCACAGACTGGAACAGCCTCACATTTCGGCTTTAGAACAAATCCCAC
+AATTGTTCAGCTTTCCGGTCCCCTTCAGATCAAGCAGAAGATATGTTTTGATTTTCATGC
+TTGTATTTTAAACAATAATTTTCTACCCCAGCGTGGTAGTCAATGAGGAGAGAGGGGAAG
+AATGCGCACATGATGCTACACGTTTCTGTTGTTGCTGTTATTATTGGTGGCTTTGAGGAG
+AGCTGCTCCCATTTGGGGGTTTATACCAACTGTGGATTATGGCTTTGTCATTAAGATTTG
+ATCTTTGTTAAATGAAAAACTGTTTATTGTATAAAACTCAGGTTTGTGGACGAAAAGTTG
+TTTTTTTTCTTCAGTTAATTAAATTGTTCCTCAAGTTTGTTTAAGGACTTAAAATCAAAC
+ACAACCATGTGTAAACTGCTAAATGAGGCTCCTAAAATGAGAGGCCTCAACTCTTTAAGT
+GTGGAGCTAGAAATGTAAATAAGTCCACAGGGCAGACTGGTGATTATGATAAAAGCTACC
+ATTTACTGAGCATCTGTCTACTAGGCTCAGCTCTATGCTAAGTCTACATGTTATCTGTCA
+AAGTGGTATCATCCCCATTTAATAGCTGAGGAAACAGAGGCTTAGAAAGGCTGGGTAACT
+TGACCAGGGTCATGCAACTAGTCTGCGGTGGAGCCAGGATTCTGTCTGACCCTAAAGGCC
+AAGTTCTTTATATTTATTTCTACCACCTGCTAAAGTCTTGAATGGAGGCTGAAAGCACAG
+TTGGGGTATGGGGAAGAAAAATATATATACATACATATATGTATATGTATGTATGTATGT
+ATGGGGGGTTGTTTTGTTTTTGTTTTTGATAAGGAGTTTTGCTCTTGTTGCCCAGGCTGG
+AGTGCAGTGGTATGATCTGGGCTCACTGCAACCTCCGCCTCCCGGGTTCAAGTCATTCTC
+CTGCCTCAGCCTCCCGAGTAGCTGGGATTACCGGAGCATGCCACCACACCCAGCAAAGTT
+TTGTATTTTTAGTAGAGACAGGGTTTCACCATGTTGGCCAGGCTGATCTTGAACTCCTCA
+TCTCAGGTGATCTGCCCGCCTCCGCTTCCCAAAGTGCTGGGATTACAGGTGTGAGTCACC
+GCGTCCGGCCTACAGATATATTTAATTTAAAGAGATCTAAAACAAATACAAAACTGTCCA
+CATCTATGTTGATGGACCCATAAAAATAGCAGTCTGCCAGGGTCTGCCGGAAGAGACAGA
+TAAGCATACATATTAACATGGATATATATGTGAATTTCATTCAAATGGTTCTCACATGAG
+AGTAACTAGCATCTTTCTCTCAGATGATGAAGATGATGAAGAGGAAGATGAAGAGGAAGA
+AATCGACGTGGTCACTGTGGAGAAGCGGCGTTCCTCCTCCAACACCAAGGCTGTCACCAC
+ATTCACCATCACTGTGCGTCCCAAGAACGCAGCCCTGGGTCCCGGGAGGGCTCAGTCCAG
+CGAGCTGATCCTCAAACGATGCCTTCCCATCCACCAGCAGCACAACTATGCCGCCCCCTC
+TCCCTACGTGGAGAGTGAGGATGCACCCCCACAGAAGAAGATAAAGAGCGAGGCGTCCCC
+ACGTCCGCTCAAGAGTGTCATCCCCCCAAAGGCTAAGAGCTTGAGCCCCCGAAACTCTGA
+CTCGGAGGACAGTGAGCGTCGCAGAAACCACAACATCCTGGAGCGCCAGCGCCGCAACGA
+CCTTCGGTCCAGCTTTCTCACGCTCAGGGACCACGTGCCGGAGTTGGTAAAGAATGAGAA
+GGCCGCCAAGGTGGTCATTTTGAAAAAGGCCACTGAGTATGTCCACTCCCTCCAGGCCGA
+GGAGCACCAGCTTTTGCTGGAAAAGGAAAAATTGCAGGCAAGACAGCAGCAGTTGCTAAA
+GAAAATTGAACACGCTCGGACTTGCTAGACGCTTCTCAAAACTGGACAGTCACTGCCACT
+TTGCACATTTTGATTTTTTTTTTAAACAAACATTGTGTTGACATTAAGAATGTTGGTTTA
+CTTTCAAATCGGTCCCCTGTCGAGTTCGGCTCTGGGTGGGCAGTAGGACCACCAGTGTGG
+GGTTCTGCTGGGACCTTGGAGAGCCTGCATCCCAGGATGCTGGGTGGCCCTGCAGCCTCC
+TCCACCTCACCTCCATGACAGCGCTAAACGTTGGTGACGGTTGGGAGCCTCTGGGGCTGT
+TGAAGTCACCTTGTGTGTTCCAAGTTTCCAAACAACAGAAAGTCATTCCTTCTTTTTAAA
+ATGGTGCTTAAGTTCCAGCAGATGCCACATAAGGGGTTTGCCATTTGATACCCCTGGGGA
+ACATTTCTGTAAATACCATTGACACATCCGCCTTTTGTATACATCCTGGGTAATGAGAGG
+TGGCTTTTGCGGCCAGTATTAGACTGGAAGTTCATACCTAAGTACTGTAATAATACCTCA
+ATGTTTGAGGAGCATGTTTTGTATACAAATATATTGTTAATCTCTGTTATGTACTGTACT
+AATTCTTACACTGCCTGTATACTTTAGTATGACGCTGATACATAACTAAATTTGATACTT
+ATATTTTCGTATGAAAATGAGTTGTGAAAGTTTTGAGTAGATATTACTTTATCACTTTTT
+GAACTAAGAAACTTTTGTAAAGAAATTTACTATATATATATGCCTTTTTCCTAGCCTGTT
+TCTTCCTGTTAATGTATTTGTTCATGTTTGGTGCATAGAACTGGGTAAATGCAAAGTTCT
+GTGTTTAATTTCTTCAAAATGTATATATTTAGTGCTGCATCTTATAGCACTTTGAAATAC
+CTCATGTTTATGAAAATAAATAGCTTAAAATTAAATGA
+>MT
GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTT
CGTCTGGGGGGTATGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTC
GCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATT
diff --git a/.test/resources/scenarios/illumina_circle_scenario.yaml b/.test/resources/scenarios/illumina_circle_scenario.yaml
new file mode 120000
index 00000000..9e071a00
--- /dev/null
+++ b/.test/resources/scenarios/illumina_circle_scenario.yaml
@@ -0,0 +1 @@
+../../../resources/scenarios/illumina_circle_scenario.yaml
\ No newline at end of file
diff --git a/.test/resources/scenarios/nanopore_circle_scenario.yaml b/.test/resources/scenarios/nanopore_circle_scenario.yaml
new file mode 120000
index 00000000..7c897dfa
--- /dev/null
+++ b/.test/resources/scenarios/nanopore_circle_scenario.yaml
@@ -0,0 +1 @@
+../../../resources/scenarios/nanopore_circle_scenario.yaml
\ No newline at end of file
diff --git a/.test/resources/scenarios/nanopore_illumina_joint_circle_scenario.yaml b/.test/resources/scenarios/nanopore_illumina_joint_circle_scenario.yaml
new file mode 120000
index 00000000..35e2e1ae
--- /dev/null
+++ b/.test/resources/scenarios/nanopore_illumina_joint_circle_scenario.yaml
@@ -0,0 +1 @@
+../../../resources/scenarios/nanopore_illumina_joint_circle_scenario.yaml
\ No newline at end of file
diff --git a/CITATION.cff b/CITATION.cff
new file mode 100644
index 00000000..4a1bffa7
--- /dev/null
+++ b/CITATION.cff
@@ -0,0 +1,45 @@
+cff-version: 1.2.0
+message: "If you use this workflow, please cite the article from preferred-citation and the workflow itself."
+authors:
+- family-names: "Hartmann"
+ given-names: "Till"
+ orcid: "https://orcid.org/0000-0002-6993-347X"
+- family-names: "Lähnemann"
+ given-names: "David"
+ orcid: "https://orcid.org/0000-0002-9138-4112"
+title: "snakemake-workflows/cyrcular-calling"
+version: 1.2.0
+doi: Zenodo DOI TBD
+date-released: 2023-04-06
+url: "https://github.com/snakemake-workflows/cyrcular-calling"
+preferred-citation:
+ type: article
+ authors:
+ - family-names: "Tüns"
+ given-names: "Alicia Isabell"
+ orcid: "https://orcid.org/0000-0003-0025-1304"
+ - family-names: "Hartmann"
+ given-names: "Till"
+ orcid: "https://orcid.org/0000-0002-6993-347X"
+ - family-names: "Magin"
+ given-names: "Simon"
+ - family-names: "Chamorro González"
+ given-names: "Rocío"
+ orcid: "https://orcid.org/0000-0002-5653-7259"
+ - family-names: "Henssen"
+ given-names: "Anton George"
+ orcid: "https://orcid.org/0000-0003-1534-778X"
+ - family-names: "Rahmann"
+ given-names: "Sven"
+ orcid: "https://orcid.org/0000-0002-8536-6065"
+ - family-names: "Schramm"
+ given-names: "Alexander"
+ orcid: "https://orcid.org/0000-0001-7670-7529"
+ - family-names: "Köster"
+ given-names: "Johannes"
+ orcid: "https://orcid.org/0000-0001-9818-9320"
+ doi: "10.3389/fgene.2022.867018"
+ journal: "Frontiers in Genetics"
+ month: 05
+ year: 2022
+ title: "Detection and Validation of Circular DNA Fragments Using Nanopore Sequencing"
diff --git a/README.md b/README.md
index 5d7bcdad..ac81ccef 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,8 @@
# Snakemake workflow: cyrcular-calling
[![Snakemake](https://img.shields.io/badge/snakemake-≥6.15.0-brightgreen.svg)](https://snakemake.github.io)
-
+[![Tests](https://github.com/snakemake-workflows/cyrcular-calling/actions/workflows/main.yaml/badge.svg)](https://github.com/snakemake-workflows/cyrcular-calling/actions/workflows/main.yaml)
+[![Conventional Commits](https://img.shields.io/badge/Conventional%20Commits-1.0.0-%23FE5196?logo=conventionalcommits&logoColor=white)](https://conventionalcommits.org)
A Snakemake workflow for ecDNA detection in nanopore sequencing reads derived from DNA samples enriched for circular DNA.
diff --git a/config/config.yaml b/config/config.yaml
index eb59b0aa..0868ac4e 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -1,23 +1,20 @@
-calling:
- samples: config/samples.tsv
- units: config/units.tsv
+samples: config/samples.tsv
+units: config/units.tsv
- reference:
- # if `path` is not set, name should just be "genome" atm.
- name: "example_ref"
- # Custom path overrides everything below that, if file exists
- # (in that case, path must match "resources/" + name + ".fasta")
- path: "resources/example_ref.fasta"
- # The first n entries of the FASTA will be considered.
- n_chromosomes: 25
- # Ensembl species name
- species: homo_sapiens
- # Ensembl release
- release: 100
- # Genome build
- build: GRCh38
+reference:
+ # Ensembl species name
+ species: homo_sapiens
+ # Genome build
+ build: GRCh38
+ # Ensembl release
+ release: 107
+ # for available downloads, please browse either of these views:
+ # * http://repeatmasker.org/genomicDatasets/RMGenomicDatasets.html
+ # * http://repeatmasker.org/genomicDatasets/RMGenomicDatasetsAlt.html
+ repeat_masker_download_link: "http://www.repeatmasker.org/genomes/hg38/RepeatMasker-rm406-dfam2.0/hg38.fa.out.gz"
+cyrcular:
# minimum read depth (used during covered segment identification)
min_read_depth: 2
# minimum number of split reads (edges with less than this number will be removed from the graph)
@@ -27,12 +24,15 @@ calling:
# maximum deletion length which is encoded as an edge / a breakend event. Can be useful for unmapped regions in the reference.
max_deletion_length: 10000
- filter:
- # varlociraptor fdr control
- fdr-control:
- threshold: 0.1
- local: true
- events:
- circular:
- varlociraptor:
- - present
+filter:
+ # varlociraptor fdr control
+ fdr-control:
+ threshold: 0.1
+ mode: local-smart
+ events:
+ circular:
+ varlociraptor:
+ - present
+ circles:
+ min-length: 100
+ max-length: 50_000_000
diff --git a/config/samples.tsv b/config/samples.tsv
index 83b46ad0..b1c90139 100644
--- a/config/samples.tsv
+++ b/config/samples.tsv
@@ -1,3 +1 @@
-group sample platform
-Example Example nanopore
-#Example Example_Illumina illumina
+group sample_name platform
diff --git a/config/units.tsv b/config/units.tsv
index 9e486578..2cc2b5d4 100644
--- a/config/units.tsv
+++ b/config/units.tsv
@@ -1,3 +1 @@
-sample unit fq1 fq2
-Example 0 resources/examples/example_reads.fastq.gz
-#Example_illumina 0 /path/to/Example_illumina.1.fastq /path/to/Example_illumina.2.fastq
+sample_name unit_name fq1 fq2
diff --git a/resources/examples/example_reads.fastq.gz b/resources/examples/example_reads.fastq.gz
deleted file mode 100644
index 21534a9c..00000000
Binary files a/resources/examples/example_reads.fastq.gz and /dev/null differ
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 21dd27e2..9eb38fe6 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -12,12 +12,11 @@ scattergather:
include: "rules/common.smk"
-include: "rules/utils.smk"
+include: "rules/ref.smk"
include: "rules/map.smk"
include: "rules/call.smk"
include: "rules/annotate.smk"
include: "rules/filter.smk"
-include: "rules/table.smk"
include: "rules/plot.smk"
include: "rules/datavzrd.smk"
diff --git a/workflow/envs/bcftools.yaml b/workflow/envs/bcftools.yaml
index 655c1073..0d0e8da5 100644
--- a/workflow/envs/bcftools.yaml
+++ b/workflow/envs/bcftools.yaml
@@ -2,4 +2,4 @@ channels:
- conda-forge
- bioconda
dependencies:
- - bcftools =1.12
+ - bcftools =1.16
diff --git a/workflow/envs/blast.yaml b/workflow/envs/blast.yaml
deleted file mode 100644
index f6d24bac..00000000
--- a/workflow/envs/blast.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-channels:
- - bioconda
- - conda-forge
-dependencies:
- - blast =2.11
diff --git a/workflow/envs/cyrcular.post-deploy.sh b/workflow/envs/cyrcular.post-deploy.sh
index b082ae71..fe44fa11 100644
--- a/workflow/envs/cyrcular.post-deploy.sh
+++ b/workflow/envs/cyrcular.post-deploy.sh
@@ -1 +1,6 @@
-cargo install --git https://github.com/tedil/cyrcular.git --branch main --rev ec9fc44682134fad6cc3730eacfe3860cacf6dc3 --locked
+# this is needed to have the `ruget` binary installed by `plotly_kaleido`
+# available in the PATH, it is removed from PATH after installation of cyrcular
+PATH="$HOME/.cargo/bin:$PATH"
+# TODO: merge branch `annotate` in cyrcular and create bioconda recipe
+cargo install --git https://github.com/tedil/cyrcular.git --branch annotate --rev 44ba5274b9f2aee7d193b774343acaccf0edec53 --locked --root "${CONDA_PREFIX}"
+PATH=${PATH#[^:]*:}
diff --git a/workflow/envs/cyrcular.yaml b/workflow/envs/cyrcular.yaml
index da398401..90287a7f 100644
--- a/workflow/envs/cyrcular.yaml
+++ b/workflow/envs/cyrcular.yaml
@@ -2,7 +2,7 @@ channels:
- conda-forge
- bioconda
dependencies:
- - rust =1.58.1
+ - rust =1.67.1
- git =2.35.0
- compilers =1.3.0
- pkg-config =0.29.2
@@ -11,7 +11,7 @@ dependencies:
- gsl =2.7
- libcblas =3.9.0
- openssl =3.0.0
- - zlib =1.2.11
+ - zlib >=1.2.12,<1.3
- bzip2 =1.0.8
- xz =5.2.5
- clangdev =13.0.0
diff --git a/workflow/envs/datavzrd.yaml b/workflow/envs/datavzrd.yaml
index ae558e60..abcf0e78 100644
--- a/workflow/envs/datavzrd.yaml
+++ b/workflow/envs/datavzrd.yaml
@@ -1,4 +1,4 @@
channels:
- conda-forge
dependencies:
- - datavzrd =1.16
\ No newline at end of file
+ - datavzrd =2.18
diff --git a/workflow/envs/excel.yaml b/workflow/envs/excel.yaml
deleted file mode 100644
index 22cac73c..00000000
--- a/workflow/envs/excel.yaml
+++ /dev/null
@@ -1,6 +0,0 @@
-channels:
- - conda-forge
- - bioconda
-dependencies:
- - openpyxl =3.0.7
- - pandas =1.2.5
diff --git a/workflow/envs/gff.yaml b/workflow/envs/gff.yaml
deleted file mode 100644
index 1d6e7dba..00000000
--- a/workflow/envs/gff.yaml
+++ /dev/null
@@ -1,8 +0,0 @@
-channels:
- - conda-forge
- - bioconda
-dependencies:
- - bcftools =1.13
- - pysam
- - gff3sort
- - pigz =2.6
diff --git a/workflow/envs/graphviz.yaml b/workflow/envs/graphviz.yaml
index 288eafe8..7fee320c 100644
--- a/workflow/envs/graphviz.yaml
+++ b/workflow/envs/graphviz.yaml
@@ -1,6 +1,5 @@
channels:
- - bioconda
- conda-forge
dependencies:
- - graphviz =2.49.0
+ - graphviz =7.1
- imagemagick =7.1
diff --git a/workflow/envs/nanosim.yaml b/workflow/envs/nanosim.yaml
deleted file mode 100644
index 72c8b779..00000000
--- a/workflow/envs/nanosim.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-channels:
- - bioconda
- - conda-forge
-dependencies:
- - nanosim ==3.0
diff --git a/workflow/envs/canu.yaml b/workflow/envs/pandas.yaml
similarity index 57%
rename from workflow/envs/canu.yaml
rename to workflow/envs/pandas.yaml
index 75637a5b..3b4a8191 100644
--- a/workflow/envs/canu.yaml
+++ b/workflow/envs/pandas.yaml
@@ -1,5 +1,4 @@
channels:
- - bioconda
- conda-forge
dependencies:
- - canu =2.1.1
+ - pandas =2.0
diff --git a/workflow/envs/bash.yaml b/workflow/envs/pigz.yaml
similarity index 74%
rename from workflow/envs/bash.yaml
rename to workflow/envs/pigz.yaml
index e32d4a8b..35aa8a3c 100644
--- a/workflow/envs/bash.yaml
+++ b/workflow/envs/pigz.yaml
@@ -1,4 +1,4 @@
channels:
- conda-forge
dependencies:
- - bash =5.1
+ - pigz =2.6
diff --git a/workflow/envs/flye.yaml b/workflow/envs/python.yaml
similarity index 57%
rename from workflow/envs/flye.yaml
rename to workflow/envs/python.yaml
index f0560735..b4f05da5 100644
--- a/workflow/envs/flye.yaml
+++ b/workflow/envs/python.yaml
@@ -1,5 +1,4 @@
channels:
- - bioconda
- conda-forge
dependencies:
- - flye =2.8.3
+ - python =3.11
diff --git a/workflow/envs/rbt.yaml b/workflow/envs/rbt.yaml
index c1a22ee7..9041876b 100644
--- a/workflow/envs/rbt.yaml
+++ b/workflow/envs/rbt.yaml
@@ -1,6 +1,6 @@
channels:
- - bioconda
- conda-forge
+ - bioconda
dependencies:
- - rust-bio-tools =0.39.1
- - bcftools =1.14
+ - rust-bio-tools =0.42
+ - bcftools =1.16
diff --git a/workflow/envs/samtools.yaml b/workflow/envs/samtools.yaml
deleted file mode 100644
index 3a89d373..00000000
--- a/workflow/envs/samtools.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-channels:
- - bioconda
- - conda-forge
-dependencies:
- - samtools =1.12
diff --git a/workflow/envs/table.yaml b/workflow/envs/table.yaml
deleted file mode 100644
index 8d237599..00000000
--- a/workflow/envs/table.yaml
+++ /dev/null
@@ -1,7 +0,0 @@
-channels:
- - conda-forge
- - bioconda
-dependencies:
- - pandas >=1.3.4
- - pandarallel >=1.5.4
- - dinopy >=2.2.0
diff --git a/workflow/envs/varlociraptor.yaml b/workflow/envs/varlociraptor.yaml
index 3b89eff9..9455e39c 100644
--- a/workflow/envs/varlociraptor.yaml
+++ b/workflow/envs/varlociraptor.yaml
@@ -1,4 +1,6 @@
channels:
- conda-forge
+ - bioconda
dependencies:
- - varlociraptor =5.2.0
+ - varlociraptor =7.0
+ - bcftools =1.16
\ No newline at end of file
diff --git a/workflow/envs/vcf_annotate.yaml b/workflow/envs/vcf_annotate.yaml
index 29f9b102..4fc6aa00 100644
--- a/workflow/envs/vcf_annotate.yaml
+++ b/workflow/envs/vcf_annotate.yaml
@@ -2,7 +2,5 @@ channels:
- conda-forge
- bioconda
dependencies:
- - bcftools =1.15
- - htslib =1.15
- - vembrane =0.12
+ - bcftools =1.16
- ripgrep =13.0
diff --git a/workflow/envs/vembrane.yaml b/workflow/envs/vembrane.yaml
deleted file mode 100644
index 84517e33..00000000
--- a/workflow/envs/vembrane.yaml
+++ /dev/null
@@ -1,6 +0,0 @@
-channels:
- - conda-forge
- - bioconda
-dependencies:
- - vembrane =0.7
- - bcftools =1.12
diff --git a/workflow/envs/wget.yaml b/workflow/envs/wget.yaml
index 7a924a6b..7672f299 100644
--- a/workflow/envs/wget.yaml
+++ b/workflow/envs/wget.yaml
@@ -1,4 +1,4 @@
channels:
- conda-forge
dependencies:
- - wget =1.20.3
+ - wget =1.20
diff --git a/workflow/resources/datavzrd/circle-table-template.datavzrd.yaml b/workflow/resources/datavzrd/circle-table-template.datavzrd.yaml
index 55da5f25..277f1301 100644
--- a/workflow/resources/datavzrd/circle-table-template.datavzrd.yaml
+++ b/workflow/resources/datavzrd/circle-table-template.datavzrd.yaml
@@ -1,100 +1,173 @@
name: Circle calls
__definitions__:
- - |
- event_link = f"""
- function(value, row) {{
- var filename = value.replace("_circle_", "_");
- var link = `${{value}}`;
- return link;
- }}
- """
- - |
- graph_link = f"""
- function(value, row) {{
- var filename = value;
- var link = `${{value}}`;
- return link;
- }}
- """
+ # TODO: once yaml-specified parameters become available for custom-path
+ # functions in datavzrd, move breakpoint_link to separate:
+ # workflow/scripts/breakpoint_link_formatter.js
- |
- gene_link = f"""
+ breakpoint_link = f"""
function(value, row) {{
- $(function () {{
- $('[data-toggle="tooltip"]').tooltip()
- }});
- let genes = value.split(";");
- let rows = "";
- for (let g of genes) {{
- rows = `${{rows}}
${{g}} |
`;
- }};
- let table = ``;
- let button = ``;
- return button;
+ let primer_blast_url = `https://www.ncbi.nlm.nih.gov/tools/primer-blast/index.cgi?LINK_LOC=bookmark&INPUT_SEQUENCE=${{value}}&PRIMER5_END=900&PRIMER3_START=1100&PRIMER_PRODUCT_MIN=200&PRIMER_PRODUCT_MAX=1200&ORGANISM={config["reference"]["species"].replace("_", "%20")}&NEWWIN=on&NEWWIN=on&SHOW_SVIEWER=true`;
+ let seq = "…" + value.substring(value.length / 2 - 10, value.length / 2 + 10) + "…";
+ let link = `${{seq}}`;
+ return link;
}}
"""
+ # TODO: once yaml-specified parameters become available for custom-path
+ # functions in datavzrd, move regions_links to separate:
+ # workflow/scripts/regions_links_formatter.js
- |
- breakpoint_link = f"""
+ regions_links = f"""
function(value, row) {{
- var primer_blast_url = `https://www.ncbi.nlm.nih.gov/tools/primer-blast/index.cgi?LINK_LOC=bookmark&INPUT_SEQUENCE=${{value}}&PRIMER5_END=900&PRIMER3_START=1100&PRIMER_PRODUCT_MIN=200&PRIMER_PRODUCT_MAX=1200&ORGANISM=Homo%20sapiens&NEWWIN=on&NEWWIN=on&SHOW_SVIEWER=true`;
- var seq = "…" + value.substring(value.length / 2 - 10, value.length / 2 + 10) + "…";
- var link = `${{seq}}`;
- return link;
+ let regions = value.split(",");
+ let links = [];
+ for (let region of regions) {{
+ let ensembl_url = `https://www.ensembl.org/{config["reference"]["species"]}/Location/Overview?r=${{region}}`;
+ let link = `${{region}}`;
+ links.push(link);
+ }}
+ let result = links.join(", ");
+ return result;
}}
"""
+default-view: "summary-plot"
+
datasets:
- ?for group, path in zip(params.groups, params.calls):
- ?f"{group}-circles":
+ ?for category, path in params.overview_tables:
+ ?f"data-{category}-circles":
+ path: ?path
+ separator: "\t"
+ links:
+ circle-details:
+ column: event_id
+ view: ?f"detail-{wildcards.group}-{{value}}-segments"
+ ?for group, circle, path in params.detail_tables:
+ ?f"data-{group}-{circle}-segments":
path: ?path
- separator: ","
+ separator: "\t"
+ "overview-table":
+ path: ?input.categorized_overview_table
+ separator: "\t"
+ links:
+ link to category:
+ column: category
+ view: "circles-{value}"
+ # link to circle:
+ # column: event_id
+ # table-row: ?f"data-{CATEGORY}-circles/event_id"
views:
- ?for group in params.groups:
- ?f"circles-{group}":
- desc: |
- Overview table showing all discovered circles for a group.
- dataset: ?f"{group}-circles"
+ ?for category in params.categories:
+ ?f"circles-{category}":
+ desc: ?f"""Overview table showing all discovered circles for category `{category}`.\n
+
+ | Column | Description |\n
+ | --------------------- | ------------------------------------------------------------------------------- |\n
+ | `event_id` | Id of the event, format `{{graph_id}}-{{circle_id}}` |\n
+ | `graph_id` | Id of the graph the circle belongs to |\n
+ | `circle_id` | Id of the circle in its parent graph |\n
+ | `segment_count` | Number of segments with coverage |\n
+ | `regions` | Genomic regions of only those segments with coverage |\n
+ | `num_exons` | Number of exons in coverage segments |\n
+ | `gene_names` | Genes overlapping with coverage segments |\n
+ | `regulatory_features` | Regulatory features overlapping with coverage segments |\n
+ | `repeats` | Repeats known to repeatMasker overlapping with coverage segments |\n
+ | `num_split_reads` | Total number of split reads on all split edges of the circle |\n
+ | `prob_present` | Probability of a circle being present as calculated by varlociraptor |\n
+ | `prob_absent` | Probability of a circle being absent as calculated by varlociraptor |\n
+ | `prob_artifact` | Probability of a circle being deemed an artefact as calculated by varlociraptor |\n
+ | `af_*` | Allele frequency of the circle for the respective sample |\n
+ | `category` | The circle's category, either of `coding`, `regulatory` or `intronic` |\n
+ | last column | Link to a detailed view of all of the circle's segments |\n
+
+ """
+ dataset: ?f"data-{category}-circles"
+ render-table:
+ columns:
+ "graph_id":
+ custom-path: ?input.graph_link_formatter
+ "circle_id":
+ custom-path: ?input.circle_qc_plot_link_formatter
+ "regions":
+ custom: ?regions_links
+ "regex('prob_(.+)')":
+ plot:
+ heatmap:
+ scale: linear
+ domain: [0.0, 1.0]
+ range:
+ - white
+ - "#c1d7a8"
+ "gene_names":
+ custom-path: ?input.gene_card_link_formatter
+ "gene_ids":
+ display-mode: hidden
+ "regex('af_(.+)')":
+ plot:
+ ticks:
+ scale: "linear"
+ domain: [0.0, 1.0]
+ aux-domain-columns:
+ - "regex('af_(.+)')"
+
+ ?for group, circle, _ in params.detail_tables:
+ ?f"detail-{group}-{circle}-segments":
+ hidden: true
+ desc: ?f"""Detail table containing all covered segments and circle junctions for circle **{circle}** of group *{group}*.\n
+
+ | Column | Description |\n
+ | --------------------- | ------------------------------------------------------------------------------- |\n
+ | `graph_id` | Id of the graph the circle belongs to |\n
+ | `circle_id` | Id of the circle in its parent graph |\n
+ | `kind` | Whether the edge is a `coverage` or a `split` edge |\n
+ | `target_from` | The name of the sequence from which the edge starts |\n
+ | `from` | The genomic locus in `target_from` from which the edge starts |\n
+ | `target_to` | The name of the sequence at which the edge ends |\n
+ | `to` | The genomic locus in `target_to` at which the edge ends |\n
+ | `length` | Length of the edge in bp, if applicable |\n
+ | `num_exons` | Number of exons in a coverage segment |\n
+ | `gene_names` | Genes overlapping with a coverage segment |\n
+ | `regulatory_features` | Regulatory features overlapping with a coverage segment |\n
+ | `repeats` | Repeats known to repeatMasker overlapping with a coverage segment |\n
+ | `coverage` | Mean read depth for the edge in the circle |\n
+ | `num_split_reads` | Number of split reads for the edge in the circle |\n
+ | `breakpoint_sequence` | Link to a NCBI primer-blast with form's fields pre-filled |\n
+
+ """
+ dataset: ?f"data-{group}-{circle}-segments"
render-table:
- direction:
- display-mode: hidden
- "regex('AF_(.+)')":
- plot:
- ticks:
- scale: "linear"
- domain: [0.0, 1.0]
- aux-domain-columns:
- - "regex('AF_(.+)')"
- "circle_length":
- plot:
- ticks:
- scale: "linear"
- aux-domain-columns:
- - "circle_length"
- "length":
- plot:
- ticks:
- scale: "linear"
- aux-domain-columns:
- - "length"
- "regex('PROB_(.+)')":
- plot:
- heatmap:
- scale: linear
- domain: [0.0, 1.0]
- range:
- - white
- - "#c1d7a8"
- "GENES":
- custom: ?gene_link
- "breakpoint_seq":
- custom: ?breakpoint_link
- "EVENT":
- custom: ?event_link
- "graph":
- custom: ?graph_link
- "NUM_EXONS":
- display-mode: hidden
+ columns:
+ "graph_id":
+ custom-path: ?input.graph_link_formatter
+ "circle_id":
+ custom-path: ?input.circle_qc_plot_link_formatter
+ "kind":
+ plot:
+ heatmap:
+ scale: ordinal
+ color-scheme: accent
+ "breakpoint_sequence":
+ custom: ?breakpoint_link
+ "gene_names":
+ custom-path: ?input.gene_card_link_formatter
+ "gene_ids":
+ display-mode: hidden
+
+ "summary-plot":
+ desc: |
+ Circle length distributions by category.
+ Click on any of the bars to get to the respective category's overview table (or choose directly from the "views" dropdown menu).
+
+ Categories are defined as follows:
+ | Category | Condition |
+ | ------------ | --------------------------------------------------------------- |
+ | `coding` | at least 1 exon |
+ | `regulatory` | not `coding` and at least 1 regulatory feature annotated |
+ | `intronic` | neither `coding` nor `regulatory` and at least 1 gene annotated |
+ | `other` | none of the above |
+
+ dataset: ?f"overview-table"
+ render-plot:
+ spec-path: ?input.summary_spec
\ No newline at end of file
diff --git a/workflow/resources/datavzrd/summary_plot.json b/workflow/resources/datavzrd/summary_plot.json
new file mode 100644
index 00000000..fd020c6e
--- /dev/null
+++ b/workflow/resources/datavzrd/summary_plot.json
@@ -0,0 +1,130 @@
+{
+ "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
+ "autosize": {
+ "type": "fit",
+ "contains": "padding"
+ },
+ "title": {
+ "text": "Number of circles and circle lengths by category"
+ },
+ "transform": [
+ {
+ "calculate": "indexof(['coding', 'regulatory', 'intronic', 'other'], datum.category)",
+ "as": "order"
+ }
+ ],
+ "vconcat": [
+ {
+ "width": 800,
+ "resolve": {
+ "scale": {
+ "x": "independent",
+ "y": "independent"
+ }
+ },
+ "mark": {
+ "type": "bar",
+ "tooltip": true
+ },
+ "encoding": {
+ "x": {
+ "field": "category",
+ "aggregate": "count",
+ "stack": true,
+ "title": "number of circles"
+ },
+ "color": {
+ "field": "category",
+ "sort": [
+ "coding",
+ "regulatory",
+ "intronic",
+ "other"
+ ],
+ "scale": {
+ "type": "ordinal",
+ "scheme": "set1",
+ "domain": [
+ "coding",
+ "regulatory",
+ "intronic",
+ "other"
+ ]
+ }
+ },
+ "order": {
+ "field": "order",
+ "type": "nominal"
+ },
+ "href": {
+ "field": "link to category"
+ }
+ }
+ },
+ {
+ "mark": {
+ "type": "bar"
+ },
+ "height": 150,
+ "width": 200,
+ "resolve": {
+ "scale": {
+ "x": "shared",
+ "y": "shared"
+ }
+ },
+ "encoding": {
+ "x": {
+ "bin": {
+ "minstep": 500,
+ "maxbins": 100
+ },
+ "field": "circle_length",
+ "type": "quantitative"
+ },
+ "y": {
+ "title": "circle count",
+ "aggregate": "count",
+ "type": "quantitative"
+ },
+ "color": {
+ "field": "category",
+ "type": "nominal",
+ "sort": [
+ "coding",
+ "regulatory",
+ "intronic",
+ "other"
+ ],
+ "scale": {
+ "type": "ordinal",
+ "scheme": "set1",
+ "domain": [
+ "coding",
+ "regulatory",
+ "intronic",
+ "other"
+ ]
+ }
+ },
+ "column": {
+ "field": "category",
+ "type": "nominal",
+ "sort": [
+ "coding",
+ "regulatory",
+ "intronic",
+ "other"
+ ]
+ },
+ "order": {
+ "field": "order",
+ "type": "nominal"
+ },
+ "href": {
+ "field": "link to category"
+ }
+ }
+ }
+ ]
+}
\ No newline at end of file
diff --git a/workflow/rules/annotate.smk b/workflow/rules/annotate.smk
index c2c461be..65b56a54 100644
--- a/workflow/rules/annotate.smk
+++ b/workflow/rules/annotate.smk
@@ -1,101 +1,118 @@
-rule extract_vcf_header_lines_for_bcftools_annotate:
+## annotate genes, exons and breakpoint sequences; produce one overview table containing circles, and one table with details of each segment for each circle
+
+
+rule cyrcular_generate_tables:
input:
- vcf="results/calling/candidates/{sample}.sorted.bcf",
+ reference=rules.get_genome.output.genome,
+ graph="results/calling/graphs/{group}.annotated.graph",
+ bcf="results/calling/calls/filtered_fdr/reheader/{group}.bcf",
output:
- header=temp("results/calling/annotation/{sample}.header_lines.txt"),
- params:
- fields="|".join(CYRCULAR_INFO_FIELDS),
- conda:
- "../envs/vcf_annotate.yaml"
+ overview="results/calling/tables/{group}/{group}_overview.tsv",
+ details=directory("results/calling/tables/{group}/{group}_details/"),
+ threads: 1
log:
- "logs/re-annotate/header_{sample}.log",
+ "logs/cyrcular_generate_tables/{group}.log",
+ benchmark:
+ "benchmarks/cyrcular_generate_tables/{group}.txt"
+ conda:
+ "../envs/cyrcular.yaml"
shell:
- """
- bcftools view -h {input.vcf} | rg {params.fields:q} > {output.header} 2> {log}
- """
+ """cyrcular graph table {input.graph} {input.bcf} --reference {input.reference} --circle-table {output.overview} --segment-tables {output.details} 2> {log}"""
-rule copy_annotation_from_cyrcular:
+rule cyrcular_annotate_graph:
input:
- variants="results/calling/calls/pre_annotated/{sample}.bcf",
- variants_index="results/calling/calls/pre_annotated/{sample}.bcf.csi",
- candidates_with_annotation="results/calling/candidates/{sample}.sorted.bcf",
- candidates_with_annotation_index="results/calling/candidates/{sample}.sorted.bcf.csi",
- header_lines="results/calling/annotation/{sample}.header_lines.txt",
+ reference=rules.get_genome.output.genome,
+ graph="results/calling/graphs/{group}.graph",
+ gene_annotation="resources/gene_annotation.gff3.gz",
+ regulatory_annotation="resources/regulatory_annotation.gff3.gz",
+ repeat_annotation=lambda wc: "resources/repeat_masker.fa.out.gz"
+ if config["reference"].get("repeat_masker_download_link", "")
+ else "",
output:
- variants="results/calling/calls/annotated/{sample}.bcf",
- annotation=temp("results/calling/candidates/{sample}.sorted.bcf.tab"),
- annotation_bgzip=temp("results/calling/candidates/{sample}.sorted.bcf.tab.bgz"),
- annotation_bgzip_tabix=temp(
- "results/calling/candidates/{sample}.sorted.bcf.tab.bgz.tbi"
- ),
+ annotated="results/calling/graphs/{group}.annotated.graph",
+ threads: 1
log:
- "logs/re-annotate/{sample}.log",
+ "logs/cyrcular_annotate_graph/{group}.log",
benchmark:
- "benchmarks/re-annotate/{sample}.txt"
+ "benchmarks/cyrcular_annotate_graph/{group}.txt"
conda:
- "../envs/vcf_annotate.yaml"
+ "../envs/cyrcular.yaml"
params:
- header=copy_annotation_vembrane_header_expr(),
- table_expr=copy_annotation_table_expr(),
- columns=copy_annotation_bcftools_annotate_columns(),
+ repeat_annotation=lambda wc, input: f" --repeat-annotation {input.repeat_annotation} "
+ if config["reference"].get("repeat_masker_download_link", "")
+ else "",
+ shell:
+ "cyrcular graph annotate "
+ " --reference {input.reference} "
+ " --gene-annotation {input.gene_annotation} "
+ " --regulatory-annotation {input.regulatory_annotation} "
+ " --output {output.annotated} "
+ " {input.graph} "
+ "2> {log} "
+
+
+rule reheader_filtered_bcf:
+ input:
+ bcf="results/calling/calls/filtered_fdr/{group}.bcf",
+ sorted_header="results/calling/calls/filtered_fdr/reheader/{group}.header.sorted.txt",
+ output:
+ bcf="results/calling/calls/filtered_fdr/reheader/{group}.bcf",
+ log:
+ "logs/reheader_filtered_bcf/{group}.log",
+ conda:
+ "../envs/bcftools.yaml"
shell:
+ ## bcftools re-header seems to re-order entries
+ # bcftools reheader --header {input.sorted_header} --output {output.bcf} {input.bcf}
+ ## so we have to re-header ourselves
+ ## TODO: reinvestigate to find a cleaner solution
"""
- vembrane table --header {params.header:q} {params.table_expr:q} {input.candidates_with_annotation} > {output.annotation} 2> {log}
- bgzip -c {output.annotation} > {output.annotation_bgzip} 2>> {log}
- tabix -p vcf --zero-based -S 1 -f {output.annotation_bgzip} 2>> {log}
- bcftools annotate --header-lines {input.header_lines} --annotations {output.annotation_bgzip} --columns {params.columns} --output-type b --output {output.variants} {input.variants} 2>> {log}
+ cat {input.sorted_header} <(bcftools view -H {input.bcf}) | bcftools view -Ob > {output.bcf} 2> {log}
"""
-rule annotate_genes:
+rule sort_bcf_header:
input:
- annotation="resources/gencode.v38.annotation.sorted.gff3.gz",
- variants="results/calling/calls/merged/{sample}.bcf",
+ bcf="results/calling/calls/filtered_fdr/{group}.bcf",
+ header="results/calling/calls/filtered_fdr/{group}.header.txt",
output:
- variants="results/calling/calls/pre_annotated/{sample}.bcf",
- threads: 1
+ sorted_header="results/calling/calls/filtered_fdr/reheader/{group}.header.sorted.txt",
log:
- "logs/annotate_genes/{sample}.log",
- benchmark:
- "benchmarks/annotate_genes/{sample}.txt"
+ "logs/sort_bcf_header/{group}.log",
conda:
- "../envs/gff.yaml"
+ "../envs/python.yaml"
script:
- "../scripts/gff_annotate.py"
+ "../scripts/sort_bcf_header.py"
-rule sort_annotation:
+rule get_bcf_header:
input:
- "resources/gencode.v38.annotation.gff3.gz",
+ bcf="{file}.bcf",
output:
- gff="resources/gencode.v38.annotation.sorted.gff3.gz",
- tbi="resources/gencode.v38.annotation.sorted.gff3.gz.tbi",
- csi="resources/gencode.v38.annotation.sorted.gff3.gz.csi",
- conda:
- "../envs/gff.yaml"
+ header="{file}.header.txt",
log:
- "logs/sort_annotation/gencode.v38.annotation.gff3.log",
- benchmark:
- "benchmarks/sort_annotation/gencode.v38.annotation.gff3.txt"
- threads: 48
+ "logs/get_bcf_header/{file}.log",
+ conda:
+ "../envs/bcftools.yaml"
shell:
"""
- gff3sort.pl <(pigz -dc {input}) | bgzip -@ {threads} -c > {output.gff} 2> {log}
- tabix {output.gff} 2>> {log}
- tabix --csi {output.gff} 2>> {log}
+ bcftools view -h {input.bcf} > {output.header} 2> {log}
"""
-rule download_annotation:
+rule extract_vcf_header_lines_for_bcftools_annotate:
+ input:
+ vcf="results/calling/candidates/{sample}.sorted.bcf",
output:
- "resources/gencode.v38.annotation.gff3.gz",
- log:
- "logs/download_annotation/gencode.v38.annotation.gff3.log",
- benchmark:
- "benchmarks/download_annotation/gencode.v38.annotation.gff3.txt"
- cache: True
+ header=temp("results/calling/annotation/{sample}.header_lines.txt"),
+ params:
+ fields="|".join(CYRCULAR_INFO_FIELDS),
conda:
- "../envs/wget.yaml"
+ "../envs/vcf_annotate.yaml"
+ log:
+ "logs/re-annotate/header_{sample}.log",
shell:
- """wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/gencode.v38.annotation.gff3.gz --no-check-certificate -O {output} 2> {log}"""
+ """
+ bcftools view -h {input.vcf} | rg {params.fields:q} > {output.header} 2> {log}
+ """
diff --git a/workflow/rules/call.smk b/workflow/rules/call.smk
index 532b5a9b..92cb9b58 100644
--- a/workflow/rules/call.smk
+++ b/workflow/rules/call.smk
@@ -10,7 +10,7 @@ rule bcf_index:
benchmark:
"benchmarks/bcftools/index/{prefix}.txt"
wrapper:
- "v1.0.0/bio/bcftools/index"
+ "v1.25.0/bio/bcftools/index"
rule bcftools_concat:
@@ -32,7 +32,7 @@ rule bcftools_concat:
uncompressed_bcf=False,
extra="-a", # optional parameters for bcftools concat (except -o)
wrapper:
- "v1.0.0/bio/bcftools/concat"
+ "v1.25.0/bio/bcftools/concat"
rule bcftools_sort:
@@ -40,13 +40,12 @@ rule bcftools_sort:
"results/calling/calls/initial/{group}.{scatteritem}.bcf",
output:
"results/calling/calls/initial_sorted/{group}.{scatteritem}.bcf",
- threads: 2
log:
"logs/sort_calls/{group}.{scatteritem}.log",
benchmark:
"benchmarks/sort_calls/{group}.{scatteritem}.txt"
wrapper:
- "v1.0.0/bio/bcftools/sort"
+ "v1.25.0/bio/bcftools/sort"
rule varlociraptor_call:
@@ -72,8 +71,8 @@ rule varlociraptor_call:
rule varlociraptor_alignment_properties:
input:
- ref=config["calling"]["reference"]["path"],
- ref_idx=config["calling"]["reference"]["path"] + ".fai",
+ ref=rules.get_genome.output.genome,
+ ref_idx=rules.genome_faidx.output.index,
bam="results/calling/mapping/{sample}.bam",
output:
"results/calling/alignment-properties/{sample}.json",
@@ -87,8 +86,8 @@ rule varlociraptor_alignment_properties:
rule varlociraptor_preprocess:
input:
- ref=config["calling"]["reference"]["path"],
- ref_idx=config["calling"]["reference"]["path"] + ".fai",
+ ref=rules.get_genome.output.genome,
+ ref_idx=rules.genome_faidx.output.index,
candidates=get_group_candidates,
bam="results/calling/mapping/{sample}.bam",
bai="results/calling/mapping/{sample}.bam.bai",
@@ -135,20 +134,20 @@ rule sort_bnd_bcfs:
"results/calling/candidates/{sample}.bcf",
output:
"results/calling/candidates/{sample}.sorted.bcf",
- threads: 2
log:
"logs/sort_bnds/{sample}.log",
benchmark:
"benchmarks/sort_bnds/{sample}.txt"
wrapper:
- "v1.0.0/bio/bcftools/sort"
+ "v1.25.0/bio/bcftools/sort"
rule circle_bnds:
input:
bam="results/calling/mapping/{sample}.bam",
bai="results/calling/mapping/{sample}.bam.bai",
- ref=config["calling"]["reference"]["path"],
+ ref=rules.get_genome.output.genome,
+ ref_index=rules.genome_faidx.output.index,
output:
bnds="results/calling/candidates/{sample}.bcf",
graph="results/calling/graphs/{sample}.graph",
@@ -159,10 +158,10 @@ rule circle_bnds:
"benchmarks/cyrcular/{sample}.txt"
threads: 4
params:
- min_read_depth=config["calling"]["min_read_depth"], # 2
- min_split_reads=config["calling"]["min_split_reads"], # 5
- max_paths_per_component=config["calling"]["max_paths_per_component"], # 15
- max_deletion_length=config["calling"]["max_deletion_length"], # 10000,
+ min_read_depth=config["cyrcular"]["min_read_depth"], # 2
+ min_split_reads=config["cyrcular"]["min_split_reads"], # 5
+ max_paths_per_component=config["cyrcular"]["max_paths_per_component"], # 15
+ max_deletion_length=config["cyrcular"]["max_deletion_length"], # 10000,
conda:
"../envs/cyrcular.yaml"
shell:
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index de0d6f17..2c32dcde 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -2,31 +2,25 @@ import pandas as pd
import re
from snakemake.utils import validate
-REFERENCE = config["calling"]["reference"]["name"]
-
validate(config, schema="../schemas/config.schema.yaml")
-wildcard_constraints:
- sample="[a-zA-Z_0-9-]+",
-
-
def read_samples():
samples = (
pd.read_csv(
- config["calling"]["samples"],
+ config["samples"],
sep="\t",
- dtype={"sample": str, "group": str},
+ dtype={"sample_name": str, "group": str},
comment="#",
)
- .set_index("sample", drop=False)
+ .set_index("sample_name", drop=False)
.sort_index()
)
def _group_or_sample(row):
group = row.get("group", None)
if pd.isnull(group):
- return row["sample"]
+ return row["sample_name"]
return group
samples["group"] = [_group_or_sample(row) for _, row in samples.iterrows()]
@@ -35,18 +29,25 @@ def read_samples():
samples = read_samples()
+SAMPLES = list(sorted(set(samples["sample_name"])))
GROUPS = list(sorted(set(samples["group"])))
+CATEGORIES = ["coding", "regulatory", "intronic", "other"]
+
+
+wildcard_constraints:
+ sample="|".join(SAMPLES),
+ group="|".join(GROUPS),
def read_units():
units = (
pd.read_csv(
- config["calling"]["units"],
+ config["units"],
sep="\t",
- dtype={"sample": str, "unit": str},
+ dtype={"sample_name": str, "unit_name": str},
comment="#",
)
- .set_index(["sample", "unit"], drop=False)
+ .set_index(["sample_name", "unit_name"], drop=False)
.sort_index()
)
validate(units, schema="../schemas/units.schema.yaml")
@@ -58,12 +59,17 @@ units = read_units()
def get_all_input(wildcards):
targets = []
+ targets += expand("results/datavzrd-report/{group}.fdr-controlled", group=GROUPS)
+ targets += expand(
+ "results/tmp/{group}.{category}.qc_plots.marker",
+ group=GROUPS,
+ category=CATEGORIES,
+ )
targets += expand(
- "results/calling/tables/{group}.sorted.annotated.csv", group=GROUPS
+ "results/tmp/{group}.{category}.graph_plots.marker",
+ group=GROUPS,
+ category=CATEGORIES,
)
- targets += ["results/datavzrd-report/all.fdr-controlled"]
- targets += expand("results/tmp/{group}.qc_plots.marker", group=GROUPS)
- targets += expand("results/tmp/{group}.graph_plots.marker", group=GROUPS)
return targets
@@ -82,17 +88,17 @@ def get_group_candidates(wildcards):
scenario = scenario_name(wildcards)
if scenario == "nanopore_only":
sample = list(
- samples.query(f"group == '{group}' & platform == 'nanopore'")["sample"]
+ samples.query(f"group == '{group}' & platform == 'nanopore'")["sample_name"]
)[0]
return f"results/calling/candidate-calls/{sample}.{{scatteritem}}.bcf"
elif scenario == "illumina_only":
sample = list(
- samples.query(f"group == '{group}' & platform == 'illumina'")["sample"]
+ samples.query(f"group == '{group}' & platform == 'illumina'")["sample_name"]
)[0]
return f"results/calling/candidate-calls/{sample}.{{scatteritem}}.bcf"
elif scenario == "nanopore_with_illumina_support":
sample = list(
- samples.query(f"group == '{group}' & platform == 'nanopore'")["sample"]
+ samples.query(f"group == '{group}' & platform == 'nanopore'")["sample_name"]
)[0]
return f"results/calling/candidate-calls/{sample}.{{scatteritem}}.bcf"
else:
@@ -122,17 +128,17 @@ def get_observations(wildcards):
observations = []
- has_nanopore = len(s.query("platform == 'nanopore'")["sample"]) > 0
- has_illumina = len(s.query("platform == 'illumina'")["sample"]) > 0
+ has_nanopore = len(s.query("platform == 'nanopore'")["sample_name"]) > 0
+ has_illumina = len(s.query("platform == 'illumina'")["sample_name"]) > 0
if has_nanopore:
- for sample_nanopore in list(s.query("platform == 'nanopore'")["sample"]):
+ for sample_nanopore in list(s.query("platform == 'nanopore'")["sample_name"]):
observations.append(
f"results/calling/calls/observations/{sample_nanopore}.{{scatteritem}}.bcf"
)
if has_illumina:
- for sample_illumina in list(s.query("platform == 'illumina'")["sample"]):
+ for sample_illumina in list(s.query("platform == 'illumina'")["sample_name"]):
observations.append(
f"results/calling/calls/observations/{sample_illumina}.{{scatteritem}}.bcf"
)
@@ -211,9 +217,7 @@ def get_fastqs(wildcards):
# black wasn't happy about the inline version of this in the params section
def varlociraptor_filtering_mode(wildcards):
- return (
- "--local" if config["calling"]["filter"]["fdr-control"]["local"] is True else ""
- )
+ return config["filter"]["fdr-control"].get("mode", "local-smart")
class Region:
@@ -263,15 +267,27 @@ def parse_bnd_alt(s: str):
CYRCULAR_INFO_FIELDS = ["CircleLength", "CircleSegmentCount", "SplitReads", "Support"]
-def copy_annotation_table_expr():
- return "CHROM,POS,ID,REF,ALT," + ",".join(
- map(lambda s: f"INFO['{s}']", CYRCULAR_INFO_FIELDS)
- )
-
-
-def copy_annotation_vembrane_header_expr():
- return "CHROM,POS,ID,REF,ALT," + ",".join(CYRCULAR_INFO_FIELDS)
-
-
-def copy_annotation_bcftools_annotate_columns():
- return "CHROM,POS,~ID,REF,ALT," + ",".join(CYRCULAR_INFO_FIELDS)
+def get_detail_tables_group_circle_path_for_report(wildcards, input):
+ from pathlib import Path
+
+ group = wildcards.group
+ res = []
+ folder = input.detail_tables
+ group_tsv = pd.read_csv(input.categorized_overview_table, sep="\t")
+ keep_event_ids = set(group_tsv["event_id"])
+ detail_table_files = [f for f in os.listdir(folder) if f.endswith(".tsv")]
+ event_ids = [
+ f"{m.group(1)}-{m.group(2)}"
+ for m in [
+ re.match("graph_([^_]+)_circle_([^_]+)_segments.tsv", path)
+ for path in detail_table_files
+ ]
+ ]
+ for event_id, detail_file in zip(event_ids, detail_table_files):
+ if event_id not in keep_event_ids:
+ continue
+ f = pd.read_csv(f"{folder}/{detail_file}", sep="\t")
+ if f.empty:
+ continue
+ res.append((group, event_id, f"{folder}/{detail_file}"))
+ return res
diff --git a/workflow/rules/datavzrd.smk b/workflow/rules/datavzrd.smk
index 3e3c9975..5c7dc5e7 100644
--- a/workflow/rules/datavzrd.smk
+++ b/workflow/rules/datavzrd.smk
@@ -1,17 +1,44 @@
+from pathlib import Path
+
+
+# TODO: rethink overview_tables and detail_tables
+# I am not really sure how to do this well, but I think we could have
+# just one circle-centered and one segment-centered table each, with
+# proper filtering (probably also in the linkouts from the overview plot)
+# and with current plot linkouts for graph and circle as collapsed /
+# hidden plots right there in the table. We'll need to discuss.
rule render_datavzrd_config:
input:
template=workflow.source_path(
"../resources/datavzrd/circle-table-template.datavzrd.yaml"
),
+ summary_spec=workflow.source_path("../resources/datavzrd/summary_plot.json"),
+ categorized_overview_table="results/calling/tables/{group}/{group}_categorized_overview.tsv",
+ overview_tables=expand(
+ "results/calling/tables/{{group}}/{{group}}_overview.{category}.tsv",
+ category=CATEGORIES,
+ ),
+ detail_tables="results/calling/tables/{group}/{group}_details/",
+ circle_qc_plot_link_formatter=workflow.source_path(
+ "../scripts/circle_qc_plot_link_formatter.js"
+ ),
+ graph_link_formatter=workflow.source_path("../scripts/graph_link_formatter.js"),
+ gene_card_link_formatter=workflow.source_path(
+ "../scripts/gene_card_link_formatter.js"
+ ),
output:
- "resources/datavzrd/all.datavzrd.yaml",
+ "results/datavzrd/{group}.datavzrd.yaml",
params:
- groups=lambda wc: GROUPS,
- calls=lambda wc: [
- f"results/calling/tables/{group}.sorted.annotated.csv" for group in GROUPS
+ categories=CATEGORIES,
+ overview_tables=lambda wc, input: [
+ (file.split("_overview.")[1].replace(".tsv", ""), file)
+ for file in list(input.overview_tables)
],
+ detail_tables=lambda wc, input: get_detail_tables_group_circle_path_for_report(
+ wc, input
+ ),
log:
- "logs/datavzrd_render/all.log",
+ "logs/datavzrd_render/{group}.log",
template_engine:
"yte"
@@ -19,62 +46,59 @@ rule render_datavzrd_config:
rule copy_qc_plots_for_datavzrd:
input:
plots="results/calling/coverage_graphs/{group}",
- report="results/datavzrd-report/all.fdr-controlled",
+ overview="results/calling/tables/{group}/{group}_overview.{category}.tsv",
+ report="results/datavzrd-report/{group}.fdr-controlled",
output:
- marker="results/tmp/{group}.qc_plots.marker",
+ marker="results/tmp/{group}.{category}.qc_plots.marker",
params:
output_dir=lambda wc: directory(
- f"results/datavzrd-report/all.fdr-controlled/circles-{wc.group}/qc_plots"
+ f"results/datavzrd-report/{wc.group}.fdr-controlled/qc_plots"
),
log:
- "logs/datavzrd/copy_qc_plots/{group}.log",
+ "logs/datavzrd/copy_qc_plots/{group}.{category}.log",
conda:
- "../envs/bash.yaml"
- shell:
- """
- mkdir -p {params.output_dir} 2> {log}
- for f in `find {input.plots} -regex '.*/graph_[0-9]+_[0-9]+\.html'`; do ( cp "$f" {params.output_dir} ); done
- touch {output.marker} 2>> {log}
- """
+ "../envs/pandas.yaml"
+ script:
+ "../scripts/copy_qc_plots.py"
rule copy_graph_plots_for_datavzrd:
input:
plots="results/calling/graphs/rendered/{group}",
- report="results/datavzrd-report/all.fdr-controlled",
+ overview="results/calling/tables/{group}/{group}_overview.{category}.tsv",
+ report="results/datavzrd-report/{group}.fdr-controlled",
output:
- marker="results/tmp/{group}.graph_plots.marker",
+ marker="results/tmp/{group}.{category}.graph_plots.marker",
params:
output_dir=lambda wc: directory(
- f"results/datavzrd-report/all.fdr-controlled/circles-{wc.group}/graphs"
+ f"results/datavzrd-report/{wc.group}.fdr-controlled/graphs"
),
log:
- "logs/datavzrd/copy_graph_plots/{group}.log",
+ "logs/datavzrd/copy_graph_plots/{group}.{category}.log",
conda:
- "../envs/bash.yaml"
- shell:
- """
- mkdir -p {params.output_dir}
- for f in `find {input.plots} -regex '.*/graph_[0-9]+\.pdf'`; do ( cp "$f" {params.output_dir} ); done
- touch {output.marker}
- """
+ "../envs/pandas.yaml"
+ script:
+ "../scripts/copy_graph_plots.py"
rule datavzrd_circle_calls:
input:
- config="resources/datavzrd/all.datavzrd.yaml",
- calls=expand(
- "results/calling/tables/{group}.sorted.annotated.csv", group=GROUPS
+ config="results/datavzrd/{group}.datavzrd.yaml",
+ overview_tables=expand(
+ "results/calling/tables/{group}/{group}_overview.{category}.tsv",
+ category=CATEGORIES,
+ allow_missing=True,
),
+ detail_tables="results/calling/tables/{group}/{group}_details/",
output:
report(
- directory("results/datavzrd-report/all.fdr-controlled"),
+ directory("results/datavzrd-report/{group}.fdr-controlled"),
htmlindex="index.html",
category="Circle calls",
),
conda:
"../envs/datavzrd.yaml"
log:
- "logs/datavzrd_report/all.log",
+ "logs/datavzrd_report/{group}.log",
shell:
"datavzrd {input.config} --output {output} &> {log}"
diff --git a/workflow/rules/filter.smk b/workflow/rules/filter.smk
index 098e0b30..f9d81cd4 100644
--- a/workflow/rules/filter.smk
+++ b/workflow/rules/filter.smk
@@ -1,23 +1,37 @@
-rule filter_calls:
+rule filter_overview_table:
input:
- "results/calling/calls/annotated/{group}.bcf",
+ table="results/calling/tables/{group}/{group}_overview.tsv",
output:
- calls="results/calling/calls/filtered/{group}.bcf",
- by_exons=temp("results/calling/calls/filtered/{group}.at_least_one_exon.bcf"),
+ coding="results/calling/tables/{group}/{group}_overview.coding.tsv",
+ regulatory="results/calling/tables/{group}/{group}_overview.regulatory.tsv",
+ intronic="results/calling/tables/{group}/{group}_overview.intronic.tsv",
+ other="results/calling/tables/{group}/{group}_overview.other.tsv",
+ categorized="results/calling/tables/{group}/{group}_categorized_overview.tsv",
log:
- "logs/filter-calls/{group}.log",
+ "logs/filter_overview_table/{group}.log",
+ conda:
+ "../envs/pandas.yaml"
+ script:
+ "../scripts/filter_overview_table.py"
+
+
+rule filter_varlociraptor:
+ input:
+ calls="results/calling/calls/merged/{group}.bcf",
+ output:
+ fdr_calls="results/calling/calls/filtered_fdr/{group}.bcf",
+ log:
+ "logs/filter-varlociraptor/{group}.log",
benchmark:
- "benchmarks/filter-calls/{group}.txt"
+ "benchmarks/filter-varlociraptor/{group}.txt"
conda:
- "../envs/vembrane.yaml"
+ "../envs/varlociraptor.yaml"
threads: 1
params:
- fdr=config["calling"]["filter"]["fdr-control"].get("threshold", 0.05),
+ fdr=config["filter"]["fdr-control"].get("threshold", 0.05),
mode=varlociraptor_filtering_mode,
- filter_expression="True", # 'INFO["NUM_EXONS"] != 0',
shell:
"""
- vembrane filter '{params.filter_expression}' {input} -O bcf > {output.by_exons} 2> {log}
- varlociraptor filter-calls control-fdr {params.mode} --events PRESENT --var BND --fdr {params.fdr} {output.by_exons} | varlociraptor decode-phred | bcftools sort -m 4G -Ob > {output.calls} 2>> {log}
+ varlociraptor filter-calls control-fdr --mode {params.mode} --events PRESENT --var BND --fdr {params.fdr} {input.calls} | varlociraptor decode-phred | bcftools sort -m 4G -Ob > {output.fdr_calls} 2> {log}
"""
diff --git a/workflow/rules/map.smk b/workflow/rules/map.smk
index bbefd76c..8fce0193 100644
--- a/workflow/rules/map.smk
+++ b/workflow/rules/map.smk
@@ -1,7 +1,6 @@
-
rule minimap2_bam:
input:
- target=f"results/calling/index/{REFERENCE}.mmi", # can be either genome index or genome fasta
+ target=rules.minimap2_index.output.index, # can be either genome index or genome fasta
query=get_minimap2_input,
output:
"results/calling/mapping/{sample}.bam",
@@ -12,27 +11,10 @@ rule minimap2_bam:
params:
extra=get_minimap2_mapping_params, # optional
sorting="coordinate", # optional: Enable sorting. Possible values: 'none', 'queryname' or 'coordinate'
- sort_extra=lambda wc: f"-@ {workflow.cores * 0.5}", # optional: extra arguments for samtools/picard
- threads: workflow.cores * 0.5
- wrapper:
- "v1.0.0/bio/minimap2/aligner"
-
-
-rule minimap2_index:
- input:
- target=config["calling"]["reference"]["path"],
- output:
- f"results/calling/index/{REFERENCE}.mmi",
- log:
- f"logs/minimap2_index/{REFERENCE}.log",
- benchmark:
- f"benchmarks/minimap2_index/{REFERENCE}.txt"
- params:
- extra="", # optional additional args
- cache: True
- threads: workflow.cores
+ sort_extra=lambda wc, threads: f"-@ {min(threads, 4)}", # optional: extra arguments for samtools/picard
+ threads: workflow.cores // 2
wrapper:
- "v1.0.0/bio/minimap2/index"
+ "v1.25.0/bio/minimap2/aligner"
rule merge_fastqs:
@@ -68,7 +50,7 @@ rule samtools_index:
# Samtools takes additional threads through its option -@
threads: 12 # This value - 1 will be sent to -@
wrapper:
- "v1.0.0/bio/samtools/index"
+ "v1.25.0/bio/samtools/index"
rule samtools_faidx:
@@ -85,4 +67,4 @@ rule samtools_faidx:
# Samtools takes additional threads through its option -@
threads: 12 # This value - 1 will be sent to -@
wrapper:
- "v1.0.0/bio/samtools/faidx"
+ "v1.25.0/bio/samtools/faidx"
diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
new file mode 100644
index 00000000..752802db
--- /dev/null
+++ b/workflow/rules/ref.smk
@@ -0,0 +1,106 @@
+rule get_genome:
+ output:
+ genome=expand(
+ "resources/{species}.{build}.{release}.fasta",
+ species=config["reference"]["species"],
+ build=config["reference"]["build"],
+ release=config["reference"]["release"],
+ ),
+ log:
+ "logs/get-genome.log",
+ params:
+ species=config["reference"]["species"],
+ datatype="dna",
+ build=config["reference"]["build"],
+ release=config["reference"]["release"],
+ cache: "omit-software" # save space and time with between workflow caching (see docs)
+ wrapper:
+ "v1.25.0/bio/reference/ensembl-sequence"
+
+
+rule genome_faidx:
+ input:
+ rules.get_genome.output.genome,
+ output:
+ index=expand(
+ "{genome}.fai",
+ genome=rules.get_genome.output.genome,
+ ),
+ log:
+ "logs/genome-faidx.log",
+ cache: True
+ wrapper:
+ "v1.25.0/bio/samtools/faidx"
+
+
+rule minimap2_index:
+ input:
+ target=rules.get_genome.output.genome,
+ output:
+ index=expand(
+ "resources/{species}.{build}.{release}.mmi",
+ species=config["reference"]["species"],
+ build=config["reference"]["build"],
+ release=config["reference"]["release"],
+ ),
+ log:
+ "logs/minimap2_index/genome.log",
+ benchmark:
+ "benchmarks/minimap2_index/genome.txt"
+ params:
+ extra="", # optional additional args
+ cache: True
+ # Minimap2 uses at most three threads when indexing target sequences:
+ # https://lh3.github.io/minimap2/minimap2.html
+ threads: 3
+ wrapper:
+ "v1.25.0/bio/minimap2/index"
+
+
+# TODO: create new ENSEMBL-REGULATORY-ANNOTATION snakemake wrapper
+rule download_regulatory_annotation:
+ output:
+ "resources/regulatory_annotation.gff3.gz",
+ log:
+ "logs/download_regulatory_annotation.log",
+ params:
+ release=config["reference"].get("release", "107"),
+ benchmark:
+ "benchmarks/download_regulatory_annotation.txt"
+ cache: "omit-software" # save space and time with between workflow caching (see docs)
+ conda:
+ "../envs/wget.yaml"
+ shell:
+ """wget https://ftp.ensembl.org/pub/release-{params.release}/regulation/homo_sapiens/homo_sapiens.GRCh38.Regulatory_Build.regulatory_features.20220201.gff.gz --no-check-certificate -O {output} 2> {log}"""
+
+
+rule download_repeatmasker_annotation:
+ output:
+ "resources/repeat_masker.fa.out.gz",
+ log:
+ "logs/download_repeatmasker_annotation.log",
+ params:
+ download_link=config["reference"].get("repeat_masker_download_link", ""),
+ benchmark:
+ "benchmarks/download_repeatmasker_annotation.txt"
+ cache: "omit-software" # save space and time with between workflow caching (see docs)
+ conda:
+ "../envs/wget.yaml"
+ shell:
+ """wget {params.download_link} --no-check-certificate -O {output} 2> {log}"""
+
+
+rule download_gene_annotation:
+ output:
+ "resources/gene_annotation.gff3.gz",
+ params:
+ species=config["reference"]["species"],
+ build=config["reference"]["build"],
+ release=config["reference"]["release"],
+ flavor="", # optional, e.g. chr_patch_hapl_scaff, see Ensembl FTP.
+ branch="", # optional: specify branch
+ log:
+ "logs/download_gene_annotation.log",
+ cache: "omit-software" # save space and time with between workflow caching (see docs)
+ wrapper:
+ "v1.25.0/bio/reference/ensembl-annotation"
diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk
deleted file mode 100644
index 83897faa..00000000
--- a/workflow/rules/report.smk
+++ /dev/null
@@ -1,25 +0,0 @@
-
-rule csv_report:
- input:
- csv="results/calling/tables/{group}.sorted.annotated.csv",
- plots="results/calling/coverage_graphs/{group}",
- output:
- report(
- directory("results/calling/report/tables/{group}"),
- category="Circle calls",
- htmlindex="index.html",
- ),
- params:
- extra="--formatter resources/report-table-formatter.js",
- log:
- "logs/rbt-csv-report/{group}.log",
- benchmark:
- "benchmarks/rbt-csv-report/{group}.txt"
- conda:
- "../envs/rbt.yaml"
- shell:
- """
- mkdir -p {output}/qc_plots
- for f in {input.plots}/*.html; do ( cp "$f" {output}/qc_plots/ ); done
- rbt csv-report {input.csv} {output} {params.extra} &> {log}
- """
diff --git a/workflow/rules/table.smk b/workflow/rules/table.smk
deleted file mode 100644
index 846b4c04..00000000
--- a/workflow/rules/table.smk
+++ /dev/null
@@ -1,69 +0,0 @@
-from pathlib import Path
-
-
-rule annotate_breakpoint_sequence:
- input:
- csv="results/calling/tables/{group}.sorted.csv",
- reference=config["calling"]["reference"]["path"],
- output:
- csv="results/calling/tables/{group}.sorted.annotated.csv",
- log:
- "logs/sort_and_tidy_tsv/{group}.log",
- benchmark:
- "benchmarks/sort_and_tidy_tsv/{group}.txt"
- threads: 8
- conda:
- "../envs/table.yaml"
- script:
- "../scripts/annotate_breakpoint_sequence.py"
-
-
-rule sort_and_tidy_tsv:
- input:
- tsv="results/calling/tables/{group}.tsv",
- output:
- csv="results/calling/tables/{group}.sorted.csv",
- log:
- "logs/sort_and_tidy_tsv/{group}.log",
- benchmark:
- "benchmarks/sort_and_tidy_tsv/{group}.txt"
- conda:
- "../envs/table.yaml"
- script:
- "../scripts/sort_and_tidy_tsv.py"
-
-
-rule vembrane_table:
- input:
- vcf="results/calling/calls/filtered/{group}.bcf",
- output:
- tsv="results/calling/tables/{group}.tsv",
- threads: 1
- params:
- expression=", ".join(
- [
- 'INFO["EVENT"]',
- "ID",
- "CHROM",
- "POS",
- "ALT",
- 'INFO["CircleLength"]',
- 'INFO["CircleSegmentCount"]',
- 'INFO["SplitReads"]',
- 'INFO["NUM_EXONS"]',
- '";".join(INFO["GENES"] or [])',
- 'for_each_sample(lambda s: FORMAT["AF"][s])',
- 'INFO["PROB_PRESENT"]',
- 'INFO["PROB_ABSENT"]',
- 'INFO["PROB_ARTIFACT"]',
- ]
- ),
- extra=lambda wc: "--header \"EVENT, ID, CHROM, POS, ALT, circle_length, num_segments, split_reads, NUM_EXONS, GENES, for_each_sample(lambda s: 'AF_' + s), PROB_PRESENT, PROB_ABSENT, PROB_ARTIFACT\"",
- log:
- "logs/vembrane_table/{group}.log",
- benchmark:
- "benchmarks/vembrane_table/{group}.txt"
- conda:
- "../envs/vembrane.yaml"
- shell:
- """vembrane table {params.extra} '{params.expression}' {input.vcf} > {output.tsv}"""
diff --git a/workflow/rules/utils.smk b/workflow/rules/utils.smk
deleted file mode 100644
index 422575e8..00000000
--- a/workflow/rules/utils.smk
+++ /dev/null
@@ -1,25 +0,0 @@
-rule get_genome:
- output:
- "resources/genome.fasta",
- log:
- "logs/get-genome.log",
- params:
- species=config["calling"]["reference"]["species"],
- datatype="dna",
- build=config["calling"]["reference"]["build"],
- release=config["calling"]["reference"]["release"],
- cache: True
- wrapper:
- "v1.0.0/bio/reference/ensembl-sequence"
-
-
-rule genome_faidx:
- input:
- f"resources/{config['calling']['reference']['name']}.fasta",
- output:
- f"resources/{config['calling']['reference']['name']}.fasta.fai",
- log:
- "logs/genome-faidx.log",
- cache: True
- wrapper:
- "v1.0.0/bio/samtools/faidx"
diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml
index 9832ff1d..cd6dd73c 100644
--- a/workflow/schemas/config.schema.yaml
+++ b/workflow/schemas/config.schema.yaml
@@ -5,10 +5,6 @@ description: snakemake configuration file
type: object
definitions:
- filterentry:
- type: object
- additionalProperties:
- type: string
evententry:
type: object
properties:
@@ -25,32 +21,26 @@ definitions:
properties:
- calling:
+ samples:
+ type: string
+ units:
+ type: string
+ reference:
type: object
properties:
- samples:
+ species:
+ type: string
+ release:
+ type: integer
+ build:
type: string
- reference:
- type: object
- properties:
- name:
- type: string
- path:
- type: string
- n_chromosomes:
- type: integer
- species:
- type: string
- release:
- type: integer
- build:
- type: string
- required:
- - name
- - species
- - release
- - build
- - n_chromosomes
+ required:
+ - species
+ - release
+ - build
+ cyrcular:
+ type: object
+ properties:
min_read_depth:
type: number
default: 2
@@ -67,26 +57,30 @@ properties:
type: number
default: 10000
minimum: 1
- filter:
- fdr-control:
- type: object
- properties:
- threshold:
- type: number
- minimum: 0.0
- maximum: 1.0
- local:
- type: boolean
- events:
- $ref: "#/definitions/evententry"
- description: "a map of pairs"
- required:
- - threshold
- - events
required:
- - samples
- - reference
- - filter
-
+ - min_read_depth
+ - min_split_reads
+ - max_paths_per_component
+ - max_deletion_length
+ filter:
+ fdr-control:
+ type: object
+ properties:
+ threshold:
+ type: number
+ minimum: 0.0
+ maximum: 1.0
+ local:
+ type: boolean
+ events:
+ $ref: "#/definitions/evententry"
+ description: "a map of pairs"
+ required:
+ - threshold
+ - events
required:
- - calling
\ No newline at end of file
+ - samples
+ - units
+ - reference
+ - cyrcular
+ - filter
diff --git a/workflow/schemas/samples.schema.yaml b/workflow/schemas/samples.schema.yaml
index 06496e56..e3a15f87 100644
--- a/workflow/schemas/samples.schema.yaml
+++ b/workflow/schemas/samples.schema.yaml
@@ -1,4 +1,4 @@
-$schema: "http://json-schema.org/draft-04/schema#"
+$schema: "http://json-schema.org/draft-07/schema#"
description: an entry in the sample sheet
properties:
@@ -6,7 +6,7 @@ properties:
type: string
description: group name/identifier (alphanumeric string, that may additionally contain '_' and '-'). One sample can consist of multiple units.
pattern: "^[a-zA-Z_0-9-]+$"
- sample:
+ sample_name:
type: string
description: sample name/identifier (alphanumeric string, that may additionally contain '_' and '-')
pattern: "^[a-zA-Z_0-9-]+$"
@@ -20,5 +20,5 @@ properties:
required:
- group
- - sample
- - platform
\ No newline at end of file
+ - sample_name
+ - platform
diff --git a/workflow/schemas/units.schema.yaml b/workflow/schemas/units.schema.yaml
index 660e0ede..37a08c58 100644
--- a/workflow/schemas/units.schema.yaml
+++ b/workflow/schemas/units.schema.yaml
@@ -1,13 +1,13 @@
-$schema: "http://json-schema.org/draft-04/schema#"
+$schema: "http://json-schema.org/draft-07/schema#"
description: row of the units.tsv, representing a sequencing unit, i.e. single-end or paired-end data
type: object
properties:
- sample:
+ sample_name:
type: string
pattern: "^[a-zA-Z_0-9-]+$"
description: sample name/id the unit has been sequenced from (alphanumeric string, that may additionally contain '_' and '-')
- unit:
+ unit_name:
type: string
pattern: "^[a-zA-Z_0-9-]+$"
description: unit id (alphanumeric string, that may additionally contain '_' and '-')
@@ -20,6 +20,6 @@ properties:
description: path to second FASTQ file (leave empty in case of single-end)
required:
- - sample
- - unit
- - fq1
\ No newline at end of file
+ - sample_name
+ - unit_name
+ - fq1
diff --git a/workflow/scripts/annotate_breakpoint_sequence.py b/workflow/scripts/annotate_breakpoint_sequence.py
deleted file mode 100644
index edfd1cb7..00000000
--- a/workflow/scripts/annotate_breakpoint_sequence.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import pandas as pd
-from pandarallel import pandarallel
-from dinopy import FastaReader
-
-pandarallel.initialize(nb_workers=snakemake.threads, progress_bar=True)
-
-
-def sort_table(df: pd.DataFrame):
- ordered = (
- df.groupby(["EVENT"])[["PROB_PRESENT", "length", "split_reads"]]
- .max()
- .sort_values(["PROB_PRESENT", "length", "split_reads"], ascending=False)
- .index
- )
- categories = list(ordered)
- df["EVENT"] = pd.Categorical(df["EVENT"], categories=categories)
- df.sort_values("EVENT", inplace=True)
- return df
-
-
-def main(snakemake):
- df = pd.read_csv(snakemake.input.csv)
- if df.empty:
- df["breakpoint_seq"] = None
- df.to_csv(snakemake.output.csv, index=False, sep=",")
- exit(0)
-
- far = FastaReader(snakemake.input.reference, write_fai=True)
-
- # TODO: needs dir information as well.
- def get_seq(chrom1, start, chrom2, stop, direction, up_down=1000):
- if direction == "from":
- chrom_to, pos_to = chrom1, start - 1
- chrom_from, pos_from = chrom2, stop - 1
- elif direction == "to":
- chrom_to, pos_to = chrom2, stop - 1
- chrom_from, pos_from = chrom1, start - 1
- else:
- print("foo")
- raise ValueError("unknown direction")
-
- entry_from = next(far[chrom_from])
- entry_to = next(far[chrom_to])
- chrom_from_len = entry_from.length
- chrom_to_len = entry_to.length
-
- from_start = max(0, pos_from - up_down)
- from_stop = pos_from
-
- to_start = pos_to
- to_stop = min(pos_to + up_down, chrom_to_len - 1)
-
- seq_from = entry_from.sequence[from_start:from_stop].decode()
- seq_to = entry_to.sequence[to_start:to_stop].decode()
- return seq_from + seq_to
-
- df["breakpoint_seq"] = df.parallel_apply(
- lambda row: get_seq(
- row["CHROM"],
- row["start"],
- row["OTHER_CHROM"],
- row["stop"],
- row["direction"],
- ),
- axis=1,
- )
- df = sort_table(df)
- df.to_csv(snakemake.output.csv, index=False, sep=",")
-
-
-if __name__ == "__main__":
- main(snakemake)
diff --git a/workflow/scripts/annotate_breakpoint_sequence.rs b/workflow/scripts/annotate_breakpoint_sequence.rs
deleted file mode 100644
index e8ac9e09..00000000
--- a/workflow/scripts/annotate_breakpoint_sequence.rs
+++ /dev/null
@@ -1,21 +0,0 @@
-//! This is a regular crate doc comment, but it also contains a partial
-//! Cargo manifest. Note the use of a *fenced* code block, and the
-//! `cargo` "language".
-//!
-//! ```cargo
-//! [dependencies]
-//! csv = "1.1"
-//! rust-bio = "*"
-//! ```
-
-
-use anyhow::Result;
-
-fn main() -> Result<()> {
- let _stdout_redirect = snakemake.redirect_stdout(snakemake.log.stdout)?;
- println!("This will be written to path/to/stdout.log");
-
- // redirect stderr to "path/to/stderr.log"
- let _stderr_redirect = snakemake.redirect_stderr(snakemake.log.stderr)?;
- Ok(())
-}
\ No newline at end of file
diff --git a/workflow/scripts/circle_qc_plot_link_formatter.js b/workflow/scripts/circle_qc_plot_link_formatter.js
new file mode 100644
index 00000000..efd1cb44
--- /dev/null
+++ b/workflow/scripts/circle_qc_plot_link_formatter.js
@@ -0,0 +1,6 @@
+function circle_qc_plot_link_formatter(value, row) {
+ let graph_id = row["graph_id"]
+ let circle_id = row["circle_id"]
+ let link = `${value}`;
+ return link;
+}
diff --git a/workflow/scripts/copy_graph_plots.py b/workflow/scripts/copy_graph_plots.py
new file mode 100644
index 00000000..8151438c
--- /dev/null
+++ b/workflow/scripts/copy_graph_plots.py
@@ -0,0 +1,17 @@
+from contextlib import redirect_stderr
+
+with open(snakemake.log[0], "w") as logfile:
+ with redirect_stderr(logfile):
+ import os
+ import shutil
+ import pandas as pd
+ from pathlib import Path
+
+ os.makedirs(snakemake.params.output_dir, exist_ok=True)
+ overview = pd.read_csv(snakemake.input.overview, sep="\t")
+ graph_ids = set(overview["graph_id"])
+ for graph_id in graph_ids:
+ shutil.copy(
+ f"{snakemake.input.plots}/graph_{graph_id}.pdf", snakemake.params.output_dir
+ )
+ Path(snakemake.output.marker).touch()
diff --git a/workflow/scripts/copy_qc_plots.py b/workflow/scripts/copy_qc_plots.py
new file mode 100644
index 00000000..9ea74325
--- /dev/null
+++ b/workflow/scripts/copy_qc_plots.py
@@ -0,0 +1,18 @@
+from contextlib import redirect_stderr
+
+with open(snakemake.log[0], "w") as logfile:
+ with redirect_stderr(logfile):
+ import os
+ import shutil
+ import pandas as pd
+ from pathlib import Path
+
+ os.makedirs(snakemake.params.output_dir, exist_ok=True)
+ overview = pd.read_csv(snakemake.input.overview, sep="\t")
+ event_ids = {s.replace("-", "_") for s in overview["event_id"]}
+ for event_id in event_ids:
+ shutil.copy(
+ f"{snakemake.input.plots}/graph_{event_id}.html",
+ snakemake.params.output_dir,
+ )
+ Path(snakemake.output.marker).touch()
diff --git a/workflow/scripts/filter_overview_table.py b/workflow/scripts/filter_overview_table.py
new file mode 100644
index 00000000..83f2faf0
--- /dev/null
+++ b/workflow/scripts/filter_overview_table.py
@@ -0,0 +1,24 @@
+from contextlib import redirect_stderr
+
+with open(snakemake.log[0], "w") as logfile:
+ with redirect_stderr(logfile):
+ import pandas as pd
+
+ df = pd.read_csv(snakemake.input.table, sep="\t")
+ df.loc[:, ["gene_names", "gene_ids", "regulatory_features"]] = df.loc[
+ :, ["gene_names", "gene_ids", "regulatory_features"]
+ ].fillna("")
+ df["category"] = df[["num_exons", "regulatory_features", "gene_names"]].apply(
+ lambda r: "coding"
+ if r["num_exons"] > 0
+ else (
+ "regulatory"
+ if r["regulatory_features"]
+ else ("intronic" if r["gene_names"] else "other")
+ ),
+ axis=1,
+ )
+ for kind in ["coding", "regulatory", "intronic", "other"]:
+ part = df.query(f"category == '{kind}'")
+ part.to_csv(getattr(snakemake.output, kind), sep="\t", index=False)
+ df.to_csv(snakemake.output.categorized, sep="\t", index=False)
diff --git a/workflow/scripts/gene_card_link_formatter.js b/workflow/scripts/gene_card_link_formatter.js
new file mode 100644
index 00000000..5887acde
--- /dev/null
+++ b/workflow/scripts/gene_card_link_formatter.js
@@ -0,0 +1,15 @@
+function gene_card_link_formatter(value, row) {
+ $(function () {
+ $('[data-toggle="tooltip"]').tooltip()
+ });
+ let genes = value.split(",");
+ let rows = "";
+ for (let g of genes) {
+ rows = `${rows}${g} |
`;
+ };
+ let table = ``;
+ let button = ``;
+ return button;
+}
diff --git a/workflow/scripts/gff_annotate.py b/workflow/scripts/gff_annotate.py
deleted file mode 100644
index a17397ef..00000000
--- a/workflow/scripts/gff_annotate.py
+++ /dev/null
@@ -1,69 +0,0 @@
-from typing import Tuple
-
-from pysam import (
- VariantFile,
- TabixFile,
- asGFF3,
- VariantRecord,
-)
-import re
-
-
-def parse_attribute(s: str) -> Tuple[str, str]:
- t = s.split("=", maxsplit=1)
- return t[0], t[1]
-
-
-BND_RE = re.compile(r""".*([]\[])((?P.+):(?P[0-9]+))([]\[])?.*""")
-
-
-def parse_bnd_alt(s: str) -> Tuple[str, int]:
- return BND_RE.search(s)["seqname"], int(BND_RE.search(s)["position"])
-
-
-gff_in: TabixFile = TabixFile(snakemake.input["annotation"], parser=asGFF3())
-
-with VariantFile(snakemake.input["variants"]) as bcf_in:
- header = bcf_in.header
- info_genes = r"""##INFO="""
- header.add_line(info_genes)
- info_exons = r"""##INFO="""
- header.add_line(info_exons)
-
- with VariantFile(snakemake.output["variants"], "w", header=header) as bcf_out:
- for record in bcf_in:
- record: VariantRecord = record
- start, end = record.start, record.stop
- if "BND" in record.info["SVTYPE"]:
- mate_chrom, mate_pos = parse_bnd_alt(record.alts[0])
- if mate_chrom == record.chrom:
- end = mate_pos
- start, end = min(start, end), max(start, end)
- num_exons = 0
- try:
- genes = set()
- exons = set()
- try:
- for annot in gff_in.fetch(f"chr{record.chrom}", start, end):
- attributes = dict(map(parse_attribute, annot.attributes.split(";")))
- genes.add(attributes["gene_name"])
- if "exon_number" in attributes and (
- annot.start >= start and annot.end <= end
- ):
- exons.add(attributes["exon_number"])
- except ValueError:
- for annot in gff_in.fetch(f"{record.chrom}", start, end):
- attributes = dict(map(parse_attribute, annot.attributes.split(";")))
- genes.add(attributes["gene_name"])
- if "exon_number" in attributes and (
- annot.start >= start and annot.end <= end
- ):
- exons.add(attributes["exon_number"])
- if len(genes) > 0:
- record.info["GENES"] = list(genes)
- record.info["NUM_EXONS"] = len(set(exons))
- except ValueError:
- # Usually means: no annotations for a specific region
- pass
- finally:
- bcf_out.write(record)
diff --git a/workflow/scripts/graph_link_formatter.js b/workflow/scripts/graph_link_formatter.js
new file mode 100644
index 00000000..07c1d955
--- /dev/null
+++ b/workflow/scripts/graph_link_formatter.js
@@ -0,0 +1,5 @@
+function graph_link_formatter(value, row) {
+ let graph_id = value;
+ let link = `${value}`;
+ return link;
+}
diff --git a/workflow/scripts/sort_and_tidy_tsv.py b/workflow/scripts/sort_and_tidy_tsv.py
deleted file mode 100644
index e5fd05c6..00000000
--- a/workflow/scripts/sort_and_tidy_tsv.py
+++ /dev/null
@@ -1,113 +0,0 @@
-import pandas as pd
-from math import ceil
-import re
-
-
-BND_RE = re.compile(r""".*([]\[])((?P.+):(?P[0-9]+))([]\[])?.*""")
-
-
-def parse_bnd_alt(s: str):
- return BND_RE.search(s)["seqname"], int(BND_RE.search(s)["position"])
-
-
-def sort_table(df: pd.DataFrame):
- ordered = (
- df.groupby(["EVENT"])[["PROB_PRESENT", "length", "split_reads"]]
- .max()
- .sort_values(["PROB_PRESENT", "length", "split_reads"], ascending=False)
- .index
- )
- categories = list(ordered)
- df["EVENT"] = pd.Categorical(df["EVENT"], categories=categories)
- df.sort_values("EVENT", inplace=True)
- return df
-
-
-def main(snakemake):
- df = pd.read_csv(snakemake.input.tsv, sep="\t")
- columns = [
- "graph",
- "EVENT",
- "CHROM",
- "start",
- "OTHER_CHROM",
- "stop",
- "ALT",
- "direction",
- "circle_length",
- "num_segments",
- "split_reads",
- "AF_nanopore",
- "PROB_PRESENT",
- "PROB_ABSENT",
- "PROB_ARTIFACT",
- "length",
- "NUM_EXONS",
- "GENES",
- ]
- # if there are no entries, make sure an empty table with the correct header is written anyways
- if df.empty:
- df = pd.DataFrame(columns=columns)
- df.to_csv(snakemake.output.csv, index=False)
- return
-
- # the list of genes may be too long for excel spreadsheets, hence we split the column into multiple ones
- # from https://stackoverflow.com/questions/58645152/splitting-a-long-string-in-pandas-cell-near-the-n-th-character-position-into-mul
- limit = 32767 - 1
- sep = ";"
- longest = df["GENES"].apply(lambda s: len(s) if isinstance(s, str) else 0).max()
- num_seps = (
- df["GENES"].apply(lambda s: s.count(sep) if isinstance(s, str) else 0).max()
- )
- num_cols = max(1, ceil(longest / (limit - num_seps)))
-
- for index, row in df.iterrows():
- gene_names = row["GENES"]
- num_chars = len(gene_names) if isinstance(gene_names, str) else 0
- if num_chars == 0:
- continue
- genes = gene_names.split(sep)
- num_genes = len(genes)
- col_index = ceil(len(genes) / num_cols)
-
- for _ in range(num_cols - 1):
- genes.insert(col_index, "|")
- col_index += col_index
- new = sep.join(genes)
- df.at[index, "GENES"] = new
-
- df[["OTHER_CHROM", "stop"]] = pd.DataFrame(
- df["ALT"].apply(parse_bnd_alt).tolist(), index=df.index
- )
- df["direction"] = df["ALT"].apply(
- lambda a: "to" if "[" in a else ("from" if "]" in a else "*")
- )
- df.rename(columns=dict(POS="start"), inplace=True)
-
- df["length"] = df["circle_length"]
- df.query("CHROM != OTHER_CHROM")["length"] = float("nan")
- df.drop(columns=["ID"], inplace=True)
-
- new_cols = [f"GENES_{i + 1}" for i in range(num_cols)]
- df[new_cols] = df["GENES"].astype(dtype=str).str.split("|", expand=True)
- df[new_cols] = df[new_cols].apply(lambda s: s.str.strip(sep))
- del df["GENES"]
- df.rename(columns=dict(GENES_1="GENES"), inplace=True)
- gene_cols = [c for c in df.columns if c.startswith("GENES")]
-
- df = sort_table(df)
-
- df.drop_duplicates(
- subset=["EVENT", "CHROM", "start", "OTHER_CHROM", "stop"], inplace=True
- )
-
- df["graph"] = [re.sub(r"_circle_.*", "", d) for d in df["EVENT"]]
-
- # reorder columns
- df = df[columns[:-1] + gene_cols]
-
- df.to_csv(snakemake.output.csv, index=False, sep=",")
-
-
-if __name__ == "__main__":
- main(snakemake)
diff --git a/workflow/scripts/sort_bcf_header.py b/workflow/scripts/sort_bcf_header.py
new file mode 100644
index 00000000..72bdff07
--- /dev/null
+++ b/workflow/scripts/sort_bcf_header.py
@@ -0,0 +1,40 @@
+with open(snakemake.input.header, "rt") as header_file:
+ header = [l.strip() for l in header_file.readlines()]
+ file_format_line = header[0]
+ chrom_line = header[-1]
+ other_lines = header[1:-1]
+ kinds = [
+ "fileDate",
+ "source",
+ "reference",
+ "contig",
+ "phasing",
+ "FILTER",
+ "INFO",
+ "FORMAT",
+ "ALT",
+ "assembly",
+ "META",
+ "SAMPLE",
+ "PEDIGREE",
+ "pedigreeDB",
+ ]
+ categories = {kind: [] for kind in kinds}
+ others = []
+ for line in other_lines:
+ if "=" in line:
+ kind = line.split("=")[0].lstrip("#")
+ group = categories.get(kind, others)
+ else:
+ group = others
+ group.append(line)
+
+ with open(snakemake.output.sorted_header, "wt") as out:
+ print(file_format_line, file=out)
+ for kind in kinds:
+ lines = categories[kind]
+ for line in lines:
+ print(line, file=out)
+ for line in others:
+ print(line, file=out)
+ print(chrom_line, file=out)
diff --git a/workflow/scripts/tsv_to_xlsx.py b/workflow/scripts/tsv_to_xlsx.py
deleted file mode 100644
index dec56efa..00000000
--- a/workflow/scripts/tsv_to_xlsx.py
+++ /dev/null
@@ -1,7 +0,0 @@
-import sys
-import pandas as pd
-
-sys.stderr = open(snakemake.log[0], "w")
-
-data = pd.read_csv(snakemake.input.tsv, sep="\t")
-data.to_excel(snakemake.output.xlsx, index=False)