diff --git a/.github/workflows/conventional-prs.yaml b/.github/workflows/conventional-prs.yaml new file mode 100644 index 00000000..210988fa --- /dev/null +++ b/.github/workflows/conventional-prs.yaml @@ -0,0 +1,18 @@ +name: "Lint PR for conventional commits: https://www.conventionalcommits.org" + +on: + pull_request_target: + types: + - opened + - reopened + - edited + - synchronize + +jobs: + main: + name: Validate PR title + runs-on: ubuntu-latest + steps: + - uses: amannn/action-semantic-pull-request@v5 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml new file mode 100644 index 00000000..e4614386 --- /dev/null +++ b/.github/workflows/main.yaml @@ -0,0 +1,58 @@ +name: Tests + +on: + push: + branches: [ main ] + pull_request: + branches_ignore: [] + + +jobs: + Formatting: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + # Full git history is needed to get a proper + # list of changed files within `super-linter` + fetch-depth: 0 + - name: Formatting + uses: github/super-linter@v4 + env: + VALIDATE_ALL_CODEBASE: false + DEFAULT_BRANCH: main + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + VALIDATE_SNAKEMAKE_SNAKEFMT: true + + Linting: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Lint workflow + uses: snakemake/snakemake-github-action@v1 + with: + directory: . + snakefile: workflow/Snakefile + stagein: "mamba install -y -n snakemake --channel conda-forge pyarrow=6.0" + args: "--lint" + + Testing: + runs-on: ubuntu-latest + needs: + - Linting + - Formatting + steps: + - uses: actions/checkout@v3 + - name: Test workflow + uses: snakemake/snakemake-github-action@v1 + with: + directory: .test + snakefile: workflow/Snakefile + args: "--configfile .test/config/config.yaml --use-conda --show-failed-logs -j 10 --conda-cleanup-pkgs cache" + + - name: Test report + uses: snakemake/snakemake-github-action@v1 + with: + directory: .test + snakefile: workflow/Snakefile + args: "--configfile .test/config/config.yaml --report report.zip" diff --git a/.github/workflows/release-please.yaml b/.github/workflows/release-please.yaml new file mode 100644 index 00000000..3dd2575f --- /dev/null +++ b/.github/workflows/release-please.yaml @@ -0,0 +1,15 @@ +name: "release-please, see: https://github.com/marketplace/actions/release-please-action" + +on: + push: + branches: + - main + +jobs: + release-please: + runs-on: ubuntu-latest + steps: + - uses: google-github-actions/release-please-action@v3 + with: + release-type: simple + token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..c123a441 --- /dev/null +++ b/.gitignore @@ -0,0 +1,14 @@ +.test/benchmarks/** +.test/logs/** +.test/resources/*.fai +.test/resources/*.gz +.test/resources/*.mmi +.test/results/** +.test/.snakemake/** +benchmarks/** +logs/** +resources/** +!resources/datavzrd/** +!resources/summary_plot.json +results/** +.snakemake/** \ No newline at end of file diff --git a/.test/config/config.yaml b/.test/config/config.yaml new file mode 100644 index 00000000..3e72af79 --- /dev/null +++ b/.test/config/config.yaml @@ -0,0 +1,40 @@ +samples: config/samples.tsv +units: config/units.tsv + +reference: + # NOTE: ANY CHANGE OF SPECIES, RELEASE OR BUILD IN THIS TESTING + # CONFIG.YAML REQUIRES ADJUSTING THE FILENAME OF RESOURCE/*.FASTA + # Ensembl species name + species: homo_sapiens + # Genome build + build: GRCh38 + # Ensembl release + release: 107 + # for available downloads, please browse either of these views: + # * http://repeatmasker.org/genomicDatasets/RMGenomicDatasets.html + # * http://repeatmasker.org/genomicDatasets/RMGenomicDatasetsAlt.html + repeat_masker_download_link: "http://www.repeatmasker.org/genomes/hg38/RepeatMasker-rm406-dfam2.0/hg38.fa.out.gz" + + +cyrcular: + # minimum read depth (used during covered segment identification) + min_read_depth: 2 + # minimum number of split reads (edges with less than this number will be removed from the graph) + min_split_reads: 5 + # maximum number of plausible circular paths generated for each strongly connected component of the graph + max_paths_per_component: 15 + # maximum deletion length which is encoded as an edge / a breakend event. Can be useful for unmapped regions in the reference. + max_deletion_length: 10000 + +filter: + # varlociraptor fdr control + fdr-control: + threshold: 0.1 + mode: local-smart + events: + circular: + varlociraptor: + - present + circles: + min-length: 100 + max-length: 50_000_000 diff --git a/.test/config/samples.tsv b/.test/config/samples.tsv new file mode 100644 index 00000000..675ffa0c --- /dev/null +++ b/.test/config/samples.tsv @@ -0,0 +1,2 @@ +group sample_name platform +example example nanopore diff --git a/.test/config/units.tsv b/.test/config/units.tsv new file mode 100644 index 00000000..0d68373b --- /dev/null +++ b/.test/config/units.tsv @@ -0,0 +1,2 @@ +sample_name unit_name fq1 fq2 +example 0 data/example_reads.fastq.gz diff --git a/.test/data/example_reads.fastq.gz b/.test/data/example_reads.fastq.gz new file mode 100644 index 00000000..cc1f575a Binary files /dev/null and b/.test/data/example_reads.fastq.gz differ diff --git a/resources/example_ref.fasta b/.test/resources/homo_sapiens.GRCh38.107.fasta similarity index 71% rename from resources/example_ref.fasta rename to .test/resources/homo_sapiens.GRCh38.107.fasta index 6476a91a..d42c3915 100644 --- a/resources/example_ref.fasta +++ b/.test/resources/homo_sapiens.GRCh38.107.fasta @@ -1,4 +1,113 @@ ->M +>2 +AGGCTGTGACAGTCATCTGTCTGGACGCGCTGGGTGGATGCGGGGGGCTCCTGGGAACTG +TGTTGGAGCCGAGCAAGCGCTAGCCAGGCGCAAGCGCGCACAGACTGTAGCCATCCGAGG +ACACCCCCGCCCCCCCGGCCCACCCGGAGACACCCGCGCAGAATCGCCTCCGGATCCCCT +GCAGTCGGCGGGAGGTAAGGAGCAGGGCTTGCAAACCGCCCGGCGCCCAGGGAAGCGACG +AGCGCCGGGGCAAGGCAAGCCCTGGACGGGATTGCGACGTGCGCACCGGGCGCCCTAATA +TGCCCGGGGGACTGTTTCTGCTTCCGAAACAAAACCATCTCTGGGTTTTCCCAGAAAAGC +CAGTTCCAGCCCCGAAGGCATCCTGGCTAGAGGAGACCCGCCCTAATCCTTTTGCAGCCC +TTACCGGGGGGAGTAATGGCTTCTGCGAAAAGAAATTCCCTCGGCTCTAGAAGATCTGTC +TGTGTTTGAGCTGTCGGAGAGCCGGTGCGTCCCCACCCCAGGCTGGGGTTCTTCTCCAAA +GGGTGCCCCTGGAGGAAGAAGAGGGGGGGATTAGGCAGGGCGAGGCCGCCGCGGTCGCAA +TCTGGGTCACGGCTGCTCCAGCTTGGAGGAGAGGCGGCTCTCCCGGCGACCCTCCTCGCG +CGGGCGCCCCTGCCATTCCCGGGAACAGGGGCTCAGCCTCTCCCTCCCTGGAAGAGGACG +TTGTCGTGGGTTTGGAAGAGCAGGGGTGGGCTTAGAGAGCTTCCAATTAAGCTATTGGCA +GGAGTATCCCTGCAGCGGGTGAATGCCGAGGGGCGTTTGCTCAAATTTGGGGAGGGGAAG +GATTTGTGGATATGGGTGTCTGTTGTTGGTCTCTGTCTAGAGAAAGGCTTTTTTTTATTT +GCAAAGTTTTCTAAATCCCCTGCTATCATTTGCACTCCTGAGGTTGCATTTTTACAAAGG +GGGTAGAAGGTACTCCAAATACCATTCCCGGTAGCTGGGTCGGAGAGCCTGGGGCTTCCC +CTGAGCAGCCGGCCCCACACCGCTGCGAGTGCGGTTGTCTGCGTGCTCGTGAGAGCTAGA +ATTCTGCAGCCAGGAACAGCCCCCTCCCCCAGGCAGTGCCTTGTGTGAATGAAATGGCAG +TTTCCAAAGTTGCGGAGCCTCGCCACCACCCCCTGCATCTGCATGCCCCCTCCCACCCCC +TGTCGTAGACAGCTTGTACACAAAAGGAGGGCGGGAGGGAGGGAGCGAGAGGCACAACTT +CCTCCACCTTCGGGAGCAGTGGGCAGAGTGGGGGGCTTGGAGGGAAGATTGGGGAACCTG +GTTAGAGGGGGCGCCCATTGCCTATCCCCTCGGTCTGCCCCGTTTGCCCACCCTCTCCGG +TGTGTCTGTCGGTTGCAGTGTTGGAGGTCGGCGCCGGCCCCCGCCTTCCGCGCCCCCCAC +GGGAAGGAAGCACCCCCGGTATTAAAACGAACGGGGCGGAAAGAAGCCCTCAGTCGCCGG +CCGGGAGGCGAGCCGATGCCGAGCTGCTCCACGTCCACCATGCCGGGCATGATCTGCAAG +AACCCAGACCTCGAGTTTGACTCGCTACAGCCCTGCTTCTACCCGGACGAAGATGACTTC +TACTTCGGCGGCCCCGACTCGACCCCCCCGGGGGAGGACATCTGGAAGAAGTTTGAGCTG +CTGCCCACGCCCCCGCTGTCGCCCAGCCGTGGCTTCGCGGAGCACAGCTCCGAGCCCCCG +AGCTGGGTCACGGAGATGCTGCTTGAGAACGAGCTGTGGGGCAGCCCGGCCGAGGAGGAC +GCGTTCGGCCTGGGGGGACTGGGTGGCCTCACCCCCAACCCGGTCATCCTCCAGGACTGC +ATGTGGAGCGGCTTCTCCGCCCGCGAGAAGCTGGAGCGCGCCGTGAGCGAGAAGCTGCAG +CACGGCCGCGGGCCGCCAACCGCCGGTTCCACCGCCCAGTCCCCGGGAGCCGGCGCCGCC +AGCCCTGCGGGTCGCGGGCACGGCGGGGCTGCGGGAGCCGGCCGCGCCGGGGCCGCCCTG +CCCGCCGAGCTCGCCCACCCGGCCGCCGAGTGCGTGGATCCCGCCGTGGTCTTCCCCTTT +CCCGTGAACAAGCGCGAGCCAGCGCCCGTGCCCGCAGCCCCGGCCAGTGCCCCGGCGGCG +GGCCCTGCGGTCGCCTCGGGGGCGGGTATTGCCGCCCCAGCCGGGGCCCCGGGGGTCGCC +CCTCCGCGCCCAGGCGGCCGCCAGACCAGCGGCGGCGACCACAAGGCCCTCAGTACCTCC +GGAGAGGACACCCTGAGCGATTCAGGTAAAGACCGAACTCGGGTCCGGCTGCCTCCCTGG +GGCACTGGACCCCGGGTCGCGTCCCCTTTGTTAGTGCTCGTATGTCTTGGCCTGGGGAGC +ATTTTGGAGGCAGTGCTAGGGGCAGAGAGGTCCTGTTTCCCCCAAGTCTCTCCTCGGGGT +AAAGAGAAGGGGCTGAGAGAATGCCGTTGCAAAAGGGGTGCTCTCCAATTCTCGCCTTCA +CTAAAGTTCCTTCCACCCTCTCCTGGGGAGCCCTCCTCTAGGCCATCACGGGCCCTCACC +CGGTCCCCCACCTCTCTTTTGCAGCGCAGTCTGAGGAATAAAATTGGAGAAAGTTGGTGG +CTAAACCGGGTGGGGGTTTAGGGGGTTGCTGGGTGCACTGCCTGGACAGAAACCTGTTAG +CGCAGGGGTGAAAGGGACTCTCTGGCCCAGGTCAGGGGAGGGAAAGACATCCCGAGAAGA +TTCAAGGGCTGTGCAAAGCCCTGTTTAAGGCGCAGGAACTTATAGGAGGGTTGCACAGAT +GGCTAGAGCCGATTTTCTATTCTTTTTCTTTTTCTTTTTTTTTTTTTTTCAAATGTCGGT +ACCTTTCCCTTCCCCCATCCTCGGTGGGTGGTGGGCTATTTGCTCCTGGTGCGTGGCCAG +CAGGCGGCGATATGCGAGGCCAGCAGGCGGGCCCGGGATCTGAAAGGCTGGGGGTGGTGG +GGGCACCCTCCCTCCCTCCATTCAGCAGCTGGCTGCAAGTGCAACAGCAGTTGTGTACAT +TCTCAGGGGGCCTCCTCTTTCCAGTGTGCAGTGGAAACTGGCTGTAGTTTTGTCTTCCAG +CCTGAATTCCAGGCCTAATTTGAGATGTGAGTTGTATCTGTAACCCAGTGCCCTTGAAGG +TGAGGGCAGGCACTCAGCAGCCTCTCCAGGAAGGCTCACATCCTGGGAGGACTCACTGAT +TAGTTCTATTGTGTTCATTTGTCTGTGTCTTAAGCTGAAGGGAAGAGTTAAAACCAAGCC +TTTCCCTGGGGGTCTGGATGAACAGAACTCAACCCAAAGAGTGGCATTGCCTTGTCCTTG +GAGCAGGGAGCTGGGACCCCCCTTGGACTTTGAAAACCAGTGTTTTCAGAATGCAGGTGG +ATAACAAGCCTAAATTTACTTCTGGGCTGAGGAGAGATCTTTGAGGCTCCTGGAAGGAAA +CTTGGTGATAAGCCTCCAGTTTGAAACGGCTCTGTCCCTTTAATGTCTGTGCCTTGACAG +CTTTTGGTGAGGAAGCACTTCCTTCCAACAGCTGTCTTCTTGGCAGAAAACCAAAACATT +GGCTTAAAGGGACCCACAGACTGGAACAGCCTCACATTTCGGCTTTAGAACAAATCCCAC +AATTGTTCAGCTTTCCGGTCCCCTTCAGATCAAGCAGAAGATATGTTTTGATTTTCATGC +TTGTATTTTAAACAATAATTTTCTACCCCAGCGTGGTAGTCAATGAGGAGAGAGGGGAAG +AATGCGCACATGATGCTACACGTTTCTGTTGTTGCTGTTATTATTGGTGGCTTTGAGGAG +AGCTGCTCCCATTTGGGGGTTTATACCAACTGTGGATTATGGCTTTGTCATTAAGATTTG +ATCTTTGTTAAATGAAAAACTGTTTATTGTATAAAACTCAGGTTTGTGGACGAAAAGTTG +TTTTTTTTCTTCAGTTAATTAAATTGTTCCTCAAGTTTGTTTAAGGACTTAAAATCAAAC +ACAACCATGTGTAAACTGCTAAATGAGGCTCCTAAAATGAGAGGCCTCAACTCTTTAAGT +GTGGAGCTAGAAATGTAAATAAGTCCACAGGGCAGACTGGTGATTATGATAAAAGCTACC +ATTTACTGAGCATCTGTCTACTAGGCTCAGCTCTATGCTAAGTCTACATGTTATCTGTCA +AAGTGGTATCATCCCCATTTAATAGCTGAGGAAACAGAGGCTTAGAAAGGCTGGGTAACT +TGACCAGGGTCATGCAACTAGTCTGCGGTGGAGCCAGGATTCTGTCTGACCCTAAAGGCC +AAGTTCTTTATATTTATTTCTACCACCTGCTAAAGTCTTGAATGGAGGCTGAAAGCACAG +TTGGGGTATGGGGAAGAAAAATATATATACATACATATATGTATATGTATGTATGTATGT +ATGGGGGGTTGTTTTGTTTTTGTTTTTGATAAGGAGTTTTGCTCTTGTTGCCCAGGCTGG +AGTGCAGTGGTATGATCTGGGCTCACTGCAACCTCCGCCTCCCGGGTTCAAGTCATTCTC +CTGCCTCAGCCTCCCGAGTAGCTGGGATTACCGGAGCATGCCACCACACCCAGCAAAGTT +TTGTATTTTTAGTAGAGACAGGGTTTCACCATGTTGGCCAGGCTGATCTTGAACTCCTCA +TCTCAGGTGATCTGCCCGCCTCCGCTTCCCAAAGTGCTGGGATTACAGGTGTGAGTCACC +GCGTCCGGCCTACAGATATATTTAATTTAAAGAGATCTAAAACAAATACAAAACTGTCCA +CATCTATGTTGATGGACCCATAAAAATAGCAGTCTGCCAGGGTCTGCCGGAAGAGACAGA +TAAGCATACATATTAACATGGATATATATGTGAATTTCATTCAAATGGTTCTCACATGAG +AGTAACTAGCATCTTTCTCTCAGATGATGAAGATGATGAAGAGGAAGATGAAGAGGAAGA +AATCGACGTGGTCACTGTGGAGAAGCGGCGTTCCTCCTCCAACACCAAGGCTGTCACCAC +ATTCACCATCACTGTGCGTCCCAAGAACGCAGCCCTGGGTCCCGGGAGGGCTCAGTCCAG +CGAGCTGATCCTCAAACGATGCCTTCCCATCCACCAGCAGCACAACTATGCCGCCCCCTC +TCCCTACGTGGAGAGTGAGGATGCACCCCCACAGAAGAAGATAAAGAGCGAGGCGTCCCC +ACGTCCGCTCAAGAGTGTCATCCCCCCAAAGGCTAAGAGCTTGAGCCCCCGAAACTCTGA +CTCGGAGGACAGTGAGCGTCGCAGAAACCACAACATCCTGGAGCGCCAGCGCCGCAACGA +CCTTCGGTCCAGCTTTCTCACGCTCAGGGACCACGTGCCGGAGTTGGTAAAGAATGAGAA +GGCCGCCAAGGTGGTCATTTTGAAAAAGGCCACTGAGTATGTCCACTCCCTCCAGGCCGA +GGAGCACCAGCTTTTGCTGGAAAAGGAAAAATTGCAGGCAAGACAGCAGCAGTTGCTAAA +GAAAATTGAACACGCTCGGACTTGCTAGACGCTTCTCAAAACTGGACAGTCACTGCCACT +TTGCACATTTTGATTTTTTTTTTAAACAAACATTGTGTTGACATTAAGAATGTTGGTTTA +CTTTCAAATCGGTCCCCTGTCGAGTTCGGCTCTGGGTGGGCAGTAGGACCACCAGTGTGG +GGTTCTGCTGGGACCTTGGAGAGCCTGCATCCCAGGATGCTGGGTGGCCCTGCAGCCTCC +TCCACCTCACCTCCATGACAGCGCTAAACGTTGGTGACGGTTGGGAGCCTCTGGGGCTGT +TGAAGTCACCTTGTGTGTTCCAAGTTTCCAAACAACAGAAAGTCATTCCTTCTTTTTAAA +ATGGTGCTTAAGTTCCAGCAGATGCCACATAAGGGGTTTGCCATTTGATACCCCTGGGGA +ACATTTCTGTAAATACCATTGACACATCCGCCTTTTGTATACATCCTGGGTAATGAGAGG +TGGCTTTTGCGGCCAGTATTAGACTGGAAGTTCATACCTAAGTACTGTAATAATACCTCA +ATGTTTGAGGAGCATGTTTTGTATACAAATATATTGTTAATCTCTGTTATGTACTGTACT +AATTCTTACACTGCCTGTATACTTTAGTATGACGCTGATACATAACTAAATTTGATACTT +ATATTTTCGTATGAAAATGAGTTGTGAAAGTTTTGAGTAGATATTACTTTATCACTTTTT +GAACTAAGAAACTTTTGTAAAGAAATTTACTATATATATATGCCTTTTTCCTAGCCTGTT +TCTTCCTGTTAATGTATTTGTTCATGTTTGGTGCATAGAACTGGGTAAATGCAAAGTTCT +GTGTTTAATTTCTTCAAAATGTATATATTTAGTGCTGCATCTTATAGCACTTTGAAATAC +CTCATGTTTATGAAAATAAATAGCTTAAAATTAAATGA +>MT GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTT CGTCTGGGGGGTATGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTC GCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATT diff --git a/.test/resources/scenarios/illumina_circle_scenario.yaml b/.test/resources/scenarios/illumina_circle_scenario.yaml new file mode 120000 index 00000000..9e071a00 --- /dev/null +++ b/.test/resources/scenarios/illumina_circle_scenario.yaml @@ -0,0 +1 @@ +../../../resources/scenarios/illumina_circle_scenario.yaml \ No newline at end of file diff --git a/.test/resources/scenarios/nanopore_circle_scenario.yaml b/.test/resources/scenarios/nanopore_circle_scenario.yaml new file mode 120000 index 00000000..7c897dfa --- /dev/null +++ b/.test/resources/scenarios/nanopore_circle_scenario.yaml @@ -0,0 +1 @@ +../../../resources/scenarios/nanopore_circle_scenario.yaml \ No newline at end of file diff --git a/.test/resources/scenarios/nanopore_illumina_joint_circle_scenario.yaml b/.test/resources/scenarios/nanopore_illumina_joint_circle_scenario.yaml new file mode 120000 index 00000000..35e2e1ae --- /dev/null +++ b/.test/resources/scenarios/nanopore_illumina_joint_circle_scenario.yaml @@ -0,0 +1 @@ +../../../resources/scenarios/nanopore_illumina_joint_circle_scenario.yaml \ No newline at end of file diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 00000000..4a1bffa7 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,45 @@ +cff-version: 1.2.0 +message: "If you use this workflow, please cite the article from preferred-citation and the workflow itself." +authors: +- family-names: "Hartmann" + given-names: "Till" + orcid: "https://orcid.org/0000-0002-6993-347X" +- family-names: "Lähnemann" + given-names: "David" + orcid: "https://orcid.org/0000-0002-9138-4112" +title: "snakemake-workflows/cyrcular-calling" +version: 1.2.0 +doi: Zenodo DOI TBD +date-released: 2023-04-06 +url: "https://github.com/snakemake-workflows/cyrcular-calling" +preferred-citation: + type: article + authors: + - family-names: "Tüns" + given-names: "Alicia Isabell" + orcid: "https://orcid.org/0000-0003-0025-1304" + - family-names: "Hartmann" + given-names: "Till" + orcid: "https://orcid.org/0000-0002-6993-347X" + - family-names: "Magin" + given-names: "Simon" + - family-names: "Chamorro González" + given-names: "Rocío" + orcid: "https://orcid.org/0000-0002-5653-7259" + - family-names: "Henssen" + given-names: "Anton George" + orcid: "https://orcid.org/0000-0003-1534-778X" + - family-names: "Rahmann" + given-names: "Sven" + orcid: "https://orcid.org/0000-0002-8536-6065" + - family-names: "Schramm" + given-names: "Alexander" + orcid: "https://orcid.org/0000-0001-7670-7529" + - family-names: "Köster" + given-names: "Johannes" + orcid: "https://orcid.org/0000-0001-9818-9320" + doi: "10.3389/fgene.2022.867018" + journal: "Frontiers in Genetics" + month: 05 + year: 2022 + title: "Detection and Validation of Circular DNA Fragments Using Nanopore Sequencing" diff --git a/README.md b/README.md index 5d7bcdad..ac81ccef 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ # Snakemake workflow: cyrcular-calling [![Snakemake](https://img.shields.io/badge/snakemake-≥6.15.0-brightgreen.svg)](https://snakemake.github.io) - +[![Tests](https://github.com/snakemake-workflows/cyrcular-calling/actions/workflows/main.yaml/badge.svg)](https://github.com/snakemake-workflows/cyrcular-calling/actions/workflows/main.yaml) +[![Conventional Commits](https://img.shields.io/badge/Conventional%20Commits-1.0.0-%23FE5196?logo=conventionalcommits&logoColor=white)](https://conventionalcommits.org) A Snakemake workflow for ecDNA detection in nanopore sequencing reads derived from DNA samples enriched for circular DNA. diff --git a/config/config.yaml b/config/config.yaml index eb59b0aa..0868ac4e 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,23 +1,20 @@ -calling: - samples: config/samples.tsv - units: config/units.tsv +samples: config/samples.tsv +units: config/units.tsv - reference: - # if `path` is not set, name should just be "genome" atm. - name: "example_ref" - # Custom path overrides everything below that, if file exists - # (in that case, path must match "resources/" + name + ".fasta") - path: "resources/example_ref.fasta" - # The first n entries of the FASTA will be considered. - n_chromosomes: 25 - # Ensembl species name - species: homo_sapiens - # Ensembl release - release: 100 - # Genome build - build: GRCh38 +reference: + # Ensembl species name + species: homo_sapiens + # Genome build + build: GRCh38 + # Ensembl release + release: 107 + # for available downloads, please browse either of these views: + # * http://repeatmasker.org/genomicDatasets/RMGenomicDatasets.html + # * http://repeatmasker.org/genomicDatasets/RMGenomicDatasetsAlt.html + repeat_masker_download_link: "http://www.repeatmasker.org/genomes/hg38/RepeatMasker-rm406-dfam2.0/hg38.fa.out.gz" +cyrcular: # minimum read depth (used during covered segment identification) min_read_depth: 2 # minimum number of split reads (edges with less than this number will be removed from the graph) @@ -27,12 +24,15 @@ calling: # maximum deletion length which is encoded as an edge / a breakend event. Can be useful for unmapped regions in the reference. max_deletion_length: 10000 - filter: - # varlociraptor fdr control - fdr-control: - threshold: 0.1 - local: true - events: - circular: - varlociraptor: - - present +filter: + # varlociraptor fdr control + fdr-control: + threshold: 0.1 + mode: local-smart + events: + circular: + varlociraptor: + - present + circles: + min-length: 100 + max-length: 50_000_000 diff --git a/config/samples.tsv b/config/samples.tsv index 83b46ad0..b1c90139 100644 --- a/config/samples.tsv +++ b/config/samples.tsv @@ -1,3 +1 @@ -group sample platform -Example Example nanopore -#Example Example_Illumina illumina +group sample_name platform diff --git a/config/units.tsv b/config/units.tsv index 9e486578..2cc2b5d4 100644 --- a/config/units.tsv +++ b/config/units.tsv @@ -1,3 +1 @@ -sample unit fq1 fq2 -Example 0 resources/examples/example_reads.fastq.gz -#Example_illumina 0 /path/to/Example_illumina.1.fastq /path/to/Example_illumina.2.fastq +sample_name unit_name fq1 fq2 diff --git a/resources/examples/example_reads.fastq.gz b/resources/examples/example_reads.fastq.gz deleted file mode 100644 index 21534a9c..00000000 Binary files a/resources/examples/example_reads.fastq.gz and /dev/null differ diff --git a/workflow/Snakefile b/workflow/Snakefile index 21dd27e2..9eb38fe6 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -12,12 +12,11 @@ scattergather: include: "rules/common.smk" -include: "rules/utils.smk" +include: "rules/ref.smk" include: "rules/map.smk" include: "rules/call.smk" include: "rules/annotate.smk" include: "rules/filter.smk" -include: "rules/table.smk" include: "rules/plot.smk" include: "rules/datavzrd.smk" diff --git a/workflow/envs/bcftools.yaml b/workflow/envs/bcftools.yaml index 655c1073..0d0e8da5 100644 --- a/workflow/envs/bcftools.yaml +++ b/workflow/envs/bcftools.yaml @@ -2,4 +2,4 @@ channels: - conda-forge - bioconda dependencies: - - bcftools =1.12 + - bcftools =1.16 diff --git a/workflow/envs/blast.yaml b/workflow/envs/blast.yaml deleted file mode 100644 index f6d24bac..00000000 --- a/workflow/envs/blast.yaml +++ /dev/null @@ -1,5 +0,0 @@ -channels: - - bioconda - - conda-forge -dependencies: - - blast =2.11 diff --git a/workflow/envs/cyrcular.post-deploy.sh b/workflow/envs/cyrcular.post-deploy.sh index b082ae71..fe44fa11 100644 --- a/workflow/envs/cyrcular.post-deploy.sh +++ b/workflow/envs/cyrcular.post-deploy.sh @@ -1 +1,6 @@ -cargo install --git https://github.com/tedil/cyrcular.git --branch main --rev ec9fc44682134fad6cc3730eacfe3860cacf6dc3 --locked +# this is needed to have the `ruget` binary installed by `plotly_kaleido` +# available in the PATH, it is removed from PATH after installation of cyrcular +PATH="$HOME/.cargo/bin:$PATH" +# TODO: merge branch `annotate` in cyrcular and create bioconda recipe +cargo install --git https://github.com/tedil/cyrcular.git --branch annotate --rev 44ba5274b9f2aee7d193b774343acaccf0edec53 --locked --root "${CONDA_PREFIX}" +PATH=${PATH#[^:]*:} diff --git a/workflow/envs/cyrcular.yaml b/workflow/envs/cyrcular.yaml index da398401..90287a7f 100644 --- a/workflow/envs/cyrcular.yaml +++ b/workflow/envs/cyrcular.yaml @@ -2,7 +2,7 @@ channels: - conda-forge - bioconda dependencies: - - rust =1.58.1 + - rust =1.67.1 - git =2.35.0 - compilers =1.3.0 - pkg-config =0.29.2 @@ -11,7 +11,7 @@ dependencies: - gsl =2.7 - libcblas =3.9.0 - openssl =3.0.0 - - zlib =1.2.11 + - zlib >=1.2.12,<1.3 - bzip2 =1.0.8 - xz =5.2.5 - clangdev =13.0.0 diff --git a/workflow/envs/datavzrd.yaml b/workflow/envs/datavzrd.yaml index ae558e60..abcf0e78 100644 --- a/workflow/envs/datavzrd.yaml +++ b/workflow/envs/datavzrd.yaml @@ -1,4 +1,4 @@ channels: - conda-forge dependencies: - - datavzrd =1.16 \ No newline at end of file + - datavzrd =2.18 diff --git a/workflow/envs/excel.yaml b/workflow/envs/excel.yaml deleted file mode 100644 index 22cac73c..00000000 --- a/workflow/envs/excel.yaml +++ /dev/null @@ -1,6 +0,0 @@ -channels: - - conda-forge - - bioconda -dependencies: - - openpyxl =3.0.7 - - pandas =1.2.5 diff --git a/workflow/envs/gff.yaml b/workflow/envs/gff.yaml deleted file mode 100644 index 1d6e7dba..00000000 --- a/workflow/envs/gff.yaml +++ /dev/null @@ -1,8 +0,0 @@ -channels: - - conda-forge - - bioconda -dependencies: - - bcftools =1.13 - - pysam - - gff3sort - - pigz =2.6 diff --git a/workflow/envs/graphviz.yaml b/workflow/envs/graphviz.yaml index 288eafe8..7fee320c 100644 --- a/workflow/envs/graphviz.yaml +++ b/workflow/envs/graphviz.yaml @@ -1,6 +1,5 @@ channels: - - bioconda - conda-forge dependencies: - - graphviz =2.49.0 + - graphviz =7.1 - imagemagick =7.1 diff --git a/workflow/envs/nanosim.yaml b/workflow/envs/nanosim.yaml deleted file mode 100644 index 72c8b779..00000000 --- a/workflow/envs/nanosim.yaml +++ /dev/null @@ -1,5 +0,0 @@ -channels: - - bioconda - - conda-forge -dependencies: - - nanosim ==3.0 diff --git a/workflow/envs/canu.yaml b/workflow/envs/pandas.yaml similarity index 57% rename from workflow/envs/canu.yaml rename to workflow/envs/pandas.yaml index 75637a5b..3b4a8191 100644 --- a/workflow/envs/canu.yaml +++ b/workflow/envs/pandas.yaml @@ -1,5 +1,4 @@ channels: - - bioconda - conda-forge dependencies: - - canu =2.1.1 + - pandas =2.0 diff --git a/workflow/envs/bash.yaml b/workflow/envs/pigz.yaml similarity index 74% rename from workflow/envs/bash.yaml rename to workflow/envs/pigz.yaml index e32d4a8b..35aa8a3c 100644 --- a/workflow/envs/bash.yaml +++ b/workflow/envs/pigz.yaml @@ -1,4 +1,4 @@ channels: - conda-forge dependencies: - - bash =5.1 + - pigz =2.6 diff --git a/workflow/envs/flye.yaml b/workflow/envs/python.yaml similarity index 57% rename from workflow/envs/flye.yaml rename to workflow/envs/python.yaml index f0560735..b4f05da5 100644 --- a/workflow/envs/flye.yaml +++ b/workflow/envs/python.yaml @@ -1,5 +1,4 @@ channels: - - bioconda - conda-forge dependencies: - - flye =2.8.3 + - python =3.11 diff --git a/workflow/envs/rbt.yaml b/workflow/envs/rbt.yaml index c1a22ee7..9041876b 100644 --- a/workflow/envs/rbt.yaml +++ b/workflow/envs/rbt.yaml @@ -1,6 +1,6 @@ channels: - - bioconda - conda-forge + - bioconda dependencies: - - rust-bio-tools =0.39.1 - - bcftools =1.14 + - rust-bio-tools =0.42 + - bcftools =1.16 diff --git a/workflow/envs/samtools.yaml b/workflow/envs/samtools.yaml deleted file mode 100644 index 3a89d373..00000000 --- a/workflow/envs/samtools.yaml +++ /dev/null @@ -1,5 +0,0 @@ -channels: - - bioconda - - conda-forge -dependencies: - - samtools =1.12 diff --git a/workflow/envs/table.yaml b/workflow/envs/table.yaml deleted file mode 100644 index 8d237599..00000000 --- a/workflow/envs/table.yaml +++ /dev/null @@ -1,7 +0,0 @@ -channels: - - conda-forge - - bioconda -dependencies: - - pandas >=1.3.4 - - pandarallel >=1.5.4 - - dinopy >=2.2.0 diff --git a/workflow/envs/varlociraptor.yaml b/workflow/envs/varlociraptor.yaml index 3b89eff9..9455e39c 100644 --- a/workflow/envs/varlociraptor.yaml +++ b/workflow/envs/varlociraptor.yaml @@ -1,4 +1,6 @@ channels: - conda-forge + - bioconda dependencies: - - varlociraptor =5.2.0 + - varlociraptor =7.0 + - bcftools =1.16 \ No newline at end of file diff --git a/workflow/envs/vcf_annotate.yaml b/workflow/envs/vcf_annotate.yaml index 29f9b102..4fc6aa00 100644 --- a/workflow/envs/vcf_annotate.yaml +++ b/workflow/envs/vcf_annotate.yaml @@ -2,7 +2,5 @@ channels: - conda-forge - bioconda dependencies: - - bcftools =1.15 - - htslib =1.15 - - vembrane =0.12 + - bcftools =1.16 - ripgrep =13.0 diff --git a/workflow/envs/vembrane.yaml b/workflow/envs/vembrane.yaml deleted file mode 100644 index 84517e33..00000000 --- a/workflow/envs/vembrane.yaml +++ /dev/null @@ -1,6 +0,0 @@ -channels: - - conda-forge - - bioconda -dependencies: - - vembrane =0.7 - - bcftools =1.12 diff --git a/workflow/envs/wget.yaml b/workflow/envs/wget.yaml index 7a924a6b..7672f299 100644 --- a/workflow/envs/wget.yaml +++ b/workflow/envs/wget.yaml @@ -1,4 +1,4 @@ channels: - conda-forge dependencies: - - wget =1.20.3 + - wget =1.20 diff --git a/workflow/resources/datavzrd/circle-table-template.datavzrd.yaml b/workflow/resources/datavzrd/circle-table-template.datavzrd.yaml index 55da5f25..277f1301 100644 --- a/workflow/resources/datavzrd/circle-table-template.datavzrd.yaml +++ b/workflow/resources/datavzrd/circle-table-template.datavzrd.yaml @@ -1,100 +1,173 @@ name: Circle calls __definitions__: - - | - event_link = f""" - function(value, row) {{ - var filename = value.replace("_circle_", "_"); - var link = `${{value}}`; - return link; - }} - """ - - | - graph_link = f""" - function(value, row) {{ - var filename = value; - var link = `${{value}}`; - return link; - }} - """ + # TODO: once yaml-specified parameters become available for custom-path + # functions in datavzrd, move breakpoint_link to separate: + # workflow/scripts/breakpoint_link_formatter.js - | - gene_link = f""" + breakpoint_link = f""" function(value, row) {{ - $(function () {{ - $('[data-toggle="tooltip"]').tooltip() - }}); - let genes = value.split(";"); - let rows = ""; - for (let g of genes) {{ - rows = `${{rows}}${{g}}`; - }}; - let table = `
${{rows}}
Genes
`; - let button = ` - - `; - return button; + let primer_blast_url = `https://www.ncbi.nlm.nih.gov/tools/primer-blast/index.cgi?LINK_LOC=bookmark&INPUT_SEQUENCE=${{value}}&PRIMER5_END=900&PRIMER3_START=1100&PRIMER_PRODUCT_MIN=200&PRIMER_PRODUCT_MAX=1200&ORGANISM={config["reference"]["species"].replace("_", "%20")}&NEWWIN=on&NEWWIN=on&SHOW_SVIEWER=true`; + let seq = "…" + value.substring(value.length / 2 - 10, value.length / 2 + 10) + "…"; + let link = `${{seq}}`; + return link; }} """ + # TODO: once yaml-specified parameters become available for custom-path + # functions in datavzrd, move regions_links to separate: + # workflow/scripts/regions_links_formatter.js - | - breakpoint_link = f""" + regions_links = f""" function(value, row) {{ - var primer_blast_url = `https://www.ncbi.nlm.nih.gov/tools/primer-blast/index.cgi?LINK_LOC=bookmark&INPUT_SEQUENCE=${{value}}&PRIMER5_END=900&PRIMER3_START=1100&PRIMER_PRODUCT_MIN=200&PRIMER_PRODUCT_MAX=1200&ORGANISM=Homo%20sapiens&NEWWIN=on&NEWWIN=on&SHOW_SVIEWER=true`; - var seq = "…" + value.substring(value.length / 2 - 10, value.length / 2 + 10) + "…"; - var link = `${{seq}}`; - return link; + let regions = value.split(","); + let links = []; + for (let region of regions) {{ + let ensembl_url = `https://www.ensembl.org/{config["reference"]["species"]}/Location/Overview?r=${{region}}`; + let link = `${{region}}`; + links.push(link); + }} + let result = links.join(", "); + return result; }} """ +default-view: "summary-plot" + datasets: - ?for group, path in zip(params.groups, params.calls): - ?f"{group}-circles": + ?for category, path in params.overview_tables: + ?f"data-{category}-circles": + path: ?path + separator: "\t" + links: + circle-details: + column: event_id + view: ?f"detail-{wildcards.group}-{{value}}-segments" + ?for group, circle, path in params.detail_tables: + ?f"data-{group}-{circle}-segments": path: ?path - separator: "," + separator: "\t" + "overview-table": + path: ?input.categorized_overview_table + separator: "\t" + links: + link to category: + column: category + view: "circles-{value}" + # link to circle: + # column: event_id + # table-row: ?f"data-{CATEGORY}-circles/event_id" views: - ?for group in params.groups: - ?f"circles-{group}": - desc: | - Overview table showing all discovered circles for a group. - dataset: ?f"{group}-circles" + ?for category in params.categories: + ?f"circles-{category}": + desc: ?f"""Overview table showing all discovered circles for category `{category}`.\n + + | Column | Description |\n + | --------------------- | ------------------------------------------------------------------------------- |\n + | `event_id` | Id of the event, format `{{graph_id}}-{{circle_id}}` |\n + | `graph_id` | Id of the graph the circle belongs to |\n + | `circle_id` | Id of the circle in its parent graph |\n + | `segment_count` | Number of segments with coverage |\n + | `regions` | Genomic regions of only those segments with coverage |\n + | `num_exons` | Number of exons in coverage segments |\n + | `gene_names` | Genes overlapping with coverage segments |\n + | `regulatory_features` | Regulatory features overlapping with coverage segments |\n + | `repeats` | Repeats known to repeatMasker overlapping with coverage segments |\n + | `num_split_reads` | Total number of split reads on all split edges of the circle |\n + | `prob_present` | Probability of a circle being present as calculated by varlociraptor |\n + | `prob_absent` | Probability of a circle being absent as calculated by varlociraptor |\n + | `prob_artifact` | Probability of a circle being deemed an artefact as calculated by varlociraptor |\n + | `af_*` | Allele frequency of the circle for the respective sample |\n + | `category` | The circle's category, either of `coding`, `regulatory` or `intronic` |\n + | last column | Link to a detailed view of all of the circle's segments |\n + + """ + dataset: ?f"data-{category}-circles" + render-table: + columns: + "graph_id": + custom-path: ?input.graph_link_formatter + "circle_id": + custom-path: ?input.circle_qc_plot_link_formatter + "regions": + custom: ?regions_links + "regex('prob_(.+)')": + plot: + heatmap: + scale: linear + domain: [0.0, 1.0] + range: + - white + - "#c1d7a8" + "gene_names": + custom-path: ?input.gene_card_link_formatter + "gene_ids": + display-mode: hidden + "regex('af_(.+)')": + plot: + ticks: + scale: "linear" + domain: [0.0, 1.0] + aux-domain-columns: + - "regex('af_(.+)')" + + ?for group, circle, _ in params.detail_tables: + ?f"detail-{group}-{circle}-segments": + hidden: true + desc: ?f"""Detail table containing all covered segments and circle junctions for circle **{circle}** of group *{group}*.\n + + | Column | Description |\n + | --------------------- | ------------------------------------------------------------------------------- |\n + | `graph_id` | Id of the graph the circle belongs to |\n + | `circle_id` | Id of the circle in its parent graph |\n + | `kind` | Whether the edge is a `coverage` or a `split` edge |\n + | `target_from` | The name of the sequence from which the edge starts |\n + | `from` | The genomic locus in `target_from` from which the edge starts |\n + | `target_to` | The name of the sequence at which the edge ends |\n + | `to` | The genomic locus in `target_to` at which the edge ends |\n + | `length` | Length of the edge in bp, if applicable |\n + | `num_exons` | Number of exons in a coverage segment |\n + | `gene_names` | Genes overlapping with a coverage segment |\n + | `regulatory_features` | Regulatory features overlapping with a coverage segment |\n + | `repeats` | Repeats known to repeatMasker overlapping with a coverage segment |\n + | `coverage` | Mean read depth for the edge in the circle |\n + | `num_split_reads` | Number of split reads for the edge in the circle |\n + | `breakpoint_sequence` | Link to a NCBI primer-blast with form's fields pre-filled |\n + + """ + dataset: ?f"data-{group}-{circle}-segments" render-table: - direction: - display-mode: hidden - "regex('AF_(.+)')": - plot: - ticks: - scale: "linear" - domain: [0.0, 1.0] - aux-domain-columns: - - "regex('AF_(.+)')" - "circle_length": - plot: - ticks: - scale: "linear" - aux-domain-columns: - - "circle_length" - "length": - plot: - ticks: - scale: "linear" - aux-domain-columns: - - "length" - "regex('PROB_(.+)')": - plot: - heatmap: - scale: linear - domain: [0.0, 1.0] - range: - - white - - "#c1d7a8" - "GENES": - custom: ?gene_link - "breakpoint_seq": - custom: ?breakpoint_link - "EVENT": - custom: ?event_link - "graph": - custom: ?graph_link - "NUM_EXONS": - display-mode: hidden + columns: + "graph_id": + custom-path: ?input.graph_link_formatter + "circle_id": + custom-path: ?input.circle_qc_plot_link_formatter + "kind": + plot: + heatmap: + scale: ordinal + color-scheme: accent + "breakpoint_sequence": + custom: ?breakpoint_link + "gene_names": + custom-path: ?input.gene_card_link_formatter + "gene_ids": + display-mode: hidden + + "summary-plot": + desc: | + Circle length distributions by category. + Click on any of the bars to get to the respective category's overview table (or choose directly from the "views" dropdown menu). + + Categories are defined as follows: + | Category | Condition | + | ------------ | --------------------------------------------------------------- | + | `coding` | at least 1 exon | + | `regulatory` | not `coding` and at least 1 regulatory feature annotated | + | `intronic` | neither `coding` nor `regulatory` and at least 1 gene annotated | + | `other` | none of the above | + + dataset: ?f"overview-table" + render-plot: + spec-path: ?input.summary_spec \ No newline at end of file diff --git a/workflow/resources/datavzrd/summary_plot.json b/workflow/resources/datavzrd/summary_plot.json new file mode 100644 index 00000000..fd020c6e --- /dev/null +++ b/workflow/resources/datavzrd/summary_plot.json @@ -0,0 +1,130 @@ +{ + "$schema": "https://vega.github.io/schema/vega-lite/v5.json", + "autosize": { + "type": "fit", + "contains": "padding" + }, + "title": { + "text": "Number of circles and circle lengths by category" + }, + "transform": [ + { + "calculate": "indexof(['coding', 'regulatory', 'intronic', 'other'], datum.category)", + "as": "order" + } + ], + "vconcat": [ + { + "width": 800, + "resolve": { + "scale": { + "x": "independent", + "y": "independent" + } + }, + "mark": { + "type": "bar", + "tooltip": true + }, + "encoding": { + "x": { + "field": "category", + "aggregate": "count", + "stack": true, + "title": "number of circles" + }, + "color": { + "field": "category", + "sort": [ + "coding", + "regulatory", + "intronic", + "other" + ], + "scale": { + "type": "ordinal", + "scheme": "set1", + "domain": [ + "coding", + "regulatory", + "intronic", + "other" + ] + } + }, + "order": { + "field": "order", + "type": "nominal" + }, + "href": { + "field": "link to category" + } + } + }, + { + "mark": { + "type": "bar" + }, + "height": 150, + "width": 200, + "resolve": { + "scale": { + "x": "shared", + "y": "shared" + } + }, + "encoding": { + "x": { + "bin": { + "minstep": 500, + "maxbins": 100 + }, + "field": "circle_length", + "type": "quantitative" + }, + "y": { + "title": "circle count", + "aggregate": "count", + "type": "quantitative" + }, + "color": { + "field": "category", + "type": "nominal", + "sort": [ + "coding", + "regulatory", + "intronic", + "other" + ], + "scale": { + "type": "ordinal", + "scheme": "set1", + "domain": [ + "coding", + "regulatory", + "intronic", + "other" + ] + } + }, + "column": { + "field": "category", + "type": "nominal", + "sort": [ + "coding", + "regulatory", + "intronic", + "other" + ] + }, + "order": { + "field": "order", + "type": "nominal" + }, + "href": { + "field": "link to category" + } + } + } + ] +} \ No newline at end of file diff --git a/workflow/rules/annotate.smk b/workflow/rules/annotate.smk index c2c461be..65b56a54 100644 --- a/workflow/rules/annotate.smk +++ b/workflow/rules/annotate.smk @@ -1,101 +1,118 @@ -rule extract_vcf_header_lines_for_bcftools_annotate: +## annotate genes, exons and breakpoint sequences; produce one overview table containing circles, and one table with details of each segment for each circle + + +rule cyrcular_generate_tables: input: - vcf="results/calling/candidates/{sample}.sorted.bcf", + reference=rules.get_genome.output.genome, + graph="results/calling/graphs/{group}.annotated.graph", + bcf="results/calling/calls/filtered_fdr/reheader/{group}.bcf", output: - header=temp("results/calling/annotation/{sample}.header_lines.txt"), - params: - fields="|".join(CYRCULAR_INFO_FIELDS), - conda: - "../envs/vcf_annotate.yaml" + overview="results/calling/tables/{group}/{group}_overview.tsv", + details=directory("results/calling/tables/{group}/{group}_details/"), + threads: 1 log: - "logs/re-annotate/header_{sample}.log", + "logs/cyrcular_generate_tables/{group}.log", + benchmark: + "benchmarks/cyrcular_generate_tables/{group}.txt" + conda: + "../envs/cyrcular.yaml" shell: - """ - bcftools view -h {input.vcf} | rg {params.fields:q} > {output.header} 2> {log} - """ + """cyrcular graph table {input.graph} {input.bcf} --reference {input.reference} --circle-table {output.overview} --segment-tables {output.details} 2> {log}""" -rule copy_annotation_from_cyrcular: +rule cyrcular_annotate_graph: input: - variants="results/calling/calls/pre_annotated/{sample}.bcf", - variants_index="results/calling/calls/pre_annotated/{sample}.bcf.csi", - candidates_with_annotation="results/calling/candidates/{sample}.sorted.bcf", - candidates_with_annotation_index="results/calling/candidates/{sample}.sorted.bcf.csi", - header_lines="results/calling/annotation/{sample}.header_lines.txt", + reference=rules.get_genome.output.genome, + graph="results/calling/graphs/{group}.graph", + gene_annotation="resources/gene_annotation.gff3.gz", + regulatory_annotation="resources/regulatory_annotation.gff3.gz", + repeat_annotation=lambda wc: "resources/repeat_masker.fa.out.gz" + if config["reference"].get("repeat_masker_download_link", "") + else "", output: - variants="results/calling/calls/annotated/{sample}.bcf", - annotation=temp("results/calling/candidates/{sample}.sorted.bcf.tab"), - annotation_bgzip=temp("results/calling/candidates/{sample}.sorted.bcf.tab.bgz"), - annotation_bgzip_tabix=temp( - "results/calling/candidates/{sample}.sorted.bcf.tab.bgz.tbi" - ), + annotated="results/calling/graphs/{group}.annotated.graph", + threads: 1 log: - "logs/re-annotate/{sample}.log", + "logs/cyrcular_annotate_graph/{group}.log", benchmark: - "benchmarks/re-annotate/{sample}.txt" + "benchmarks/cyrcular_annotate_graph/{group}.txt" conda: - "../envs/vcf_annotate.yaml" + "../envs/cyrcular.yaml" params: - header=copy_annotation_vembrane_header_expr(), - table_expr=copy_annotation_table_expr(), - columns=copy_annotation_bcftools_annotate_columns(), + repeat_annotation=lambda wc, input: f" --repeat-annotation {input.repeat_annotation} " + if config["reference"].get("repeat_masker_download_link", "") + else "", + shell: + "cyrcular graph annotate " + " --reference {input.reference} " + " --gene-annotation {input.gene_annotation} " + " --regulatory-annotation {input.regulatory_annotation} " + " --output {output.annotated} " + " {input.graph} " + "2> {log} " + + +rule reheader_filtered_bcf: + input: + bcf="results/calling/calls/filtered_fdr/{group}.bcf", + sorted_header="results/calling/calls/filtered_fdr/reheader/{group}.header.sorted.txt", + output: + bcf="results/calling/calls/filtered_fdr/reheader/{group}.bcf", + log: + "logs/reheader_filtered_bcf/{group}.log", + conda: + "../envs/bcftools.yaml" shell: + ## bcftools re-header seems to re-order entries + # bcftools reheader --header {input.sorted_header} --output {output.bcf} {input.bcf} + ## so we have to re-header ourselves + ## TODO: reinvestigate to find a cleaner solution """ - vembrane table --header {params.header:q} {params.table_expr:q} {input.candidates_with_annotation} > {output.annotation} 2> {log} - bgzip -c {output.annotation} > {output.annotation_bgzip} 2>> {log} - tabix -p vcf --zero-based -S 1 -f {output.annotation_bgzip} 2>> {log} - bcftools annotate --header-lines {input.header_lines} --annotations {output.annotation_bgzip} --columns {params.columns} --output-type b --output {output.variants} {input.variants} 2>> {log} + cat {input.sorted_header} <(bcftools view -H {input.bcf}) | bcftools view -Ob > {output.bcf} 2> {log} """ -rule annotate_genes: +rule sort_bcf_header: input: - annotation="resources/gencode.v38.annotation.sorted.gff3.gz", - variants="results/calling/calls/merged/{sample}.bcf", + bcf="results/calling/calls/filtered_fdr/{group}.bcf", + header="results/calling/calls/filtered_fdr/{group}.header.txt", output: - variants="results/calling/calls/pre_annotated/{sample}.bcf", - threads: 1 + sorted_header="results/calling/calls/filtered_fdr/reheader/{group}.header.sorted.txt", log: - "logs/annotate_genes/{sample}.log", - benchmark: - "benchmarks/annotate_genes/{sample}.txt" + "logs/sort_bcf_header/{group}.log", conda: - "../envs/gff.yaml" + "../envs/python.yaml" script: - "../scripts/gff_annotate.py" + "../scripts/sort_bcf_header.py" -rule sort_annotation: +rule get_bcf_header: input: - "resources/gencode.v38.annotation.gff3.gz", + bcf="{file}.bcf", output: - gff="resources/gencode.v38.annotation.sorted.gff3.gz", - tbi="resources/gencode.v38.annotation.sorted.gff3.gz.tbi", - csi="resources/gencode.v38.annotation.sorted.gff3.gz.csi", - conda: - "../envs/gff.yaml" + header="{file}.header.txt", log: - "logs/sort_annotation/gencode.v38.annotation.gff3.log", - benchmark: - "benchmarks/sort_annotation/gencode.v38.annotation.gff3.txt" - threads: 48 + "logs/get_bcf_header/{file}.log", + conda: + "../envs/bcftools.yaml" shell: """ - gff3sort.pl <(pigz -dc {input}) | bgzip -@ {threads} -c > {output.gff} 2> {log} - tabix {output.gff} 2>> {log} - tabix --csi {output.gff} 2>> {log} + bcftools view -h {input.bcf} > {output.header} 2> {log} """ -rule download_annotation: +rule extract_vcf_header_lines_for_bcftools_annotate: + input: + vcf="results/calling/candidates/{sample}.sorted.bcf", output: - "resources/gencode.v38.annotation.gff3.gz", - log: - "logs/download_annotation/gencode.v38.annotation.gff3.log", - benchmark: - "benchmarks/download_annotation/gencode.v38.annotation.gff3.txt" - cache: True + header=temp("results/calling/annotation/{sample}.header_lines.txt"), + params: + fields="|".join(CYRCULAR_INFO_FIELDS), conda: - "../envs/wget.yaml" + "../envs/vcf_annotate.yaml" + log: + "logs/re-annotate/header_{sample}.log", shell: - """wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/gencode.v38.annotation.gff3.gz --no-check-certificate -O {output} 2> {log}""" + """ + bcftools view -h {input.vcf} | rg {params.fields:q} > {output.header} 2> {log} + """ diff --git a/workflow/rules/call.smk b/workflow/rules/call.smk index 532b5a9b..92cb9b58 100644 --- a/workflow/rules/call.smk +++ b/workflow/rules/call.smk @@ -10,7 +10,7 @@ rule bcf_index: benchmark: "benchmarks/bcftools/index/{prefix}.txt" wrapper: - "v1.0.0/bio/bcftools/index" + "v1.25.0/bio/bcftools/index" rule bcftools_concat: @@ -32,7 +32,7 @@ rule bcftools_concat: uncompressed_bcf=False, extra="-a", # optional parameters for bcftools concat (except -o) wrapper: - "v1.0.0/bio/bcftools/concat" + "v1.25.0/bio/bcftools/concat" rule bcftools_sort: @@ -40,13 +40,12 @@ rule bcftools_sort: "results/calling/calls/initial/{group}.{scatteritem}.bcf", output: "results/calling/calls/initial_sorted/{group}.{scatteritem}.bcf", - threads: 2 log: "logs/sort_calls/{group}.{scatteritem}.log", benchmark: "benchmarks/sort_calls/{group}.{scatteritem}.txt" wrapper: - "v1.0.0/bio/bcftools/sort" + "v1.25.0/bio/bcftools/sort" rule varlociraptor_call: @@ -72,8 +71,8 @@ rule varlociraptor_call: rule varlociraptor_alignment_properties: input: - ref=config["calling"]["reference"]["path"], - ref_idx=config["calling"]["reference"]["path"] + ".fai", + ref=rules.get_genome.output.genome, + ref_idx=rules.genome_faidx.output.index, bam="results/calling/mapping/{sample}.bam", output: "results/calling/alignment-properties/{sample}.json", @@ -87,8 +86,8 @@ rule varlociraptor_alignment_properties: rule varlociraptor_preprocess: input: - ref=config["calling"]["reference"]["path"], - ref_idx=config["calling"]["reference"]["path"] + ".fai", + ref=rules.get_genome.output.genome, + ref_idx=rules.genome_faidx.output.index, candidates=get_group_candidates, bam="results/calling/mapping/{sample}.bam", bai="results/calling/mapping/{sample}.bam.bai", @@ -135,20 +134,20 @@ rule sort_bnd_bcfs: "results/calling/candidates/{sample}.bcf", output: "results/calling/candidates/{sample}.sorted.bcf", - threads: 2 log: "logs/sort_bnds/{sample}.log", benchmark: "benchmarks/sort_bnds/{sample}.txt" wrapper: - "v1.0.0/bio/bcftools/sort" + "v1.25.0/bio/bcftools/sort" rule circle_bnds: input: bam="results/calling/mapping/{sample}.bam", bai="results/calling/mapping/{sample}.bam.bai", - ref=config["calling"]["reference"]["path"], + ref=rules.get_genome.output.genome, + ref_index=rules.genome_faidx.output.index, output: bnds="results/calling/candidates/{sample}.bcf", graph="results/calling/graphs/{sample}.graph", @@ -159,10 +158,10 @@ rule circle_bnds: "benchmarks/cyrcular/{sample}.txt" threads: 4 params: - min_read_depth=config["calling"]["min_read_depth"], # 2 - min_split_reads=config["calling"]["min_split_reads"], # 5 - max_paths_per_component=config["calling"]["max_paths_per_component"], # 15 - max_deletion_length=config["calling"]["max_deletion_length"], # 10000, + min_read_depth=config["cyrcular"]["min_read_depth"], # 2 + min_split_reads=config["cyrcular"]["min_split_reads"], # 5 + max_paths_per_component=config["cyrcular"]["max_paths_per_component"], # 15 + max_deletion_length=config["cyrcular"]["max_deletion_length"], # 10000, conda: "../envs/cyrcular.yaml" shell: diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index de0d6f17..2c32dcde 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -2,31 +2,25 @@ import pandas as pd import re from snakemake.utils import validate -REFERENCE = config["calling"]["reference"]["name"] - validate(config, schema="../schemas/config.schema.yaml") -wildcard_constraints: - sample="[a-zA-Z_0-9-]+", - - def read_samples(): samples = ( pd.read_csv( - config["calling"]["samples"], + config["samples"], sep="\t", - dtype={"sample": str, "group": str}, + dtype={"sample_name": str, "group": str}, comment="#", ) - .set_index("sample", drop=False) + .set_index("sample_name", drop=False) .sort_index() ) def _group_or_sample(row): group = row.get("group", None) if pd.isnull(group): - return row["sample"] + return row["sample_name"] return group samples["group"] = [_group_or_sample(row) for _, row in samples.iterrows()] @@ -35,18 +29,25 @@ def read_samples(): samples = read_samples() +SAMPLES = list(sorted(set(samples["sample_name"]))) GROUPS = list(sorted(set(samples["group"]))) +CATEGORIES = ["coding", "regulatory", "intronic", "other"] + + +wildcard_constraints: + sample="|".join(SAMPLES), + group="|".join(GROUPS), def read_units(): units = ( pd.read_csv( - config["calling"]["units"], + config["units"], sep="\t", - dtype={"sample": str, "unit": str}, + dtype={"sample_name": str, "unit_name": str}, comment="#", ) - .set_index(["sample", "unit"], drop=False) + .set_index(["sample_name", "unit_name"], drop=False) .sort_index() ) validate(units, schema="../schemas/units.schema.yaml") @@ -58,12 +59,17 @@ units = read_units() def get_all_input(wildcards): targets = [] + targets += expand("results/datavzrd-report/{group}.fdr-controlled", group=GROUPS) + targets += expand( + "results/tmp/{group}.{category}.qc_plots.marker", + group=GROUPS, + category=CATEGORIES, + ) targets += expand( - "results/calling/tables/{group}.sorted.annotated.csv", group=GROUPS + "results/tmp/{group}.{category}.graph_plots.marker", + group=GROUPS, + category=CATEGORIES, ) - targets += ["results/datavzrd-report/all.fdr-controlled"] - targets += expand("results/tmp/{group}.qc_plots.marker", group=GROUPS) - targets += expand("results/tmp/{group}.graph_plots.marker", group=GROUPS) return targets @@ -82,17 +88,17 @@ def get_group_candidates(wildcards): scenario = scenario_name(wildcards) if scenario == "nanopore_only": sample = list( - samples.query(f"group == '{group}' & platform == 'nanopore'")["sample"] + samples.query(f"group == '{group}' & platform == 'nanopore'")["sample_name"] )[0] return f"results/calling/candidate-calls/{sample}.{{scatteritem}}.bcf" elif scenario == "illumina_only": sample = list( - samples.query(f"group == '{group}' & platform == 'illumina'")["sample"] + samples.query(f"group == '{group}' & platform == 'illumina'")["sample_name"] )[0] return f"results/calling/candidate-calls/{sample}.{{scatteritem}}.bcf" elif scenario == "nanopore_with_illumina_support": sample = list( - samples.query(f"group == '{group}' & platform == 'nanopore'")["sample"] + samples.query(f"group == '{group}' & platform == 'nanopore'")["sample_name"] )[0] return f"results/calling/candidate-calls/{sample}.{{scatteritem}}.bcf" else: @@ -122,17 +128,17 @@ def get_observations(wildcards): observations = [] - has_nanopore = len(s.query("platform == 'nanopore'")["sample"]) > 0 - has_illumina = len(s.query("platform == 'illumina'")["sample"]) > 0 + has_nanopore = len(s.query("platform == 'nanopore'")["sample_name"]) > 0 + has_illumina = len(s.query("platform == 'illumina'")["sample_name"]) > 0 if has_nanopore: - for sample_nanopore in list(s.query("platform == 'nanopore'")["sample"]): + for sample_nanopore in list(s.query("platform == 'nanopore'")["sample_name"]): observations.append( f"results/calling/calls/observations/{sample_nanopore}.{{scatteritem}}.bcf" ) if has_illumina: - for sample_illumina in list(s.query("platform == 'illumina'")["sample"]): + for sample_illumina in list(s.query("platform == 'illumina'")["sample_name"]): observations.append( f"results/calling/calls/observations/{sample_illumina}.{{scatteritem}}.bcf" ) @@ -211,9 +217,7 @@ def get_fastqs(wildcards): # black wasn't happy about the inline version of this in the params section def varlociraptor_filtering_mode(wildcards): - return ( - "--local" if config["calling"]["filter"]["fdr-control"]["local"] is True else "" - ) + return config["filter"]["fdr-control"].get("mode", "local-smart") class Region: @@ -263,15 +267,27 @@ def parse_bnd_alt(s: str): CYRCULAR_INFO_FIELDS = ["CircleLength", "CircleSegmentCount", "SplitReads", "Support"] -def copy_annotation_table_expr(): - return "CHROM,POS,ID,REF,ALT," + ",".join( - map(lambda s: f"INFO['{s}']", CYRCULAR_INFO_FIELDS) - ) - - -def copy_annotation_vembrane_header_expr(): - return "CHROM,POS,ID,REF,ALT," + ",".join(CYRCULAR_INFO_FIELDS) - - -def copy_annotation_bcftools_annotate_columns(): - return "CHROM,POS,~ID,REF,ALT," + ",".join(CYRCULAR_INFO_FIELDS) +def get_detail_tables_group_circle_path_for_report(wildcards, input): + from pathlib import Path + + group = wildcards.group + res = [] + folder = input.detail_tables + group_tsv = pd.read_csv(input.categorized_overview_table, sep="\t") + keep_event_ids = set(group_tsv["event_id"]) + detail_table_files = [f for f in os.listdir(folder) if f.endswith(".tsv")] + event_ids = [ + f"{m.group(1)}-{m.group(2)}" + for m in [ + re.match("graph_([^_]+)_circle_([^_]+)_segments.tsv", path) + for path in detail_table_files + ] + ] + for event_id, detail_file in zip(event_ids, detail_table_files): + if event_id not in keep_event_ids: + continue + f = pd.read_csv(f"{folder}/{detail_file}", sep="\t") + if f.empty: + continue + res.append((group, event_id, f"{folder}/{detail_file}")) + return res diff --git a/workflow/rules/datavzrd.smk b/workflow/rules/datavzrd.smk index 3e3c9975..5c7dc5e7 100644 --- a/workflow/rules/datavzrd.smk +++ b/workflow/rules/datavzrd.smk @@ -1,17 +1,44 @@ +from pathlib import Path + + +# TODO: rethink overview_tables and detail_tables +# I am not really sure how to do this well, but I think we could have +# just one circle-centered and one segment-centered table each, with +# proper filtering (probably also in the linkouts from the overview plot_ +# and with current plot linkouts for graph and circle as collapsed / +# hidden plots right there in the table. We'll need to discuss. rule render_datavzrd_config: input: template=workflow.source_path( "../resources/datavzrd/circle-table-template.datavzrd.yaml" ), + summary_spec=workflow.source_path("../resources/datavzrd/summary_plot.json"), + categorized_overview_table="results/calling/tables/{group}/{group}_categorized_overview.tsv", + overview_tables=expand( + "results/calling/tables/{{group}}/{{group}}_overview.{category}.tsv", + category=CATEGORIES, + ), + detail_tables="results/calling/tables/{group}/{group}_details/", + circle_qc_plot_link_formatter=workflow.source_path( + "../scripts/circle_qc_plot_link_formatter.js" + ), + graph_link_formatter=workflow.source_path("../scripts/graph_link_formatter.js"), + gene_card_link_formatter=workflow.source_path( + "../scripts/gene_card_link_formatter.js" + ), output: - "resources/datavzrd/all.datavzrd.yaml", + "results/datavzrd/{group}.datavzrd.yaml", params: - groups=lambda wc: GROUPS, - calls=lambda wc: [ - f"results/calling/tables/{group}.sorted.annotated.csv" for group in GROUPS + categories=CATEGORIES, + overview_tables=lambda wc, input: [ + (file.split("_overview.")[1].replace(".tsv", ""), file) + for file in list(input.overview_tables) ], + detail_tables=lambda wc, input: get_detail_tables_group_circle_path_for_report( + wc, input + ), log: - "logs/datavzrd_render/all.log", + "logs/datavzrd_render/{group}.log", template_engine: "yte" @@ -19,62 +46,59 @@ rule render_datavzrd_config: rule copy_qc_plots_for_datavzrd: input: plots="results/calling/coverage_graphs/{group}", - report="results/datavzrd-report/all.fdr-controlled", + overview="results/calling/tables/{group}/{group}_overview.{category}.tsv", + report="results/datavzrd-report/{group}.fdr-controlled", output: - marker="results/tmp/{group}.qc_plots.marker", + marker="results/tmp/{group}.{category}.qc_plots.marker", params: output_dir=lambda wc: directory( - f"results/datavzrd-report/all.fdr-controlled/circles-{wc.group}/qc_plots" + f"results/datavzrd-report/{wc.group}.fdr-controlled/qc_plots" ), log: - "logs/datavzrd/copy_qc_plots/{group}.log", + "logs/datavzrd/copy_qc_plots/{group}.{category}.log", conda: - "../envs/bash.yaml" - shell: - """ - mkdir -p {params.output_dir} 2> {log} - for f in `find {input.plots} -regex '.*/graph_[0-9]+_[0-9]+\.html'`; do ( cp "$f" {params.output_dir} ); done - touch {output.marker} 2>> {log} - """ + "../envs/pandas.yaml" + script: + "../scripts/copy_qc_plots.py" rule copy_graph_plots_for_datavzrd: input: plots="results/calling/graphs/rendered/{group}", - report="results/datavzrd-report/all.fdr-controlled", + overview="results/calling/tables/{group}/{group}_overview.{category}.tsv", + report="results/datavzrd-report/{group}.fdr-controlled", output: - marker="results/tmp/{group}.graph_plots.marker", + marker="results/tmp/{group}.{category}.graph_plots.marker", params: output_dir=lambda wc: directory( - f"results/datavzrd-report/all.fdr-controlled/circles-{wc.group}/graphs" + f"results/datavzrd-report/{wc.group}.fdr-controlled/graphs" ), log: - "logs/datavzrd/copy_graph_plots/{group}.log", + "logs/datavzrd/copy_graph_plots/{group}.{category}.log", conda: - "../envs/bash.yaml" - shell: - """ - mkdir -p {params.output_dir} - for f in `find {input.plots} -regex '.*/graph_[0-9]+\.pdf'`; do ( cp "$f" {params.output_dir} ); done - touch {output.marker} - """ + "../envs/pandas.yaml" + script: + "../scripts/copy_graph_plots.py" rule datavzrd_circle_calls: input: - config="resources/datavzrd/all.datavzrd.yaml", - calls=expand( - "results/calling/tables/{group}.sorted.annotated.csv", group=GROUPS + config="results/datavzrd/{group}.datavzrd.yaml", + overview_tables=expand( + "results/calling/tables/{group}/{group}_overview.{category}.tsv", + category=CATEGORIES, + allow_missing=True, ), + detail_tables="results/calling/tables/{group}/{group}_details/", output: report( - directory("results/datavzrd-report/all.fdr-controlled"), + directory("results/datavzrd-report/{group}.fdr-controlled"), htmlindex="index.html", category="Circle calls", ), conda: "../envs/datavzrd.yaml" log: - "logs/datavzrd_report/all.log", + "logs/datavzrd_report/{group}.log", shell: "datavzrd {input.config} --output {output} &> {log}" diff --git a/workflow/rules/filter.smk b/workflow/rules/filter.smk index 098e0b30..f9d81cd4 100644 --- a/workflow/rules/filter.smk +++ b/workflow/rules/filter.smk @@ -1,23 +1,37 @@ -rule filter_calls: +rule filter_overview_table: input: - "results/calling/calls/annotated/{group}.bcf", + table="results/calling/tables/{group}/{group}_overview.tsv", output: - calls="results/calling/calls/filtered/{group}.bcf", - by_exons=temp("results/calling/calls/filtered/{group}.at_least_one_exon.bcf"), + coding="results/calling/tables/{group}/{group}_overview.coding.tsv", + regulatory="results/calling/tables/{group}/{group}_overview.regulatory.tsv", + intronic="results/calling/tables/{group}/{group}_overview.intronic.tsv", + other="results/calling/tables/{group}/{group}_overview.other.tsv", + categorized="results/calling/tables/{group}/{group}_categorized_overview.tsv", log: - "logs/filter-calls/{group}.log", + "logs/filter_overview_table/{group}.log", + conda: + "../envs/pandas.yaml" + script: + "../scripts/filter_overview_table.py" + + +rule filter_varlociraptor: + input: + calls="results/calling/calls/merged/{group}.bcf", + output: + fdr_calls="results/calling/calls/filtered_fdr/{group}.bcf", + log: + "logs/filter-varlociraptor/{group}.log", benchmark: - "benchmarks/filter-calls/{group}.txt" + "benchmarks/filter-varlociraptor/{group}.txt" conda: - "../envs/vembrane.yaml" + "../envs/varlociraptor.yaml" threads: 1 params: - fdr=config["calling"]["filter"]["fdr-control"].get("threshold", 0.05), + fdr=config["filter"]["fdr-control"].get("threshold", 0.05), mode=varlociraptor_filtering_mode, - filter_expression="True", # 'INFO["NUM_EXONS"] != 0', shell: """ - vembrane filter '{params.filter_expression}' {input} -O bcf > {output.by_exons} 2> {log} - varlociraptor filter-calls control-fdr {params.mode} --events PRESENT --var BND --fdr {params.fdr} {output.by_exons} | varlociraptor decode-phred | bcftools sort -m 4G -Ob > {output.calls} 2>> {log} + varlociraptor filter-calls control-fdr --mode {params.mode} --events PRESENT --var BND --fdr {params.fdr} {input.calls} | varlociraptor decode-phred | bcftools sort -m 4G -Ob > {output.fdr_calls} 2> {log} """ diff --git a/workflow/rules/map.smk b/workflow/rules/map.smk index bbefd76c..8fce0193 100644 --- a/workflow/rules/map.smk +++ b/workflow/rules/map.smk @@ -1,7 +1,6 @@ - rule minimap2_bam: input: - target=f"results/calling/index/{REFERENCE}.mmi", # can be either genome index or genome fasta + target=rules.minimap2_index.output.index, # can be either genome index or genome fasta query=get_minimap2_input, output: "results/calling/mapping/{sample}.bam", @@ -12,27 +11,10 @@ rule minimap2_bam: params: extra=get_minimap2_mapping_params, # optional sorting="coordinate", # optional: Enable sorting. Possible values: 'none', 'queryname' or 'coordinate' - sort_extra=lambda wc: f"-@ {workflow.cores * 0.5}", # optional: extra arguments for samtools/picard - threads: workflow.cores * 0.5 - wrapper: - "v1.0.0/bio/minimap2/aligner" - - -rule minimap2_index: - input: - target=config["calling"]["reference"]["path"], - output: - f"results/calling/index/{REFERENCE}.mmi", - log: - f"logs/minimap2_index/{REFERENCE}.log", - benchmark: - f"benchmarks/minimap2_index/{REFERENCE}.txt" - params: - extra="", # optional additional args - cache: True - threads: workflow.cores + sort_extra=lambda wc, threads: f"-@ {min(threads, 4)}", # optional: extra arguments for samtools/picard + threads: workflow.cores // 2 wrapper: - "v1.0.0/bio/minimap2/index" + "v1.25.0/bio/minimap2/aligner" rule merge_fastqs: @@ -68,7 +50,7 @@ rule samtools_index: # Samtools takes additional threads through its option -@ threads: 12 # This value - 1 will be sent to -@ wrapper: - "v1.0.0/bio/samtools/index" + "v1.25.0/bio/samtools/index" rule samtools_faidx: @@ -85,4 +67,4 @@ rule samtools_faidx: # Samtools takes additional threads through its option -@ threads: 12 # This value - 1 will be sent to -@ wrapper: - "v1.0.0/bio/samtools/faidx" + "v1.25.0/bio/samtools/faidx" diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk new file mode 100644 index 00000000..752802db --- /dev/null +++ b/workflow/rules/ref.smk @@ -0,0 +1,106 @@ +rule get_genome: + output: + genome=expand( + "resources/{species}.{build}.{release}.fasta", + species=config["reference"]["species"], + build=config["reference"]["build"], + release=config["reference"]["release"], + ), + log: + "logs/get-genome.log", + params: + species=config["reference"]["species"], + datatype="dna", + build=config["reference"]["build"], + release=config["reference"]["release"], + cache: "omit-software" # save space and time with between workflow caching (see docs) + wrapper: + "v1.25.0/bio/reference/ensembl-sequence" + + +rule genome_faidx: + input: + rules.get_genome.output.genome, + output: + index=expand( + "{genome}.fai", + genome=rules.get_genome.output.genome, + ), + log: + "logs/genome-faidx.log", + cache: True + wrapper: + "v1.25.0/bio/samtools/faidx" + + +rule minimap2_index: + input: + target=rules.get_genome.output.genome, + output: + index=expand( + "resources/{species}.{build}.{release}.mmi", + species=config["reference"]["species"], + build=config["reference"]["build"], + release=config["reference"]["release"], + ), + log: + "logs/minimap2_index/genome.log", + benchmark: + "benchmarks/minimap2_index/genome.txt" + params: + extra="", # optional additional args + cache: True + # Minimap2 uses at most three threads when indexing target sequences: + # https://lh3.github.io/minimap2/minimap2.html + threads: 3 + wrapper: + "v1.25.0/bio/minimap2/index" + + +# TODO: create new ENSEMBL-REGULATORY-ANNOTATION snakemake wrapper +rule download_regulatory_annotation: + output: + "resources/regulatory_annotation.gff3.gz", + log: + "logs/download_regulatory_annotation.log", + params: + release=config["reference"].get("release", "107"), + benchmark: + "benchmarks/download_regulatory_annotation.txt" + cache: "omit-software" # save space and time with between workflow caching (see docs) + conda: + "../envs/wget.yaml" + shell: + """wget https://ftp.ensembl.org/pub/release-{params.release}/regulation/homo_sapiens/homo_sapiens.GRCh38.Regulatory_Build.regulatory_features.20220201.gff.gz --no-check-certificate -O {output} 2> {log}""" + + +rule download_repeatmasker_annotation: + output: + "resources/repeat_masker.fa.out.gz", + log: + "logs/download_repeatmasker_annotation.log", + params: + download_link=config["reference"].get("repeat_masker_download_link", ""), + benchmark: + "benchmarks/download_repeatmasker_annotation.txt" + cache: "omit-software" # save space and time with between workflow caching (see docs) + conda: + "../envs/wget.yaml" + shell: + """wget {params.download_link} --no-check-certificate -O {output} 2> {log}""" + + +rule download_gene_annotation: + output: + "resources/gene_annotation.gff3.gz", + params: + species=config["reference"]["species"], + build=config["reference"]["build"], + release=config["reference"]["release"], + flavor="", # optional, e.g. chr_patch_hapl_scaff, see Ensembl FTP. + branch="", # optional: specify branch + log: + "logs/download_gene_annotation.log", + cache: "omit-software" # save space and time with between workflow caching (see docs) + wrapper: + "v1.25.0/bio/reference/ensembl-annotation" diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk deleted file mode 100644 index 83897faa..00000000 --- a/workflow/rules/report.smk +++ /dev/null @@ -1,25 +0,0 @@ - -rule csv_report: - input: - csv="results/calling/tables/{group}.sorted.annotated.csv", - plots="results/calling/coverage_graphs/{group}", - output: - report( - directory("results/calling/report/tables/{group}"), - category="Circle calls", - htmlindex="index.html", - ), - params: - extra="--formatter resources/report-table-formatter.js", - log: - "logs/rbt-csv-report/{group}.log", - benchmark: - "benchmarks/rbt-csv-report/{group}.txt" - conda: - "../envs/rbt.yaml" - shell: - """ - mkdir -p {output}/qc_plots - for f in {input.plots}/*.html; do ( cp "$f" {output}/qc_plots/ ); done - rbt csv-report {input.csv} {output} {params.extra} &> {log} - """ diff --git a/workflow/rules/table.smk b/workflow/rules/table.smk deleted file mode 100644 index 846b4c04..00000000 --- a/workflow/rules/table.smk +++ /dev/null @@ -1,69 +0,0 @@ -from pathlib import Path - - -rule annotate_breakpoint_sequence: - input: - csv="results/calling/tables/{group}.sorted.csv", - reference=config["calling"]["reference"]["path"], - output: - csv="results/calling/tables/{group}.sorted.annotated.csv", - log: - "logs/sort_and_tidy_tsv/{group}.log", - benchmark: - "benchmarks/sort_and_tidy_tsv/{group}.txt" - threads: 8 - conda: - "../envs/table.yaml" - script: - "../scripts/annotate_breakpoint_sequence.py" - - -rule sort_and_tidy_tsv: - input: - tsv="results/calling/tables/{group}.tsv", - output: - csv="results/calling/tables/{group}.sorted.csv", - log: - "logs/sort_and_tidy_tsv/{group}.log", - benchmark: - "benchmarks/sort_and_tidy_tsv/{group}.txt" - conda: - "../envs/table.yaml" - script: - "../scripts/sort_and_tidy_tsv.py" - - -rule vembrane_table: - input: - vcf="results/calling/calls/filtered/{group}.bcf", - output: - tsv="results/calling/tables/{group}.tsv", - threads: 1 - params: - expression=", ".join( - [ - 'INFO["EVENT"]', - "ID", - "CHROM", - "POS", - "ALT", - 'INFO["CircleLength"]', - 'INFO["CircleSegmentCount"]', - 'INFO["SplitReads"]', - 'INFO["NUM_EXONS"]', - '";".join(INFO["GENES"] or [])', - 'for_each_sample(lambda s: FORMAT["AF"][s])', - 'INFO["PROB_PRESENT"]', - 'INFO["PROB_ABSENT"]', - 'INFO["PROB_ARTIFACT"]', - ] - ), - extra=lambda wc: "--header \"EVENT, ID, CHROM, POS, ALT, circle_length, num_segments, split_reads, NUM_EXONS, GENES, for_each_sample(lambda s: 'AF_' + s), PROB_PRESENT, PROB_ABSENT, PROB_ARTIFACT\"", - log: - "logs/vembrane_table/{group}.log", - benchmark: - "benchmarks/vembrane_table/{group}.txt" - conda: - "../envs/vembrane.yaml" - shell: - """vembrane table {params.extra} '{params.expression}' {input.vcf} > {output.tsv}""" diff --git a/workflow/rules/utils.smk b/workflow/rules/utils.smk deleted file mode 100644 index 422575e8..00000000 --- a/workflow/rules/utils.smk +++ /dev/null @@ -1,25 +0,0 @@ -rule get_genome: - output: - "resources/genome.fasta", - log: - "logs/get-genome.log", - params: - species=config["calling"]["reference"]["species"], - datatype="dna", - build=config["calling"]["reference"]["build"], - release=config["calling"]["reference"]["release"], - cache: True - wrapper: - "v1.0.0/bio/reference/ensembl-sequence" - - -rule genome_faidx: - input: - f"resources/{config['calling']['reference']['name']}.fasta", - output: - f"resources/{config['calling']['reference']['name']}.fasta.fai", - log: - "logs/genome-faidx.log", - cache: True - wrapper: - "v1.0.0/bio/samtools/faidx" diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index 9832ff1d..cd6dd73c 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -5,10 +5,6 @@ description: snakemake configuration file type: object definitions: - filterentry: - type: object - additionalProperties: - type: string evententry: type: object properties: @@ -25,32 +21,26 @@ definitions: properties: - calling: + samples: + type: string + units: + type: string + reference: type: object properties: - samples: + species: + type: string + release: + type: integer + build: type: string - reference: - type: object - properties: - name: - type: string - path: - type: string - n_chromosomes: - type: integer - species: - type: string - release: - type: integer - build: - type: string - required: - - name - - species - - release - - build - - n_chromosomes + required: + - species + - release + - build + cyrcular: + type: object + properties: min_read_depth: type: number default: 2 @@ -67,26 +57,30 @@ properties: type: number default: 10000 minimum: 1 - filter: - fdr-control: - type: object - properties: - threshold: - type: number - minimum: 0.0 - maximum: 1.0 - local: - type: boolean - events: - $ref: "#/definitions/evententry" - description: "a map of pairs" - required: - - threshold - - events required: - - samples - - reference - - filter - + - min_read_depth + - min_split_reads + - max_paths_per_component + - max_deletion_length + filter: + fdr-control: + type: object + properties: + threshold: + type: number + minimum: 0.0 + maximum: 1.0 + local: + type: boolean + events: + $ref: "#/definitions/evententry" + description: "a map of pairs" + required: + - threshold + - events required: - - calling \ No newline at end of file + - samples + - units + - reference + - cyrcular + - filter diff --git a/workflow/schemas/samples.schema.yaml b/workflow/schemas/samples.schema.yaml index 06496e56..e3a15f87 100644 --- a/workflow/schemas/samples.schema.yaml +++ b/workflow/schemas/samples.schema.yaml @@ -1,4 +1,4 @@ -$schema: "http://json-schema.org/draft-04/schema#" +$schema: "http://json-schema.org/draft-07/schema#" description: an entry in the sample sheet properties: @@ -6,7 +6,7 @@ properties: type: string description: group name/identifier (alphanumeric string, that may additionally contain '_' and '-'). One sample can consist of multiple units. pattern: "^[a-zA-Z_0-9-]+$" - sample: + sample_name: type: string description: sample name/identifier (alphanumeric string, that may additionally contain '_' and '-') pattern: "^[a-zA-Z_0-9-]+$" @@ -20,5 +20,5 @@ properties: required: - group - - sample - - platform \ No newline at end of file + - sample_name + - platform diff --git a/workflow/schemas/units.schema.yaml b/workflow/schemas/units.schema.yaml index 660e0ede..37a08c58 100644 --- a/workflow/schemas/units.schema.yaml +++ b/workflow/schemas/units.schema.yaml @@ -1,13 +1,13 @@ -$schema: "http://json-schema.org/draft-04/schema#" +$schema: "http://json-schema.org/draft-07/schema#" description: row of the units.tsv, representing a sequencing unit, i.e. single-end or paired-end data type: object properties: - sample: + sample_name: type: string pattern: "^[a-zA-Z_0-9-]+$" description: sample name/id the unit has been sequenced from (alphanumeric string, that may additionally contain '_' and '-') - unit: + unit_name: type: string pattern: "^[a-zA-Z_0-9-]+$" description: unit id (alphanumeric string, that may additionally contain '_' and '-') @@ -20,6 +20,6 @@ properties: description: path to second FASTQ file (leave empty in case of single-end) required: - - sample - - unit - - fq1 \ No newline at end of file + - sample_name + - unit_name + - fq1 diff --git a/workflow/scripts/annotate_breakpoint_sequence.py b/workflow/scripts/annotate_breakpoint_sequence.py deleted file mode 100644 index edfd1cb7..00000000 --- a/workflow/scripts/annotate_breakpoint_sequence.py +++ /dev/null @@ -1,72 +0,0 @@ -import pandas as pd -from pandarallel import pandarallel -from dinopy import FastaReader - -pandarallel.initialize(nb_workers=snakemake.threads, progress_bar=True) - - -def sort_table(df: pd.DataFrame): - ordered = ( - df.groupby(["EVENT"])[["PROB_PRESENT", "length", "split_reads"]] - .max() - .sort_values(["PROB_PRESENT", "length", "split_reads"], ascending=False) - .index - ) - categories = list(ordered) - df["EVENT"] = pd.Categorical(df["EVENT"], categories=categories) - df.sort_values("EVENT", inplace=True) - return df - - -def main(snakemake): - df = pd.read_csv(snakemake.input.csv) - if df.empty: - df["breakpoint_seq"] = None - df.to_csv(snakemake.output.csv, index=False, sep=",") - exit(0) - - far = FastaReader(snakemake.input.reference, write_fai=True) - - # TODO: needs dir information as well. - def get_seq(chrom1, start, chrom2, stop, direction, up_down=1000): - if direction == "from": - chrom_to, pos_to = chrom1, start - 1 - chrom_from, pos_from = chrom2, stop - 1 - elif direction == "to": - chrom_to, pos_to = chrom2, stop - 1 - chrom_from, pos_from = chrom1, start - 1 - else: - print("foo") - raise ValueError("unknown direction") - - entry_from = next(far[chrom_from]) - entry_to = next(far[chrom_to]) - chrom_from_len = entry_from.length - chrom_to_len = entry_to.length - - from_start = max(0, pos_from - up_down) - from_stop = pos_from - - to_start = pos_to - to_stop = min(pos_to + up_down, chrom_to_len - 1) - - seq_from = entry_from.sequence[from_start:from_stop].decode() - seq_to = entry_to.sequence[to_start:to_stop].decode() - return seq_from + seq_to - - df["breakpoint_seq"] = df.parallel_apply( - lambda row: get_seq( - row["CHROM"], - row["start"], - row["OTHER_CHROM"], - row["stop"], - row["direction"], - ), - axis=1, - ) - df = sort_table(df) - df.to_csv(snakemake.output.csv, index=False, sep=",") - - -if __name__ == "__main__": - main(snakemake) diff --git a/workflow/scripts/annotate_breakpoint_sequence.rs b/workflow/scripts/annotate_breakpoint_sequence.rs deleted file mode 100644 index e8ac9e09..00000000 --- a/workflow/scripts/annotate_breakpoint_sequence.rs +++ /dev/null @@ -1,21 +0,0 @@ -//! This is a regular crate doc comment, but it also contains a partial -//! Cargo manifest. Note the use of a *fenced* code block, and the -//! `cargo` "language". -//! -//! ```cargo -//! [dependencies] -//! csv = "1.1" -//! rust-bio = "*" -//! ``` - - -use anyhow::Result; - -fn main() -> Result<()> { - let _stdout_redirect = snakemake.redirect_stdout(snakemake.log.stdout)?; - println!("This will be written to path/to/stdout.log"); - - // redirect stderr to "path/to/stderr.log" - let _stderr_redirect = snakemake.redirect_stderr(snakemake.log.stderr)?; - Ok(()) -} \ No newline at end of file diff --git a/workflow/scripts/circle_qc_plot_link_formatter.js b/workflow/scripts/circle_qc_plot_link_formatter.js new file mode 100644 index 00000000..efd1cb44 --- /dev/null +++ b/workflow/scripts/circle_qc_plot_link_formatter.js @@ -0,0 +1,6 @@ +function circle_qc_plot_link_formatter(value, row) { + let graph_id = row["graph_id"] + let circle_id = row["circle_id"] + let link = `${value}`; + return link; +} diff --git a/workflow/scripts/copy_graph_plots.py b/workflow/scripts/copy_graph_plots.py new file mode 100644 index 00000000..8151438c --- /dev/null +++ b/workflow/scripts/copy_graph_plots.py @@ -0,0 +1,17 @@ +from contextlib import redirect_stderr + +with open(snakemake.log[0], "w") as logfile: + with redirect_stderr(logfile): + import os + import shutil + import pandas as pd + from pathlib import Path + + os.makedirs(snakemake.params.output_dir, exist_ok=True) + overview = pd.read_csv(snakemake.input.overview, sep="\t") + graph_ids = set(overview["graph_id"]) + for graph_id in graph_ids: + shutil.copy( + f"{snakemake.input.plots}/graph_{graph_id}.pdf", snakemake.params.output_dir + ) + Path(snakemake.output.marker).touch() diff --git a/workflow/scripts/copy_qc_plots.py b/workflow/scripts/copy_qc_plots.py new file mode 100644 index 00000000..9ea74325 --- /dev/null +++ b/workflow/scripts/copy_qc_plots.py @@ -0,0 +1,18 @@ +from contextlib import redirect_stderr + +with open(snakemake.log[0], "w") as logfile: + with redirect_stderr(logfile): + import os + import shutil + import pandas as pd + from pathlib import Path + + os.makedirs(snakemake.params.output_dir, exist_ok=True) + overview = pd.read_csv(snakemake.input.overview, sep="\t") + event_ids = {s.replace("-", "_") for s in overview["event_id"]} + for event_id in event_ids: + shutil.copy( + f"{snakemake.input.plots}/graph_{event_id}.html", + snakemake.params.output_dir, + ) + Path(snakemake.output.marker).touch() diff --git a/workflow/scripts/filter_overview_table.py b/workflow/scripts/filter_overview_table.py new file mode 100644 index 00000000..83f2faf0 --- /dev/null +++ b/workflow/scripts/filter_overview_table.py @@ -0,0 +1,24 @@ +from contextlib import redirect_stderr + +with open(snakemake.log[0], "w") as logfile: + with redirect_stderr(logfile): + import pandas as pd + + df = pd.read_csv(snakemake.input.table, sep="\t") + df.loc[:, ["gene_names", "gene_ids", "regulatory_features"]] = df.loc[ + :, ["gene_names", "gene_ids", "regulatory_features"] + ].fillna("") + df["category"] = df[["num_exons", "regulatory_features", "gene_names"]].apply( + lambda r: "coding" + if r["num_exons"] > 0 + else ( + "regulatory" + if r["regulatory_features"] + else ("intronic" if r["gene_names"] else "other") + ), + axis=1, + ) + for kind in ["coding", "regulatory", "intronic", "other"]: + part = df.query(f"category == '{kind}'") + part.to_csv(getattr(snakemake.output, kind), sep="\t", index=False) + df.to_csv(snakemake.output.categorized, sep="\t", index=False) diff --git a/workflow/scripts/gene_card_link_formatter.js b/workflow/scripts/gene_card_link_formatter.js new file mode 100644 index 00000000..5887acde --- /dev/null +++ b/workflow/scripts/gene_card_link_formatter.js @@ -0,0 +1,15 @@ +function gene_card_link_formatter(value, row) { + $(function () { + $('[data-toggle="tooltip"]').tooltip() + }); + let genes = value.split(","); + let rows = ""; + for (let g of genes) { + rows = `${rows}${g}`; + }; + let table = `
${rows}
Genes
`; + let button = ` + + `; + return button; +} diff --git a/workflow/scripts/gff_annotate.py b/workflow/scripts/gff_annotate.py deleted file mode 100644 index a17397ef..00000000 --- a/workflow/scripts/gff_annotate.py +++ /dev/null @@ -1,69 +0,0 @@ -from typing import Tuple - -from pysam import ( - VariantFile, - TabixFile, - asGFF3, - VariantRecord, -) -import re - - -def parse_attribute(s: str) -> Tuple[str, str]: - t = s.split("=", maxsplit=1) - return t[0], t[1] - - -BND_RE = re.compile(r""".*([]\[])((?P.+):(?P[0-9]+))([]\[])?.*""") - - -def parse_bnd_alt(s: str) -> Tuple[str, int]: - return BND_RE.search(s)["seqname"], int(BND_RE.search(s)["position"]) - - -gff_in: TabixFile = TabixFile(snakemake.input["annotation"], parser=asGFF3()) - -with VariantFile(snakemake.input["variants"]) as bcf_in: - header = bcf_in.header - info_genes = r"""##INFO=""" - header.add_line(info_genes) - info_exons = r"""##INFO=""" - header.add_line(info_exons) - - with VariantFile(snakemake.output["variants"], "w", header=header) as bcf_out: - for record in bcf_in: - record: VariantRecord = record - start, end = record.start, record.stop - if "BND" in record.info["SVTYPE"]: - mate_chrom, mate_pos = parse_bnd_alt(record.alts[0]) - if mate_chrom == record.chrom: - end = mate_pos - start, end = min(start, end), max(start, end) - num_exons = 0 - try: - genes = set() - exons = set() - try: - for annot in gff_in.fetch(f"chr{record.chrom}", start, end): - attributes = dict(map(parse_attribute, annot.attributes.split(";"))) - genes.add(attributes["gene_name"]) - if "exon_number" in attributes and ( - annot.start >= start and annot.end <= end - ): - exons.add(attributes["exon_number"]) - except ValueError: - for annot in gff_in.fetch(f"{record.chrom}", start, end): - attributes = dict(map(parse_attribute, annot.attributes.split(";"))) - genes.add(attributes["gene_name"]) - if "exon_number" in attributes and ( - annot.start >= start and annot.end <= end - ): - exons.add(attributes["exon_number"]) - if len(genes) > 0: - record.info["GENES"] = list(genes) - record.info["NUM_EXONS"] = len(set(exons)) - except ValueError: - # Usually means: no annotations for a specific region - pass - finally: - bcf_out.write(record) diff --git a/workflow/scripts/graph_link_formatter.js b/workflow/scripts/graph_link_formatter.js new file mode 100644 index 00000000..07c1d955 --- /dev/null +++ b/workflow/scripts/graph_link_formatter.js @@ -0,0 +1,5 @@ +function graph_link_formatter(value, row) { + let graph_id = value; + let link = `${value}`; + return link; +} diff --git a/workflow/scripts/sort_and_tidy_tsv.py b/workflow/scripts/sort_and_tidy_tsv.py deleted file mode 100644 index e5fd05c6..00000000 --- a/workflow/scripts/sort_and_tidy_tsv.py +++ /dev/null @@ -1,113 +0,0 @@ -import pandas as pd -from math import ceil -import re - - -BND_RE = re.compile(r""".*([]\[])((?P.+):(?P[0-9]+))([]\[])?.*""") - - -def parse_bnd_alt(s: str): - return BND_RE.search(s)["seqname"], int(BND_RE.search(s)["position"]) - - -def sort_table(df: pd.DataFrame): - ordered = ( - df.groupby(["EVENT"])[["PROB_PRESENT", "length", "split_reads"]] - .max() - .sort_values(["PROB_PRESENT", "length", "split_reads"], ascending=False) - .index - ) - categories = list(ordered) - df["EVENT"] = pd.Categorical(df["EVENT"], categories=categories) - df.sort_values("EVENT", inplace=True) - return df - - -def main(snakemake): - df = pd.read_csv(snakemake.input.tsv, sep="\t") - columns = [ - "graph", - "EVENT", - "CHROM", - "start", - "OTHER_CHROM", - "stop", - "ALT", - "direction", - "circle_length", - "num_segments", - "split_reads", - "AF_nanopore", - "PROB_PRESENT", - "PROB_ABSENT", - "PROB_ARTIFACT", - "length", - "NUM_EXONS", - "GENES", - ] - # if there are no entries, make sure an empty table with the correct header is written anyways - if df.empty: - df = pd.DataFrame(columns=columns) - df.to_csv(snakemake.output.csv, index=False) - return - - # the list of genes may be too long for excel spreadsheets, hence we split the column into multiple ones - # from https://stackoverflow.com/questions/58645152/splitting-a-long-string-in-pandas-cell-near-the-n-th-character-position-into-mul - limit = 32767 - 1 - sep = ";" - longest = df["GENES"].apply(lambda s: len(s) if isinstance(s, str) else 0).max() - num_seps = ( - df["GENES"].apply(lambda s: s.count(sep) if isinstance(s, str) else 0).max() - ) - num_cols = max(1, ceil(longest / (limit - num_seps))) - - for index, row in df.iterrows(): - gene_names = row["GENES"] - num_chars = len(gene_names) if isinstance(gene_names, str) else 0 - if num_chars == 0: - continue - genes = gene_names.split(sep) - num_genes = len(genes) - col_index = ceil(len(genes) / num_cols) - - for _ in range(num_cols - 1): - genes.insert(col_index, "|") - col_index += col_index - new = sep.join(genes) - df.at[index, "GENES"] = new - - df[["OTHER_CHROM", "stop"]] = pd.DataFrame( - df["ALT"].apply(parse_bnd_alt).tolist(), index=df.index - ) - df["direction"] = df["ALT"].apply( - lambda a: "to" if "[" in a else ("from" if "]" in a else "*") - ) - df.rename(columns=dict(POS="start"), inplace=True) - - df["length"] = df["circle_length"] - df.query("CHROM != OTHER_CHROM")["length"] = float("nan") - df.drop(columns=["ID"], inplace=True) - - new_cols = [f"GENES_{i + 1}" for i in range(num_cols)] - df[new_cols] = df["GENES"].astype(dtype=str).str.split("|", expand=True) - df[new_cols] = df[new_cols].apply(lambda s: s.str.strip(sep)) - del df["GENES"] - df.rename(columns=dict(GENES_1="GENES"), inplace=True) - gene_cols = [c for c in df.columns if c.startswith("GENES")] - - df = sort_table(df) - - df.drop_duplicates( - subset=["EVENT", "CHROM", "start", "OTHER_CHROM", "stop"], inplace=True - ) - - df["graph"] = [re.sub(r"_circle_.*", "", d) for d in df["EVENT"]] - - # reorder columns - df = df[columns[:-1] + gene_cols] - - df.to_csv(snakemake.output.csv, index=False, sep=",") - - -if __name__ == "__main__": - main(snakemake) diff --git a/workflow/scripts/sort_bcf_header.py b/workflow/scripts/sort_bcf_header.py new file mode 100644 index 00000000..72bdff07 --- /dev/null +++ b/workflow/scripts/sort_bcf_header.py @@ -0,0 +1,40 @@ +with open(snakemake.input.header, "rt") as header_file: + header = [l.strip() for l in header_file.readlines()] + file_format_line = header[0] + chrom_line = header[-1] + other_lines = header[1:-1] + kinds = [ + "fileDate", + "source", + "reference", + "contig", + "phasing", + "FILTER", + "INFO", + "FORMAT", + "ALT", + "assembly", + "META", + "SAMPLE", + "PEDIGREE", + "pedigreeDB", + ] + categories = {kind: [] for kind in kinds} + others = [] + for line in other_lines: + if "=" in line: + kind = line.split("=")[0].lstrip("#") + group = categories.get(kind, others) + else: + group = others + group.append(line) + + with open(snakemake.output.sorted_header, "wt") as out: + print(file_format_line, file=out) + for kind in kinds: + lines = categories[kind] + for line in lines: + print(line, file=out) + for line in others: + print(line, file=out) + print(chrom_line, file=out) diff --git a/workflow/scripts/tsv_to_xlsx.py b/workflow/scripts/tsv_to_xlsx.py deleted file mode 100644 index dec56efa..00000000 --- a/workflow/scripts/tsv_to_xlsx.py +++ /dev/null @@ -1,7 +0,0 @@ -import sys -import pandas as pd - -sys.stderr = open(snakemake.log[0], "w") - -data = pd.read_csv(snakemake.input.tsv, sep="\t") -data.to_excel(snakemake.output.xlsx, index=False)