let user provide their annotations

tdayris · Dec 1, 2023 · e256f74 · e256f74
1 parent 6056c4a
commit e256f74
Show file tree

Hide file tree

Showing 6 changed files with 81 additions and 11 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,10 @@
+# 2.2.3
+
+## Features:
+
+* Conditionally load fair_genome_indexer
+* Let user provide annotation files
+
 # 2.2.2
 
 ## Features:

diff --git a/config/README.md b/config/README.md
@@ -30,16 +30,16 @@ params:
     metrics: ""
 ```
 
-# `genomes.csv`
+# `samples.csv`
 
 A CSV-formatted text file containing the following columns (mandatory unless stated otherwise):
 
-* sample_id: Unique name of the sample
-* upstream_file: Path to upstream fastq file
-* species: The species name, according to Ensembl standards
-* build: The corresponding genome build, according to Ensembl standards
-* release: The corresponding genome release, according to Ensembl standards
-* downstream_file: Optional path to downstream fastq file
+* `sample_id`: Unique name of the sample
+* `upstream_file`: Path to upstream fastq file
+* `species`: The species name, according to Ensembl standards
+* `build`: The corresponding genome build, according to Ensembl standards
+* `release`: The corresponding genome release, according to Ensembl standards
+* `downstream_file`: Optional path to downstream fastq file
 
 Example:
 
@@ -48,6 +48,37 @@ sample_id,upstream_file,downstream_file,species,build,release
 sac_a,data/reads/a.scerevisiae.1.fq,data/reads/a.scerevisiae.2.fq,saccharomyces_cerevisiae,R64-1-1,110
 ```
 
+While `CSV` format is tested and recommended, this workflow uses Python's
+`csv.Sniffer()` to detect the column separator. Tabs and semicolons are
+also accepted as field separators. Remember that only the comma separator is
+tested.
+
+# `genomes.csv`
+
+This file is fully optional. When missing, the genome sequences
+will be downloaded from Ensembl and indexed.
+
+A CSV-formatted text file containing the following mandatory columns:
+
+* `species`: The species name, according to Ensembl standards
+* `build`: The corresponding genome build, according to Ensembl standards
+* `release`: The corresponding genome release, according to Ensembl standards
+
+The following columns are optional and are used to avoid downloading genomes:
+
+* `fasta`: Path to the reference genome sequence (FASTA formatted)
+* `fasta_index`: Path to the reference genome sequence index (FAI formatted)
+* `bowtie2_index`: Path to the main directory containing the Bowtie2 reference index files
+
+Example:
+
+```
+species,build,release,fasta,fasta_index,bowtie2_index
+homo_sapiens,GRCh38,110,/path/to/sequence.fasta,/path/to/sequence.fasta.fai,/path/to/bowtie2_sequence/
+mus_musculus,GRCm38,99,,,
+mus_musculus,GRCm39,110,,,
+```
+
 While `CSV` format is tested and recommended, this workflow uses Python's
 `csv.Sniffer()` to detect the column separator. Tabs and semicolons are
 also accepted as field separators. Remember that only the comma separator is

diff --git a/config/config.yaml b/config/config.yaml
@@ -14,4 +14,12 @@ samples: config/samples.csv
 #     markdup: "--remove-duplicates"
 #   picard:
 #     # Mapping QC optional parameters
-#     metrics: ""
+#     metrics: ""
+
+# Optional path to a `genomes.csv` file
+# genomes: genomes.csv
+
+# Internal use only, not described in documentation.
+# Set to `false` to deactivate the import of the fair_genome_indexer pipeline.
+# Deactivating it requires the `genomes.csv` file to be filled in.
+# load_fair_genome_indexer: true
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -1,6 +1,12 @@
 include: "rules/common.smk"
 include: "rules/fastp.smk"
-include: "rules/fair_genome_indexer_pipeline.smk"
+
+
+if config.get("load_fair_genome_indexer", True):
+
+    include: "rules/fair_genome_indexer_pipeline.smk"
+
+
 include: "rules/bowtie2_sambamba_meta.smk"
 include: "rules/picard_metrics.smk"
 include: "rules/multiqc.smk"

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -3,6 +3,7 @@ import pandas
 import snakemake
 import snakemake.utils
 
+from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
 snakemake.utils.min_version("7.29.0")
@@ -110,6 +111,7 @@ def get_bowtie2_alignment_input(
     wildcards: snakemake.io.Wildcards,
     samples: pandas.DataFrame = samples,
     config: Dict[str, Any] = config,
+    genome: pandas.DataFrame = genomes,
 ) -> Dict[str, Union[Dict[str, str], str]]:
     """
     Return expected input files for Bowtie2 mapping, according to user-input,
@@ -131,8 +133,17 @@ def get_bowtie2_alignment_input(
     release: str = str(sample_data["release"])
     datatype: str = "dna"
 
-    idx: Optional[str] = config.get("resources", {}).get("bowtie2_index")
-    if not idx:
+    idx: Optional[str] = (
+        genomes.loc[
+            (genomes["species"] == species)
+            & (genomes["build"] == build)
+            & (genomes["release"] == release)
+        ]
+        .to_dict(orient="index")[0]
+        .get("bowtie2_index")
+    )
+
+    if idx is None or idx == "":
         idx = multiext(
             f"reference/{species}.{build}.{release}.{datatype}",
             ".1.bt2",
@@ -142,6 +153,8 @@ def get_bowtie2_alignment_input(
             ".rev.1.bt2",
             ".rev.2.bt2",
         )
+    else:
+        idx = [str(file) for file in Path(idx) if str(file).endswith(".bt2")]
 
     results: Dict[str, List[str]] = {
         "idx": idx,

diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml
@@ -10,6 +10,11 @@ genomes:
   description: Path to the CSV file defining genome properties
 
 
+load_fair_genome_indexer:
+  type: boolean
+  description: Load (or not) the fair_genome_indexer pipeline
+
+
 samples:
   type: string
   description: Path to the CSV file defining samples and their corresponding fastq paths