From 28dde621030813d8da67a6a767b81bb009b8bdb2 Mon Sep 17 00:00:00 2001 From: Kwee Boon Brandon Seah Date: Mon, 2 Jan 2023 15:51:17 +0100 Subject: [PATCH] Update Readme Document where to find pipelines to produce input files. --- .gitignore | 3 +++ README.md | 45 ++++++++++++++++++++++---------------------- run_snakemake.sh | 6 +++--- workflow/config.yaml | 8 ++++---- 4 files changed, 32 insertions(+), 30 deletions(-) diff --git a/.gitignore b/.gitignore index 2a54c93..a313c51 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ data/* tmp/* envs/* +logs/* +.snakemake +falcon-comb* diff --git a/README.md b/README.md index a72128e..7220bc2 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,31 @@ -Template workflow folder for Snakemake pipeline -=============================================== +Pogigwasc gene prediction of Loxodes magnus genome +=================================================== -After cloning this repository, you should change the name of the folder as -appropriate, and update the remote URL of the repository to a new one for your -project. +Snakemake pipeline for gene prediction for Loxodes magnus, which has a genetic +code with context-dependent stop codons. Introns are first empirically +predicted with [Intronarrator](https://github.com/Swart-lab/Intronarrator) and +artificially removed to produce an "intronless" assembly, to run +[Pogigwasc](https://github.com/Swart-lab/pogigwasc) in `--no-intron` mode. This +is because the short lengths and unusual length distribution of introns in +Loxodes are difficult to model with the GHMM in Pogigwasc. +Data +---- -Suggested setup ---------------- +Pipeline and scripts to generate the genome assembly are available from +[loxodes-assembly-workflow](https://github.com/Swart-lab/loxodes-assembly-workflow) +repository. Pipeline for the "intronless" assembly is available from +[loxodes-intronarrator-workflow](https://github.com/Swart-lab/loxodes-intronarrator-workflow). 
-```bash -git clone git@github.com:Swart-lab/snakemake-template.git -mv snakemake-template my-project # rename project folder -cd my-project -mkdir data # folder to put project data, gitignored -mkdir envs # folder for Conda envs produced by workflow, gitignored -mkdir tmp # folder for temp files, gitignored -mkdir nb # folder for computational notebooks etc. -git remote remove origin # remove template repo as a remote -``` +This current pipeline was used for annotation of the MAC and MIC genomes; path +to reference assembly and names of output files were modified accordingly. -Edit the files `run_snakemake.sh` and/or `run_snakemake_sge.sh` to include -absolute paths to the working folder and to a Conda environment with Snakemake, -and modify other settings (e.g. max number of CPUs) as required. +Paths to input files in the `workflow/config.yaml` file are local paths used in +the original data analysis. When re-running the pipeline, replace these with +the actual paths on your system. -Snakemake rules and config files are in the `workflow/` subfolder. +Curated output from this annotation is included in the [archive of genome +annotations](https://doi.org/10.17617/3.9QTROS). Running workflow @@ -32,5 +33,3 @@ Running workflow To run on a local server, use `./run_snakemake.sh` script, and add rule names and additional parameters, e.g. `./run_snakemake.sh --dryrun`. 
- -[Documentation for `run_snakemake_sge.sh` TK] diff --git a/run_snakemake.sh b/run_snakemake.sh index 0896a9b..4f582f2 100755 --- a/run_snakemake.sh +++ b/run_snakemake.sh @@ -11,14 +11,14 @@ set -e # * Conda environments will be created in a subfolder `envs/` # PATHS -SNAKEMAKE_ENV= -WD= +SNAKEMAKE_ENV=/ebio/ag-swart/home/kbseah/anaconda3/envs/snakemake +WD=/ebio/abt2_projects/ag-swart-loxodes/annotation/falcon-comb_LmagMIC/pogigwasc_intronless # activate snakemake conda environment source activate $SNAKEMAKE_ENV snakemake \ ---cores 24 \ +--cores 16 \ --configfile $WD/workflow/config.yaml \ --use-conda \ --conda-frontend mamba \ diff --git a/workflow/config.yaml b/workflow/config.yaml index ee60f3b..ea8b36b 100644 --- a/workflow/config.yaml +++ b/workflow/config.yaml @@ -1,9 +1,9 @@ falcon-comb_LmagMIC: - ref_orig: + ref_orig: # Original reference genome assembly /ebio/abt2_projects/ag-swart-loxodes/assembly/falcon-comb_LmagMIC/scaffolds.fasta - ref_intronless_masked: + ref_intronless_masked: # Intronless assembly produced by Intronarrator /ebio/abt2_projects/ag-swart-loxodes/annotation/falcon-comb_LmagMIC/intronarrator/falcon-comb_LmagMIC.0.2.minus_introns.ncRNA_hard_masked.fa - realtrons_gff: + realtrons_gff: # Intron annotation GFF3 file produced by Intronarrator /ebio/abt2_projects/ag-swart-loxodes/annotation/falcon-comb_LmagMIC/intronarrator/all.realtrons.0.2.noalt.gff - trf_min1000: + trf_min1000: # Low-complexity sequence annotation BED file /ebio/abt2_projects/ag-swart-loxodes/annotation/falcon-comb_LmagMIC/trf/falcon-comb_LmagMIC.trf.no_overlap.min1000.merge.bed