From 350f37e3c15e990666768726650d612e46c88103 Mon Sep 17 00:00:00 2001
From: Ali Hamraoui <hamraoui@bio.ens.psl.eu>
Date: Fri, 12 Jul 2024 15:13:37 +0200
Subject: [PATCH] UPDATE: update documentation web page

---
 docs/examples/index.html      |  82 +++++++++++++++++++++++++++++++++-
 docs/images/._schema.png      | Bin 4096 -> 4096 bytes
 docs/index.html               |   2 +-
 docs/introduction/index.html  |   4 ++
 docs/search/search_index.json |   2 +-
 docs/sitemap.xml.gz           | Bin 127 -> 127 bytes
 6 files changed, 87 insertions(+), 3 deletions(-)
diff --git a/docs/examples/index.html b/docs/examples/index.html
index a2ca2f9..41bc3a6 100644
--- a/docs/examples/index.html
+++ b/docs/examples/index.html
@@ -53,6 +53,24 @@
               </ul>
               <ul class="current">
                 <li class="toctree-l1 current"><a class="reference internal current" href="#">Use example</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#examples">EXAMPLES</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#sample-data">Sample data</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#basic-workflow">BASIC WORKFLOW</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#with-pcr-amplifiction">WITH PCR AMPLIFICATION</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#with-simulated-cell-type-counts">WITH SIMULATED CELL TYPE COUNTS</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#with-personalized-error-model">WITH PERSONALIZED ERROR MODEL</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#complete-workflow">COMPLETE WORKFLOW</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
                 </li>
               </ul>
       </div>
@@ -78,7 +96,69 @@
           <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
             <div class="section" itemprop="articleBody">
               
-                
+                <h2 id="usage">Usage</h2>
+<p>Users can choose among 4 ways to simulate template reads:</p>
+<ul><li>use a real count matrix</li>
+<li>estimate the parameters from a real count matrix to simulate a synthetic count matrix</li>
+<li>specify their own input parameters</li>
+<li>a combination of the above options</li></ul>
+<p>We use the SPARSIM tool to simulate the count matrix. For more information about synthetic count matrices, please read the <a href="https://gitlab.com/sysbiobig/sparsim/-/blob/master/vignettes/sparsim.Rmd?ref_type=heads#Sec_Input_parameter_estimated_from_data">SPARSIM</a> documentation.</p>
+<h3 id="examples">EXAMPLES</h3>
+<h5 id="sample-data">Sample data</h5>
+<p>A demonstration dataset to initiate this workflow is accessible on Zenodo (DOI: <a href="https://zenodo.org/records/12731409">10.5281/zenodo.12731408</a>). This dataset is a subsample from a Nanopore run of the <a href="https://www.10xgenomics.com/datasets/5k-human-pbmcs-3-v3-1-chromium-controller-3-1-standard">10X 5k human PBMCs</a>.</p>
+<p>The human GRCh38 <a href="https://ftp.ensembl.org/pub/release-112/fasta/homo_sapiens/cdna/">reference transcriptome</a>, <a href="https://ftp.ensembl.org/pub/release-112/gtf/homo_sapiens/">GTF annotation</a> and <a href="https://ftp.ensembl.org/pub/release-112/fasta/homo_sapiens/dna/">FASTA reference genome</a> can be downloaded from Ensembl.</p>
+<h5 id="basic-workflow">BASIC WORKFLOW</h5>
+<pre><code class="language-bash"> nextflow run main.nf --matrix dataset/sub_pbmc_matrice.csv \
+                      --transcriptome dataset/Homo_sapiens.GRCh38.cdna.all.fa \
+                      --features gene_name \
+                      --gtf dataset/genes.gtf
+</code></pre>
+<h5 id="with-pcr-amplifiction">WITH PCR AMPLIFICATION</h5>
+<pre><code class="language-bash"> nextflow run main.nf --matrix dataset/sub_pbmc_matrice.csv \
+                      --transcriptome dataset/Homo_sapiens.GRCh38.cdna.all.fa \
+                      --features gene_name \
+                      --gtf dataset/GRCh38-2020-A-genes.gtf \
+                      --pcr_cycles 2 \
+                      --pcr_dup_rate 0.7 \
+                      --pcr_error_rate 0.00003
+</code></pre>
+<h5 id="with-simulated-cell-type-counts">WITH SIMULATED CELL TYPE COUNTS</h5>
+<pre><code class="language-bash"> nextflow run main.nf --matrix dataset/sub_pbmc_matrice.csv \
+                      --transcriptome dataset/Homo_sapiens.GRCh38.cdna.all.fa \
+                      --features gene_name \
+                      --gtf dataset/GRCh38-2020-A-genes.gtf \
+                      --sim_celltypes true \
+                      --cell_types_annotation dataset/sub_pbmc_cell_type.csv
+</code></pre>
+<h5 id="with-personalized-error-model">WITH PERSONALIZED ERROR MODEL</h5>
+<pre><code class="language-bash">nextflow run main.nf --matrix dataset/sub_pbmc_matrice.csv \
+                     --transcriptome dataset/Homo_sapiens.GRCh38.cdna.all.fa \
+                     --features gene_name \
+                     --gtf dataset/GRCh38-2020-A-genes.gtf \
+                     --build_model true \
+                     --fastq_model dataset/sub_pbmc_reads.fq \
+                     --ref_genome dataset/GRCh38-2020-A-genome.fa 
+</code></pre>
+<h5 id="complete-workflow">COMPLETE WORKFLOW</h5>
+<pre><code class="language-bash"> nextflow run main.nf --matrix dataset/sub_pbmc_matrice.csv \
+                      --transcriptome dataset/Homo_sapiens.GRCh38.cdna.all.fa \
+                      --features gene_name \
+                      --gtf dataset/GRCh38-2020-A-genes.gtf \
+                      --sim_celltypes true \
+                      --cell_types_annotation dataset/sub_pbmc_cell_type.csv \
+                      --build_model true \
+                      --fastq_model dataset/sub_pbmc_reads.fq \
+                      --ref_genome dataset/GRCh38-2020-A-genome.fa \
+                      --pcr_cycles 2 \
+                      --pcr_dup_rate 0.7 \
+                      --pcr_error_rate 0.00003
+</code></pre>
+<h2 id="results">Results</h2>
+<p>After execution, results will be available in the specified <code>--outdir</code>. This includes simulated Nanopore reads <code>.fastq</code>, along with log files and a QC report.</p>
+<h2 id="cleaning-up">Cleaning Up</h2>
+<p>To clean up temporary files generated by Nextflow:</p>
+<pre><code class="language-bash">nextflow clean -f
+</code></pre>
               
             </div>
           </div><footer>
diff --git a/docs/images/._schema.png b/docs/images/._schema.png
index 34d14118e6fb0bfe1e3d4fdfc490b8d9b5bff3dc..584864df3046162b0fce20d0f1c7fa7b1a2d8621 100644
GIT binary patch
delta 13
UcmZorXi%6C#UyfKW6TnM03b>P$p8QV

delta 13
UcmZorXi%6C#l)DuF=h!r03I6zT>t<8

diff --git a/docs/index.html b/docs/index.html
index a1ec6ce..02badc1 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -171,5 +171,5 @@ <h2 id="run-asarusim">Run AsaruSim</h2>
 
 <!--
 MkDocs version : 1.6.0
-Build Date UTC : 2024-06-24 15:16:33.822496+00:00
+Build Date UTC : 2024-07-12 13:10:38.343577+00:00
 -->
diff --git a/docs/introduction/index.html b/docs/introduction/index.html
index 8a18d11..4e5a337 100644
--- a/docs/introduction/index.html
+++ b/docs/introduction/index.html
@@ -48,6 +48,8 @@
     </li>
     <li class="toctree-l2"><a class="reference internal" href="#citations">Citations</a>
     </li>
+    <li class="toctree-l2"><a class="reference internal" href="#acknowledgements">Acknowledgements</a>
+    </li>
     </ul>
                 </li>
               </ul>
@@ -102,6 +104,8 @@ <h3 id="license">License</h3>
 <p>AsaruSim is released under the <a href="https://github.com/alihamraoui/AsaruSim/blob/main/LICENSE">GPL 3.0 license</a>.</p>
 <h3 id="citations">Citations</h3>
 <p>If you use AsaruSim in your work, please cite us.</p>
+<h3 id="acknowledgements">Acknowledgements</h3>
+<p>We would like to express our gratitude to <a href="https://github.com/youyupei">Youyupei</a> for the development of <a href="https://github.com/youyupei/SLSim">SLSim</a>, which has been helpful to the AsaruSim workflow. Additionally, our thanks go to the teams behind <a href="https://github.com/rrwick/Badread">Badread</a> and <a href="https://gitlab.com/sysbiobig/sparsim">SPARSim</a>, whose tools are integral to the AsaruSim workflow.</p>
               
             </div>
           </div><footer>
diff --git a/docs/search/search_index.json b/docs/search/search_index.json
index 4f36cfc..21558e9 100644
--- a/docs/search/search_index.json
+++ b/docs/search/search_index.json
@@ -1 +1 @@
-{"config":{"indexing":"full","lang":["en"],"min_search_length":3,"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"Welcome to AsaruSim AsaruSim is an automated Nextflow workflow designed for simulating 10x single-cell Nanopore reads. It allows to benchmark and optimize single-cell Nanopore long read data processing pipelines. Github repository AsaruSim . Requirements Nextflow - A workflow engine for complex data pipelines. Docker or Singularity - Containers for packaging necessary software, ensuring reproducibility. Install nextflow Ensure you have the following installed on your system: Java 8 or later : Nextflow requires Java. Install it from Oracle or use a package manager like apt, brew, or yum depending on your OS. curl -s https://get.nextflow.io | bash chmod +x nextflow sudo mv nextflow /usr/local/bin/ Nextflow is now installed and ready to use on your system. For further details on using Nextflow, refer to the official documentation . Install docker Install Docker . Run AsaruSim 1 - Clone the github repository git clone https://github.com/alihamraoui/AsaruSim.git cd AsaruSim 2 - Run using nextflow nextflow run main.nf --help","title":"Installation"},{"location":"#welcome-to-asarusim","text":"AsaruSim is an automated Nextflow workflow designed for simulating 10x single-cell Nanopore reads. It allows to benchmark and optimize single-cell Nanopore long read data processing pipelines. Github repository AsaruSim .","title":"Welcome to AsaruSim"},{"location":"#requirements","text":"Nextflow - A workflow engine for complex data pipelines. Docker or Singularity - Containers for packaging necessary software, ensuring reproducibility.","title":"Requirements"},{"location":"#install-nextflow","text":"Ensure you have the following installed on your system: Java 8 or later : Nextflow requires Java. Install it from Oracle or use a package manager like apt, brew, or yum depending on your OS. 
curl -s https://get.nextflow.io | bash chmod +x nextflow sudo mv nextflow /usr/local/bin/ Nextflow is now installed and ready to use on your system. For further details on using Nextflow, refer to the official documentation .","title":"Install nextflow"},{"location":"#install-docker","text":"Install Docker .","title":"Install docker"},{"location":"#run-asarusim","text":"1 - Clone the github repository git clone https://github.com/alihamraoui/AsaruSim.git cd AsaruSim 2 - Run using nextflow nextflow run main.nf --help","title":"Run AsaruSim"},{"location":"examples/","text":"","title":"Use example"},{"location":"introduction/","text":"AsaruSim AsaruSim derives from the Amazigh word Asaru (\u2d30\u2d59\u2d30\u2d54\u2d53 ) which can mean \"pipeline\" or \"channel\". AsaruSim is an automated Nextflow workflow designed for simulating 10x single-cell Nanopore data from the count matrix level to the sequence level. It aimed at creating a gold standard dataset for the assessment and optimization of single-cell long-read methods. Five major steps are implemented : (1) - Simulation of a synthetic UMI count matrix. (2) - Generation of perfect raw reads. (3) - Amplification of the perfect reads. (4) - Generation of realistic synthetic reads by adding errors to mimic real reads. (5) - Production of a report with quality control values and plots calculated on the resulting synthetic reads. Contributing Contributions are more than welcome. See the git repository page . License AsaruSim is released under the GPL 3.0 license . Citations If you use AsaruSim in your work, please cite us.","title":"Introduction"},{"location":"introduction/#asarusim","text":"AsaruSim derives from the Amazigh word Asaru (\u2d30\u2d59\u2d30\u2d54\u2d53 ) which can mean \"pipeline\" or \"channel\". AsaruSim is an automated Nextflow workflow designed for simulating 10x single-cell Nanopore data from the count matrix level to the sequence level. 
It aimed at creating a gold standard dataset for the assessment and optimization of single-cell long-read methods. Five major steps are implemented : (1) - Simulation of a synthetic UMI count matrix. (2) - Generation of perfect raw reads. (3) - Amplification of the perfect reads. (4) - Generation of realistic synthetic reads by adding errors to mimic real reads. (5) - Production of a report with quality control values and plots calculated on the resulting synthetic reads.","title":"AsaruSim"},{"location":"introduction/#contributing","text":"Contributions are more than welcome. See the git repository page .","title":"Contributing"},{"location":"introduction/#license","text":"AsaruSim is released under the GPL 3.0 license .","title":"License"},{"location":"introduction/#citations","text":"If you use AsaruSim in your work, please cite us.","title":"Citations"},{"location":"parameters/","text":"Input parameters AsaruSim simulation requires an input parameter to work, describing the macro characteristics of the desired synthetic reads. Count matrix --matrix The --matrix parameter is a feature-by-cell (gene/cell or isoform/cell) count table (.CSV) file where rows represent the features of interest (genes or transcripts) and columns represent cells (or spatial barcodes) . The input matrix may be derived from an existing single-cell short- or long-read preprocessed run. this parameter is required . AsaruSim require a feature name and GTF annotation for gene-per-cell matrix Since the sequence names in the reference transcriptome correspond to transcript ids , it is necessary for users to specify the feature name within their matrix when gene-per-cell matrix is provided. Set the feature name using the --features parameter. Available options include: transcript_id (default) gene_id gene_name Additionally, users are required to supply a gene annotation (.gtf format) file using the --gtf parameter. 
--bc_counts To simulate specific UMI counts per cell barcode, set the --bc_counts parameter to the path of a UMI counts .CSV file . This parameter eliminates the need for an input matrix, enabling the simulation of UMI counts where transcripts are chosed randomly. CB counts ACGGCGATCGCGAGCC 1260 ACGGCGATCGCGAGCC 1104 --cell_types_annotation AsaruSim can generate synthetic count tables with varying cell types , differentially expressed genes , or isoforms . This capability is particularly useful for simulating count matrices that mimic the characteristics of existing cell populations. To simulate cell groups, AsaruSim requirs suplimentary parameters describing the characteristic of desired cell groups. Requirements for simulating cell groups : AsaruSim use SPARSim R package to simulate synthetic count table . for each cell group to simulate, SPARSim needs 3 information as input: expression level intensities . expression level variabilities . cell group library sizes . fore more information see SPARSim vignettes . AsaruSim allows user to estimate this characteristic from an existing count table. To do so, the user need to set --sim_celltypes parameter to true and to provide the list of cell barcodes of each group (.CSV file) using --cell_types_annotation parameter: CB cell_type ACGGCGATCGCGAGCC type 1 ACGGCGATCGCGAGCC type 2 AsaruSim will then use the provided matrix to estimate characteristic of each cell groups and generate a synthetic count matrix. Template AsaruSim generates reads that correspond to a 10X Genomics library construction coupled with Nanopore sequencing. The final construction corresponds to : an adaptor sequence composed of 10X and Nanopore adaptors, a cellular barcode (CB), UMI sequences at the same frequencies as in the synthetic count matrix, a 20 bp oligo(dT) , the feature-corresponding cDNA sequence from the reference transcriptome and a template switch oligo (TSO) at the end. --dT_LENGTH Specifies the length of the oligo(dT) tail sequence. 
Default: 20 bp. --ADAPTER_SEQ Defines the sequence of the adapter used in the ONT and 10X Genomics libraries. By default, the 10X 3' solution V3 adapter sequence is used Default: ACTAAAGGCCATTACGGCCTACACGACGCTCTTCCGATCT . --TSO_SEQ Specifies the sequence of the Template Switching Oligo (TSO) nucleotid. Default: TGTACTCTGCGTTGATACCACTGCTT . Reference transcriptome The feature-corresponding cDNA sequence is sampled from the reference transcriptome. --transcriptome A reference transcriptome file in .fasta format can be downloaded from Ensembl . --length_dist To mimic the real read length distribution when a gene expression matrix is provided, a realistic read length distribution is achieved by selecting a random cDNA of the corresponding gene, with a prior probability favoring short-length cDNA. AsaruSim estimates this read length using a log-normal distribution . Users may provide their parameters to personalize the distribution using three comma-delimited values (shape, location, scale) with the parameter --length_dist . ( default : 0.37,0.0,825 ) Shape (\u03c3) : The standard deviation of the log values. Location (\u03bc) : The location parameter using the basic form of the log-normal distribution. Scale : The scale factor (the median of your distribution). Fit read distribution of real reads Users may also fit their real reads distribution with this approach by providing a subset of real reads (in .FASTQ format) using the --model_fastq parameter. (See also build model in the Error model section) PCR amplification AsaruSim take into account the bias of PCR amplification introduced during library constructions process. The PCR amplification is simulated by replicating the synthetic reads at each cycle, with a capturing probabily and un error rate. --PCR_cycles The number of PCR cycles to simulate. 
During each cycle, the reads are duplicated exponentially, following the formula: $$\\ N = N_0 \\times (1 + E)^{C} \\ $$ where: N is the final number of reads, N0 is the initial number of reads, E is the efficiency rate, and C is the number of cycles. --PCR_efficiency The efficiency rate of duplication is fixed by the user ( default : --PCR_efficiency 0.9 ) --PCR_error_rate The probability to be mutated during the process for each nucleotide in the duplicated read. The error rate is also fixed by the user ( default : --PCR_error_rate 3.5e-05 ) --total_reads Number of total reads to random subset from the resulting artificial PCR product, to mimic the experimental protocol where only a subset of the sample is used for the sequencing step. Users can use amplification rate instead of PCR amplification Inspired by SLSim, the amplification rate allows users to repeat each template read a specified number of times. This is a simpler way to simulate amplification with: $$\\ x \\sim \\text{Poi}(\\text{amp_rate}) \\ $$ The value of x is set by the user using --amp_rate Error model AsaruSim uses the Badread Python library to simulate nanopore sequencing errors and assign per-base quality scores based on pre-trained error models (see Badread documentation for more information). To do so, AsaruSim requires: --trained_model This allows the user to choose one of the built-in error models within the Badread database. The possible values are: nanopore2023 : a model trained on ONT R10.4.1 reads from 2023 (the default). nanopore2020 : a model trained on ONT R9.4.1 reads from 2020. nanopore2018 : a model trained on ONT R9.4/R9.4.1 reads from 2018. random : a random error model with a 1/3 chance each of insertion, deletion, and substitution. a file path for a trained model. --badread_identity Badread uses the Beta distribution to sample read identities. The distribution is defined with three parameters: mean , standard deviation , and maximum value. 
To pass these parameters to AsaruSim, use three comma-delimited values (identity mean, max, stdev). default : --badread_identity 95,99,2.5 . --build_model To internally train a personalized read identity , Qscore , and error models, AsaruSim requires a real FASTQ read file that can be provided using --model_fastq and a reference genome (.FASTA) file using --ref_genome . AsaruSim also accepts pre-built model files Users can use --error_model and --qscore_model to provide Badread pre-built models in file format.","title":"Input parameters"},{"location":"parameters/#input-parameters","text":"AsaruSim simulation requires an input parameter to work, describing the macro characteristics of the desired synthetic reads.","title":"Input parameters"},{"location":"parameters/#count-matrix","text":"","title":"Count matrix"},{"location":"parameters/#-matrix","text":"The --matrix parameter is a feature-by-cell (gene/cell or isoform/cell) count table (.CSV) file where rows represent the features of interest (genes or transcripts) and columns represent cells (or spatial barcodes) . The input matrix may be derived from an existing single-cell short- or long-read preprocessed run. this parameter is required . AsaruSim require a feature name and GTF annotation for gene-per-cell matrix Since the sequence names in the reference transcriptome correspond to transcript ids , it is necessary for users to specify the feature name within their matrix when gene-per-cell matrix is provided. Set the feature name using the --features parameter. Available options include: transcript_id (default) gene_id gene_name Additionally, users are required to supply a gene annotation (.gtf format) file using the --gtf parameter.","title":"--matrix"},{"location":"parameters/#-bc_counts","text":"To simulate specific UMI counts per cell barcode, set the --bc_counts parameter to the path of a UMI counts .CSV file . 
This parameter eliminates the need for an input matrix, enabling the simulation of UMI counts where transcripts are chosed randomly. CB counts ACGGCGATCGCGAGCC 1260 ACGGCGATCGCGAGCC 1104","title":"--bc_counts"},{"location":"parameters/#-cell_types_annotation","text":"AsaruSim can generate synthetic count tables with varying cell types , differentially expressed genes , or isoforms . This capability is particularly useful for simulating count matrices that mimic the characteristics of existing cell populations. To simulate cell groups, AsaruSim requirs suplimentary parameters describing the characteristic of desired cell groups. Requirements for simulating cell groups : AsaruSim use SPARSim R package to simulate synthetic count table . for each cell group to simulate, SPARSim needs 3 information as input: expression level intensities . expression level variabilities . cell group library sizes . fore more information see SPARSim vignettes . AsaruSim allows user to estimate this characteristic from an existing count table. To do so, the user need to set --sim_celltypes parameter to true and to provide the list of cell barcodes of each group (.CSV file) using --cell_types_annotation parameter: CB cell_type ACGGCGATCGCGAGCC type 1 ACGGCGATCGCGAGCC type 2 AsaruSim will then use the provided matrix to estimate characteristic of each cell groups and generate a synthetic count matrix.","title":"--cell_types_annotation"},{"location":"parameters/#template","text":"AsaruSim generates reads that correspond to a 10X Genomics library construction coupled with Nanopore sequencing. 
The final construction corresponds to : an adaptor sequence composed of 10X and Nanopore adaptors, a cellular barcode (CB), UMI sequences at the same frequencies as in the synthetic count matrix, a 20 bp oligo(dT) , the feature-corresponding cDNA sequence from the reference transcriptome and a template switch oligo (TSO) at the end.","title":"Template"},{"location":"parameters/#-dt_length","text":"Specifies the length of the oligo(dT) tail sequence. Default: 20 bp.","title":"--dT_LENGTH"},{"location":"parameters/#-adapter_seq","text":"Defines the sequence of the adapter used in the ONT and 10X Genomics libraries. By default, the 10X 3' solution V3 adapter sequence is used Default: ACTAAAGGCCATTACGGCCTACACGACGCTCTTCCGATCT .","title":"--ADAPTER_SEQ"},{"location":"parameters/#-tso_seq","text":"Specifies the sequence of the Template Switching Oligo (TSO) nucleotid. Default: TGTACTCTGCGTTGATACCACTGCTT .","title":"--TSO_SEQ"},{"location":"parameters/#reference-transcriptome","text":"The feature-corresponding cDNA sequence is sampled from the reference transcriptome.","title":"Reference transcriptome"},{"location":"parameters/#-transcriptome","text":"A reference transcriptome file in .fasta format can be downloaded from Ensembl .","title":"--transcriptome"},{"location":"parameters/#-length_dist","text":"To mimic the real read length distribution when a gene expression matrix is provided, a realistic read length distribution is achieved by selecting a random cDNA of the corresponding gene, with a prior probability favoring short-length cDNA. AsaruSim estimates this read length using a log-normal distribution . Users may provide their parameters to personalize the distribution using three comma-delimited values (shape, location, scale) with the parameter --length_dist . ( default : 0.37,0.0,825 ) Shape (\u03c3) : The standard deviation of the log values. Location (\u03bc) : The location parameter using the basic form of the log-normal distribution. 
Scale : The scale factor (the median of your distribution). Fit read distribution of real reads Users may also fit their real reads distribution with this approach by providing a subset of real reads (in .FASTQ format) using the --model_fastq parameter. (See also build model in the Error model section)","title":"--length_dist"},{"location":"parameters/#pcr-amplification","text":"AsaruSim take into account the bias of PCR amplification introduced during library constructions process. The PCR amplification is simulated by replicating the synthetic reads at each cycle, with a capturing probabily and un error rate.","title":"PCR amplification"},{"location":"parameters/#-pcr_cycles","text":"The number of PCR cycles to simulate. During each cycle, the reads are duplicated exponentially, following the formula: $$\\ N = N_0 \\times (1 + E)^{C} \\ $$ where: N is the final number of reads, N0 is the initial number of reads, E is the efficiency rate, and C is the number of cycles.","title":"--PCR_cycles"},{"location":"parameters/#-pcr_efficiency","text":"The efficiency rate of duplication is fixed by the user ( default : --PCR_efficiency 0.9 )","title":"--PCR_efficiency"},{"location":"parameters/#-pcr_error_rate","text":"The probability to be mutated during the process for each nucleotide in the duplicated read. The error rate is also fixed by the user ( default : --PCR_error_rate 3.5e-05 )","title":"--PCR_error_rate"},{"location":"parameters/#-total_reads","text":"Number of total reads to random subset from the resulting artificial PCR product, to mimic the experimental protocol where only a subset of the sample is used for the sequencing step. Users can use amplification rate instead of PCR amplification Inspired by SLSim, the amplification rate allows users to repeat each template read a specified number of times. 
This is a simpler way to simulate amplification with: $$\\ x \\sim \\text{Poi}(\\text{amp_rate}) \\ $$ The value of x is set by the user using --amp_rate","title":"--total_reads"},{"location":"parameters/#error-model","text":"AsaruSim uses the Badread Python library to simulate nanopore sequencing errors and assign per-base quality scores based on pre-trained error models (see Badread documentation for more information). To do so, AsaruSim requires:","title":"Error model"},{"location":"parameters/#-trained_model","text":"This allows the user to choose one of the built-in error models within the Badread database. The possible values are: nanopore2023 : a model trained on ONT R10.4.1 reads from 2023 (the default). nanopore2020 : a model trained on ONT R9.4.1 reads from 2020. nanopore2018 : a model trained on ONT R9.4/R9.4.1 reads from 2018. random : a random error model with a 1/3 chance each of insertion, deletion, and substitution. a file path for a trained model.","title":"--trained_model"},{"location":"parameters/#-badread_identity","text":"Badread uses the Beta distribution to sample read identities. The distribution is defined with three parameters: mean , standard deviation , and maximum value. To pass these parameters to AsaruSim, use three comma-delimited values (identity mean, max, stdev). default : --badread_identity 95,99,2.5 .","title":"--badread_identity"},{"location":"parameters/#-build_model","text":"To internally train a personalized read identity , Qscore , and error models, AsaruSim requires a real FASTQ read file that can be provided using --model_fastq and a reference genome (.FASTA) file using --ref_genome . AsaruSim also accepts pre-built model files Users can use --error_model and --qscore_model to provide Badread pre-built models in file format.","title":"--build_model"}]}
\ No newline at end of file
+{"config":{"indexing":"full","lang":["en"],"min_search_length":3,"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"Welcome to AsaruSim AsaruSim is an automated Nextflow workflow designed for simulating 10x single-cell Nanopore reads. It allows to benchmark and optimize single-cell Nanopore long read data processing pipelines. Github repository AsaruSim . Requirements Nextflow - A workflow engine for complex data pipelines. Docker or Singularity - Containers for packaging necessary software, ensuring reproducibility. Install nextflow Ensure you have the following installed on your system: Java 8 or later : Nextflow requires Java. Install it from Oracle or use a package manager like apt, brew, or yum depending on your OS. curl -s https://get.nextflow.io | bash chmod +x nextflow sudo mv nextflow /usr/local/bin/ Nextflow is now installed and ready to use on your system. For further details on using Nextflow, refer to the official documentation . Install docker Install Docker . Run AsaruSim 1 - Clone the github repository git clone https://github.com/alihamraoui/AsaruSim.git cd AsaruSim 2 - Run using nextflow nextflow run main.nf --help","title":"Installation"},{"location":"#welcome-to-asarusim","text":"AsaruSim is an automated Nextflow workflow designed for simulating 10x single-cell Nanopore reads. It allows to benchmark and optimize single-cell Nanopore long read data processing pipelines. Github repository AsaruSim .","title":"Welcome to AsaruSim"},{"location":"#requirements","text":"Nextflow - A workflow engine for complex data pipelines. Docker or Singularity - Containers for packaging necessary software, ensuring reproducibility.","title":"Requirements"},{"location":"#install-nextflow","text":"Ensure you have the following installed on your system: Java 8 or later : Nextflow requires Java. Install it from Oracle or use a package manager like apt, brew, or yum depending on your OS. 
curl -s https://get.nextflow.io | bash chmod +x nextflow sudo mv nextflow /usr/local/bin/ Nextflow is now installed and ready to use on your system. For further details on using Nextflow, refer to the official documentation .","title":"Install nextflow"},{"location":"#install-docker","text":"Install Docker .","title":"Install docker"},{"location":"#run-asarusim","text":"1 - Clone the github repository git clone https://github.com/alihamraoui/AsaruSim.git cd AsaruSim 2 - Run using nextflow nextflow run main.nf --help","title":"Run AsaruSim"},{"location":"examples/","text":"Usage Users can choose among 4 ways to simulate template reads. - use a real count matrix - estimate the parameters from a real count matrix to simulate a synthetic count matrix - specify the input parameters themselves - a combination of the above options We use the SPARSim tool to simulate the count matrix. For more information about the synthetic count matrix, please read the SPARSim documentation. EXAMPLES Sample data A demonstration dataset to initiate this workflow is accessible on zenodo DOI : 10.5281/zenodo.12731408 . This dataset is a subsample from a Nanopore run of the 10X 5k human pbmcs . The human GRCh38 reference transcriptome , gtf annotation and fasta reference genome can be downloaded from Ensembl.
BASIC WORKFLOW nextflow run main.nf --matrix dataset/sub_pbmc_matrice.csv \\ --transcriptome dataset/Homo_sapiens.GRCh38.cdna.all.fa \\ --features gene_name \\ --gtf dataset/genes.gtf WITH PCR AMPLIFICTION nextflow run main.nf --matrix dataset/sub_pbmc_matrice.csv \\ --transcriptome dataset/Homo_sapiens.GRCh38.cdna.all.fa \\ --features gene_name \\ --gtf dataset/GRCh38-2020-A-genes.gtf \\ --pcr_cycles 2 \\ --pcr_dup_rate 0.7 \\ --pcr_error_rate 0.00003 WITH SIMULATED CELL TYPE COUNTS nextflow run main.nf --matrix dataset/sub_pbmc_matrice.csv \\ --transcriptome dataset/Homo_sapiens.GRCh38.cdna.all.fa \\ --features gene_name \\ --gtf dataset/GRCh38-2020-A-genes.gtf \\ --sim_celltypes true \\ --cell_types_annotation dataset/sub_pbmc_cell_type.csv WITH PERSONALIZED ERROR MODEL nextflow run main.nf --matrix dataset/sub_pbmc_matrice.csv \\ --transcriptome dataset/Homo_sapiens.GRCh38.cdna.all.fa \\ --features gene_name \\ --gtf dataset/GRCh38-2020-A-genes.gtf \\ --build_model true \\ --fastq_model dataset/sub_pbmc_reads.fq \\ --ref_genome dataset/GRCh38-2020-A-genome.fa COMPLETE WORKFLOW nextflow run main.nf --matrix dataset/sub_pbmc_matrice.csv \\ --transcriptome dataset/Homo_sapiens.GRCh38.cdna.all.fa \\ --features gene_name \\ --gtf dataset/GRCh38-2020-A-genes.gtf \\ --sim_celltypes true \\ --cell_types_annotation dataset/sub_pbmc_cell_type.csv --build_model true \\ --fastq_model dataset/sub_pbmc_reads.fq \\ --ref_genome dataset/GRCh38-2020-A-genome.fa --pcr_cycles 2 \\ --pcr_dup_rate 0.7 \\ --pcr_error_rate 0.00003 Results After execution, results will be available in the specified --outdir . This includes simulated Nanopore reads .fastq , along with log files and QC report. Cleaning Up To clean up temporary files generated by Nextflow: nextflow clean -f","title":"Use example"},{"location":"examples/#usage","text":"User can choose among 4 ways to simulate template reads. 
- use a real count matrix - estimate the parameters from a real count matrix to simulate a synthetic count matrix - specify the input parameters themselves - a combination of the above options We use the SPARSim tool to simulate the count matrix. For more information about the synthetic count matrix, please read the SPARSim documentation.","title":"Usage"},{"location":"examples/#examples","text":"","title":"EXAMPLES"},{"location":"examples/#sample-data","text":"A demonstration dataset to initiate this workflow is accessible on zenodo DOI : 10.5281/zenodo.12731408 . This dataset is a subsample from a Nanopore run of the 10X 5k human pbmcs . The human GRCh38 reference transcriptome , gtf annotation and fasta reference genome can be downloaded from Ensembl.","title":"Sample data"},{"location":"examples/#basic-workflow","text":"nextflow run main.nf --matrix dataset/sub_pbmc_matrice.csv \\ --transcriptome dataset/Homo_sapiens.GRCh38.cdna.all.fa \\ --features gene_name \\ --gtf dataset/genes.gtf","title":"BASIC WORKFLOW"},{"location":"examples/#with-pcr-amplifiction","text":"nextflow run main.nf --matrix dataset/sub_pbmc_matrice.csv \\ --transcriptome dataset/Homo_sapiens.GRCh38.cdna.all.fa \\ --features gene_name \\ --gtf dataset/GRCh38-2020-A-genes.gtf \\ --pcr_cycles 2 \\ --pcr_dup_rate 0.7 \\ --pcr_error_rate 0.00003","title":"WITH PCR AMPLIFICTION"},{"location":"examples/#with-simulated-cell-type-counts","text":"nextflow run main.nf --matrix dataset/sub_pbmc_matrice.csv \\ --transcriptome dataset/Homo_sapiens.GRCh38.cdna.all.fa \\ --features gene_name \\ --gtf dataset/GRCh38-2020-A-genes.gtf \\ --sim_celltypes true \\ --cell_types_annotation dataset/sub_pbmc_cell_type.csv","title":"WITH SIMULATED CELL TYPE COUNTS"},{"location":"examples/#with-personalized-error-model","text":"nextflow run main.nf --matrix dataset/sub_pbmc_matrice.csv \\ --transcriptome dataset/Homo_sapiens.GRCh38.cdna.all.fa \\ --features gene_name \\ --gtf dataset/GRCh38-2020-A-genes.gtf \\ --build_model true
\\ --fastq_model dataset/sub_pbmc_reads.fq \\ --ref_genome dataset/GRCh38-2020-A-genome.fa","title":"WITH PERSONALIZED ERROR MODEL"},{"location":"examples/#complete-workflow","text":"nextflow run main.nf --matrix dataset/sub_pbmc_matrice.csv \\ --transcriptome dataset/Homo_sapiens.GRCh38.cdna.all.fa \\ --features gene_name \\ --gtf dataset/GRCh38-2020-A-genes.gtf \\ --sim_celltypes true \\ --cell_types_annotation dataset/sub_pbmc_cell_type.csv --build_model true \\ --fastq_model dataset/sub_pbmc_reads.fq \\ --ref_genome dataset/GRCh38-2020-A-genome.fa --pcr_cycles 2 \\ --pcr_dup_rate 0.7 \\ --pcr_error_rate 0.00003","title":"COMPLETE WORKFLOW"},{"location":"examples/#results","text":"After execution, results will be available in the specified --outdir . This includes simulated Nanopore reads .fastq , along with log files and QC report.","title":"Results"},{"location":"examples/#cleaning-up","text":"To clean up temporary files generated by Nextflow: nextflow clean -f","title":"Cleaning Up"},{"location":"introduction/","text":"AsaruSim AsaruSim derives from the Amazigh word Asaru (\u2d30\u2d59\u2d30\u2d54\u2d53 ) which can mean \"pipeline\" or \"channel\". AsaruSim is an automated Nextflow workflow designed for simulating 10x single-cell Nanopore data from the count matrix level to the sequence level. It aimed at creating a gold standard dataset for the assessment and optimization of single-cell long-read methods. Five major steps are implemented : (1) - Simulation of a synthetic UMI count matrix. (2) - Generation of perfect raw reads. (3) - Amplification of the perfect reads. (4) - Generation of realistic synthetic reads by adding errors to mimic real reads. (5) - Production of a report with quality control values and plots calculated on the resulting synthetic reads. Contributing Contributions are more than welcome. See the git repository page . License AsaruSim is released under the GPL 3.0 license . Citations If you use AsaruSim in your work, please cite us. 
Acknowledgements We would like to express our gratitude to Youyupei for the development of SLSim , which has been helpful to the AsaruSim workflow. Additionally, our thanks go to the teams behind Badread and SPARSim , whose tools are integral to the AsaruSim workflow.","title":"Introduction"},{"location":"introduction/#asarusim","text":"AsaruSim derives from the Amazigh word Asaru (\u2d30\u2d59\u2d30\u2d54\u2d53 ) which can mean \"pipeline\" or \"channel\". AsaruSim is an automated Nextflow workflow designed for simulating 10x single-cell Nanopore data from the count matrix level to the sequence level. It aimed at creating a gold standard dataset for the assessment and optimization of single-cell long-read methods. Five major steps are implemented : (1) - Simulation of a synthetic UMI count matrix. (2) - Generation of perfect raw reads. (3) - Amplification of the perfect reads. (4) - Generation of realistic synthetic reads by adding errors to mimic real reads. (5) - Production of a report with quality control values and plots calculated on the resulting synthetic reads.","title":"AsaruSim"},{"location":"introduction/#contributing","text":"Contributions are more than welcome. See the git repository page .","title":"Contributing"},{"location":"introduction/#license","text":"AsaruSim is released under the GPL 3.0 license .","title":"License"},{"location":"introduction/#citations","text":"If you use AsaruSim in your work, please cite us.","title":"Citations"},{"location":"introduction/#acknowledgements","text":"We would like to express our gratitude to Youyupei for the development of SLSim , which has been helpful to the AsaruSim workflow. Additionally, our thanks go to the teams behind Badread and SPARSim , whose tools are integral to the AsaruSim workflow.","title":"Acknowledgements"},{"location":"parameters/","text":"Input parameters AsaruSim simulation requires an input parameter to work, describing the macro characteristics of the desired synthetic reads. 
Count matrix --matrix The --matrix parameter is a feature-by-cell (gene/cell or isoform/cell) count table (.CSV) file where rows represent the features of interest (genes or transcripts) and columns represent cells (or spatial barcodes) . The input matrix may be derived from an existing single-cell short- or long-read preprocessed run. This parameter is required . AsaruSim requires a feature name and GTF annotation for a gene-per-cell matrix Since the sequence names in the reference transcriptome correspond to transcript ids , it is necessary for users to specify the feature name within their matrix when a gene-per-cell matrix is provided. Set the feature name using the --features parameter. Available options include: transcript_id (default) gene_id gene_name Additionally, users are required to supply a gene annotation (.gtf format) file using the --gtf parameter. --bc_counts To simulate specific UMI counts per cell barcode, set the --bc_counts parameter to the path of a UMI counts .CSV file . This parameter eliminates the need for an input matrix, enabling the simulation of UMI counts where transcripts are chosen randomly. CB counts ACGGCGATCGCGAGCC 1260 ACGGCGATCGCGAGCC 1104 --cell_types_annotation AsaruSim can generate synthetic count tables with varying cell types , differentially expressed genes , or isoforms . This capability is particularly useful for simulating count matrices that mimic the characteristics of existing cell populations. To simulate cell groups, AsaruSim requires supplementary parameters describing the characteristics of the desired cell groups. Requirements for simulating cell groups : AsaruSim uses the SPARSim R package to simulate a synthetic count table . For each cell group to simulate, SPARSim needs 3 pieces of information as input: expression level intensities . expression level variabilities . cell group library sizes . For more information, see SPARSim vignettes . AsaruSim allows users to estimate these characteristics from an existing count table.
To do so, the user needs to set the --sim_celltypes parameter to true and to provide the list of cell barcodes of each group (.CSV file) using the --cell_types_annotation parameter: CB cell_type ACGGCGATCGCGAGCC type 1 ACGGCGATCGCGAGCC type 2 AsaruSim will then use the provided matrix to estimate the characteristics of each cell group and generate a synthetic count matrix. Template AsaruSim generates reads that correspond to a 10X Genomics library construction coupled with Nanopore sequencing. The final construction corresponds to : an adaptor sequence composed of 10X and Nanopore adaptors, a cellular barcode (CB), UMI sequences at the same frequencies as in the synthetic count matrix, a 20 bp oligo(dT) , the feature-corresponding cDNA sequence from the reference transcriptome and a template switch oligo (TSO) at the end. --dT_LENGTH Specifies the length of the oligo(dT) tail sequence. Default: 20 bp. --ADAPTER_SEQ Defines the sequence of the adapter used in the ONT and 10X Genomics libraries. By default, the 10X 3' solution V3 adapter sequence is used. Default: ACTAAAGGCCATTACGGCCTACACGACGCTCTTCCGATCT . --TSO_SEQ Specifies the sequence of the Template Switching Oligo (TSO) nucleotide. Default: TGTACTCTGCGTTGATACCACTGCTT . Reference transcriptome The feature-corresponding cDNA sequence is sampled from the reference transcriptome. --transcriptome A reference transcriptome file in .fasta format can be downloaded from Ensembl . --length_dist To mimic the real read length distribution when a gene expression matrix is provided, a realistic read length distribution is achieved by selecting a random cDNA of the corresponding gene, with a prior probability favoring short-length cDNA. AsaruSim estimates this read length using a log-normal distribution . Users may provide their parameters to personalize the distribution using three comma-delimited values (shape, location, scale) with the parameter --length_dist . 
( default : 0.37,0.0,825 ) Shape (\u03c3) : The standard deviation of the log values. Location (\u03bc) : The location parameter using the basic form of the log-normal distribution. Scale : The scale factor (the median of your distribution). Fit read distribution of real reads Users may also fit their real reads distribution with this approach by providing a subset of real reads (in .FASTQ format) using the --model_fastq parameter. (See also build model in the Error model section) PCR amplification AsaruSim takes into account the bias of PCR amplification introduced during the library construction process. The PCR amplification is simulated by replicating the synthetic reads at each cycle, with a capture probability and an error rate. --PCR_cycles The number of PCR cycles to simulate. During each cycle, the reads are duplicated exponentially, following the formula: $$\\ N = N_0 \\times (1 + E)^{C} \\ $$ where: N is the final number of reads, N0 is the initial number of reads, E is the efficiency rate, and C is the number of cycles. --PCR_efficiency The efficiency rate of duplication is fixed by the user ( default : --PCR_efficiency 0.9 ) --PCR_error_rate The probability to be mutated during the process for each nucleotide in the duplicated read. The error rate is also fixed by the user ( default : --PCR_error_rate 3.5e-05 ) --total_reads Total number of reads to randomly subsample from the resulting artificial PCR product, to mimic the experimental protocol where only a subset of the sample is used for the sequencing step. Users can use amplification rate instead of PCR amplification Inspired by SLSim, the amplification rate allows users to repeat each template read a specified number of times. 
This is a simpler way to simulate amplification with: $$\\ x \\sim \\text{Poi}(\\text{amp_rate}) \\ $$ The value of x is set by the user using --amp_rate Error model AsaruSim uses the Badread Python library to simulate nanopore sequencing errors and assign per-base quality scores based on pre-trained error models (see Badread documentation for more information). To do so, AsaruSim requires: --trained_model This allows the user to choose one of the built-in error models within the Badread database. The possible values are: nanopore2023 : a model trained on ONT R10.4.1 reads from 2023 (the default). nanopore2020 : a model trained on ONT R9.4.1 reads from 2020. nanopore2018 : a model trained on ONT R9.4/R9.4.1 reads from 2018. random : a random error model with a 1/3 chance each of insertion, deletion, and substitution. a file path for a trained model. --badread_identity Badread uses the Beta distribution to sample read identities. The distribution is defined with three parameters: mean , standard deviation , and maximum value. To pass these parameters to AsaruSim, use three comma-delimited values (identity mean, max, stdev). default : --badread_identity 95,99,2.5 . --build_model To internally train a personalized read identity , Qscore , and error models, AsaruSim requires a real FASTQ read file that can be provided using --model_fastq and a reference genome (.FASTA) file using --ref_genome . 
AsaruSim also accepts pre-built model files Users can use --error_model and --qscore_model to provide Badread pre-built models in file format.","title":"Input parameters"},{"location":"parameters/#input-parameters","text":"AsaruSim simulation requires an input parameter to work, describing the macro characteristics of the desired synthetic reads.","title":"Input parameters"},{"location":"parameters/#count-matrix","text":"","title":"Count matrix"},{"location":"parameters/#-matrix","text":"The --matrix parameter is a feature-by-cell (gene/cell or isoform/cell) count table (.CSV) file where rows represent the features of interest (genes or transcripts) and columns represent cells (or spatial barcodes) . The input matrix may be derived from an existing single-cell short- or long-read preprocessed run. This parameter is required . AsaruSim requires a feature name and GTF annotation for gene-per-cell matrix Since the sequence names in the reference transcriptome correspond to transcript ids , it is necessary for users to specify the feature name within their matrix when a gene-per-cell matrix is provided. Set the feature name using the --features parameter. Available options include: transcript_id (default) gene_id gene_name Additionally, users are required to supply a gene annotation (.gtf format) file using the --gtf parameter.","title":"--matrix"},{"location":"parameters/#-bc_counts","text":"To simulate specific UMI counts per cell barcode, set the --bc_counts parameter to the path of a UMI counts .CSV file . This parameter eliminates the need for an input matrix, enabling the simulation of UMI counts where transcripts are chosen randomly. CB counts ACGGCGATCGCGAGCC 1260 ACGGCGATCGCGAGCC 1104","title":"--bc_counts"},{"location":"parameters/#-cell_types_annotation","text":"AsaruSim can generate synthetic count tables with varying cell types , differentially expressed genes , or isoforms . 
This capability is particularly useful for simulating count matrices that mimic the characteristics of existing cell populations. To simulate cell groups, AsaruSim requires supplementary parameters describing the characteristics of the desired cell groups. Requirements for simulating cell groups : AsaruSim uses the SPARSim R package to simulate a synthetic count table . For each cell group to simulate, SPARSim needs three pieces of information as input: expression level intensities . expression level variabilities . cell group library sizes . For more information, see SPARSim vignettes . AsaruSim allows users to estimate these characteristics from an existing count table. To do so, the user needs to set the --sim_celltypes parameter to true and to provide the list of cell barcodes of each group (.CSV file) using the --cell_types_annotation parameter: CB cell_type ACGGCGATCGCGAGCC type 1 ACGGCGATCGCGAGCC type 2 AsaruSim will then use the provided matrix to estimate the characteristics of each cell group and generate a synthetic count matrix.","title":"--cell_types_annotation"},{"location":"parameters/#template","text":"AsaruSim generates reads that correspond to a 10X Genomics library construction coupled with Nanopore sequencing. The final construction corresponds to : an adaptor sequence composed of 10X and Nanopore adaptors, a cellular barcode (CB), UMI sequences at the same frequencies as in the synthetic count matrix, a 20 bp oligo(dT) , the feature-corresponding cDNA sequence from the reference transcriptome and a template switch oligo (TSO) at the end.","title":"Template"},{"location":"parameters/#-dt_length","text":"Specifies the length of the oligo(dT) tail sequence. Default: 20 bp.","title":"--dT_LENGTH"},{"location":"parameters/#-adapter_seq","text":"Defines the sequence of the adapter used in the ONT and 10X Genomics libraries. 
By default, the 10X 3' solution V3 adapter sequence is used. Default: ACTAAAGGCCATTACGGCCTACACGACGCTCTTCCGATCT .","title":"--ADAPTER_SEQ"},{"location":"parameters/#-tso_seq","text":"Specifies the sequence of the Template Switching Oligo (TSO) nucleotide. Default: TGTACTCTGCGTTGATACCACTGCTT .","title":"--TSO_SEQ"},{"location":"parameters/#reference-transcriptome","text":"The feature-corresponding cDNA sequence is sampled from the reference transcriptome.","title":"Reference transcriptome"},{"location":"parameters/#-transcriptome","text":"A reference transcriptome file in .fasta format can be downloaded from Ensembl .","title":"--transcriptome"},{"location":"parameters/#-length_dist","text":"To mimic the real read length distribution when a gene expression matrix is provided, a realistic read length distribution is achieved by selecting a random cDNA of the corresponding gene, with a prior probability favoring short-length cDNA. AsaruSim estimates this read length using a log-normal distribution . Users may provide their parameters to personalize the distribution using three comma-delimited values (shape, location, scale) with the parameter --length_dist . ( default : 0.37,0.0,825 ) Shape (\u03c3) : The standard deviation of the log values. Location (\u03bc) : The location parameter using the basic form of the log-normal distribution. Scale : The scale factor (the median of your distribution). Fit read distribution of real reads Users may also fit their real reads distribution with this approach by providing a subset of real reads (in .FASTQ format) using the --model_fastq parameter. (See also build model in the Error model section)","title":"--length_dist"},{"location":"parameters/#pcr-amplification","text":"AsaruSim takes into account the bias of PCR amplification introduced during the library construction process. 
The PCR amplification is simulated by replicating the synthetic reads at each cycle, with a capture probability and an error rate.","title":"PCR amplification"},{"location":"parameters/#-pcr_cycles","text":"The number of PCR cycles to simulate. During each cycle, the reads are duplicated exponentially, following the formula: $$\\ N = N_0 \\times (1 + E)^{C} \\ $$ where: N is the final number of reads, N0 is the initial number of reads, E is the efficiency rate, and C is the number of cycles.","title":"--PCR_cycles"},{"location":"parameters/#-pcr_efficiency","text":"The efficiency rate of duplication is fixed by the user ( default : --PCR_efficiency 0.9 )","title":"--PCR_efficiency"},{"location":"parameters/#-pcr_error_rate","text":"The probability to be mutated during the process for each nucleotide in the duplicated read. The error rate is also fixed by the user ( default : --PCR_error_rate 3.5e-05 )","title":"--PCR_error_rate"},{"location":"parameters/#-total_reads","text":"Total number of reads to randomly subsample from the resulting artificial PCR product, to mimic the experimental protocol where only a subset of the sample is used for the sequencing step. Users can use amplification rate instead of PCR amplification Inspired by SLSim, the amplification rate allows users to repeat each template read a specified number of times. This is a simpler way to simulate amplification with: $$\\ x \\sim \\text{Poi}(\\text{amp_rate}) \\ $$ The value of x is set by the user using --amp_rate","title":"--total_reads"},{"location":"parameters/#error-model","text":"AsaruSim uses the Badread Python library to simulate nanopore sequencing errors and assign per-base quality scores based on pre-trained error models (see Badread documentation for more information). To do so, AsaruSim requires:","title":"Error model"},{"location":"parameters/#-trained_model","text":"This allows the user to choose one of the built-in error models within the Badread database. 
The possible values are: nanopore2023 : a model trained on ONT R10.4.1 reads from 2023 (the default). nanopore2020 : a model trained on ONT R9.4.1 reads from 2020. nanopore2018 : a model trained on ONT R9.4/R9.4.1 reads from 2018. random : a random error model with a 1/3 chance each of insertion, deletion, and substitution. a file path for a trained model.","title":"--trained_model"},{"location":"parameters/#-badread_identity","text":"Badread uses the Beta distribution to sample read identities. The distribution is defined with three parameters: mean , standard deviation , and maximum value. To pass these parameters to AsaruSim, use three comma-delimited values (identity mean, max, stdev). default : --badread_identity 95,99,2.5 .","title":"--badread_identity"},{"location":"parameters/#-build_model","text":"To internally train a personalized read identity , Qscore , and error models, AsaruSim requires a real FASTQ read file that can be provided using --model_fastq and a reference genome (.FASTA) file using --ref_genome . AsaruSim also accepts pre-built model files Users can use --error_model and --qscore_model to provide Badread pre-built models in file format.","title":"--build_model"}]}
\ No newline at end of file
diff --git a/docs/sitemap.xml.gz b/docs/sitemap.xml.gz
index 9374eaff789a54344710fd5cd0e96fc4b3130d88..ca57dcf001db1e73a2f083c73ac4e5fcadcb0886 100644
GIT binary patch
delta 13
Ucmb=gXP58h;CK)^VIq4403Ngil>h($

delta 13
Ucmb=gXP58h;CL{rVj_D503WRc!T<mO