diff --git a/.gitignore b/.gitignore index 5e8ac30..43a5fdf 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,9 @@ bin manual/sondovac_manual.brf manual/sondovac_manual.kilepr -manual/sondovac_manual.tex.backup pkgs/linux32b pkgs/linux64b pkgs/macosx src *~ +*.backup diff --git a/.info b/.info index afef5d2..0b78fe7 100644 --- a/.info +++ b/.info @@ -1,2 +1,2 @@ -CURRENTVERSION=0.7 -NEWVERSION=https://github.com/V-Z/sondovac/releases/download/v0.7-alpha/sondovac-0.7-alpha.zip +CURRENTVERSION=0.8 +NEWVERSION=https://github.com/V-Z/sondovac/releases/download/v0.8-alpha/sondovac-0.8-alpha.zip diff --git a/CHANGELOG b/CHANGELOG index 81a3f6f..98c1380 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,6 @@ Sondovač changelog -Version 0.8 alpha release 2015-09-09 +Version 0.8 alpha released 2015-10-09 ================================================================================ * Usage of mitochondrial reference sequence is optional @@ -8,7 +8,7 @@ Version 0.8 alpha release 2015-09-09 * Various fixes and enhancements -Version 0.7 alpha release 2015-09-06 +Version 0.7 alpha released 2015-10-06 ================================================================================ * Fixed reported problems with sed differences among Linux and Mac OS X @@ -16,7 +16,7 @@ Version 0.7 alpha release 2015-09-06 * Various fixes and enhancements -Version 0.6 alpha release 2015-08-10 +Version 0.6 alpha released 2015-08-10 ================================================================================ * Fixed problems with some versions of output of Geneious @@ -24,7 +24,7 @@ Version 0.6 alpha release 2015-08-10 * Various fixes and enhancements -Version 0.5 alpha release 2015-07-24 +Version 0.5 alpha released 2015-07-24 ================================================================================ * First public release, early alpha stage diff --git a/INSTALL b/INSTALL index eb31929..4a33109 100644 --- a/INSTALL +++ b/INSTALL @@ -1,4 +1,4 @@ -Sondovač 0.6 Install instructions 
+Sondovač 0.8 Install instructions Requirements to run Sondovač ================================================================================ diff --git a/LICENSE b/LICENSE index 25f3875..39bfc73 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Sondovač 0.6 Licenses +Sondovač 0.8 Licenses The set of BASH scripts Sondovač is licensed under GNU General Public License version 3. List of licenses of included software is in following table (see diff --git a/README b/README index 774f2e1..0c06a87 100644 --- a/README +++ b/README @@ -1,7 +1,8 @@ -Sondovač 0.6 Basic help +Sondovač 0.8 Basic help -Sondovač is a script to create orthologous low-copy nuclear probes from -transcriptome and genome skim data for target enrichment. +Sondovač (English pronunciation is "Sondovach".) is a script to create +orthologous low-copy nuclear probes from transcriptome and genome skim data for +target enrichment. Script summary @@ -333,8 +334,10 @@ Script sondovac_part_a.sh creates the following files: of plastid origin) 4) *_genome_skim_data_no_cp_reads* - Genome skim data without cpDNA reads 5) *_genome_skim_data_no_cp_no_mt_reads.bam - SAM converted to BAM (removal of - reads of mitochondrial origin) -6) *_genome_skim_data_no_cp_no_mt_reads* - Genome skim data without mtDNA reads + reads of mitochondrial origin) - only if mitochondriome reference + sequence was used +6) *_genome_skim_data_no_cp_no_mt_reads* - Genome skim data without mtDNA + reads - only if mitochondriome reference sequence was used 7) *_combined_reads_co_cp_no_mt_reads* - Combined paired-end genome skim reads 8) *_blat_unique_transcripts_versus_genome_skim_data.pslx - Output of BLAT (matching of the unique transcripts and the filtered, combined genome @@ -364,7 +367,7 @@ Script sondovac_part_b.sh creates following files: 2) *_similarity_test2 - Contigs that comprise exons ≥ bait length and have a certain total locus length (in the Oxalis case ≥600 bp) 3) *_target_enrichment_probe_sequences.fasta - Probes in FASTA 
-4) *.target_enrichment_probe_sequences_final.pslx - Final probes ready for bait +4) *.possible_cp_dna_genes_in_probe_set.pslx - Final probes ready for bait synthesis @@ -409,6 +412,20 @@ Leebens-Mack J and Wong G K-S (2014) Data access for the 1,000 Plants (1KP) project. GigaScience 3:17, http://www.gigasciencejournal.com/content/3/1/17/ +Record output of Sondovač +================================================================================ + +To record the whole output of the Sondovač script (regardless of the parameters +used) use the utility "tee". It will produce plain text output with everything +printed to the screen. It can be useful for reference or exploration if +something went wrong. Use it as follows: + +./sondovac_part_a.sh | tee records.log + +You can use any command line arguments, the script will behave as usual. Plain +text file "records.log" will then contain all its output. + + License ================================================================================ diff --git a/geneious_column_separator.pl b/geneious_column_separator.pl index c1c488b..6d27288 100755 --- a/geneious_column_separator.pl +++ b/geneious_column_separator.pl @@ -9,6 +9,7 @@ # Read provided file name my $GENEIOUSFILE = $ARGV[0]; my $GENEIOUSFILENAME = basename($GENEIOUSFILE, ".tsv"); + # Check if it is readable open(FH, "< $GENEIOUSFILE") || die "Cannot open $GENEIOUSFILE for reading: $!"; my @array = <FH>; diff --git a/manual/sondovac_manual.bib b/manual/sondovac_manual.bib index 705a63d..f002ac0 100644 --- a/manual/sondovac_manual.bib +++ b/manual/sondovac_manual.bib @@ -1,6 +1,6 @@ -@article{Meintjes2012, +@article{Kearse2012, abstract = {Summary: The two main functions of bioinformatics are the organization and analysis of biological data using computational resources. Geneious Basic has been designed to be an easy- to-use and flexible desktop software application framework for the organization and analysis of biological data, with a focus on molecular sequences and related data types.
It integrates numerous industry-standard discovery analysis tools, with interactive visualizations to generate publication-ready images. One key contribution to researchers in the life sciences is the Geneious public application programming interface (API) that affords the ability to leverage the existing framework of the Geneious Basic software platform for virtually unlimited extension and customization. The result is an increase in the speed and quality of development of computation tools for the life sciences, due to the functionality and graphical user interface available to the developer through the public API. Geneious Basic represents an ideal platform for the bioinformatics community to leverage existing components and to integrate their own specific requirements for the discovery, analysis and visualization of biological data. Availability and implementation: Binaries and public API freely available for download at http://www.geneious.com/basic, implemented in Java and supported on Linux, Apple OSX and MS Windows. The software is also available from the Bio-Linux package repository at http://nebc.nerc.ac.uk/news/geneiousonbl. Contact: peter@biomatters.com Received}, -author = {Meintjes, P. 
and Kearse, Matthew and Moir, Richard and Wilson, Amy and Stones-Havas, Steven and Cheung, Matthew and Sturrock, Shane and Buxton, Simon and Cooper, Alex and Markowitz, Sidney and Duran, Chris and Thierer, Tobias and Asthon, Bruce and Meintjes, Peter and Drummond, Alexei J}, +author = {Kearse, Matthew and Moir, Richard and Wilson, Amy and Stones-Havas, Steven and Cheung, Matthew and Sturrock, Shane and Buxton, Simon and Cooper, Alex and Markowitz, Sidney and Duran, Chris and Thierer, Tobias and Asthon, Bruce and Meintjes, Peter and Drummond, Alexei J.}, doi = {10.1093/bioinformatics/bts199}, file = {:home/vojta/dokumenty/clanky/pdf/Kearse\_etal\_2012\_Geneious\_Basic-integrated\_and\_extendable\_desktop\_software\_platform\_for\_organization\_and\_anal\_of\_seq\_data.pdf:pdf}, issn = {1367-4803}, @@ -9,14 +9,14 @@ @biomatters.com number = {12}, pages = {1647--1649}, title = {{Geneious Basic: An integrated and extendable desktop software platform for the organization and analysis of sequence data}}, -url = {http://bioinformatics.oxfordjournals.org/cgi/doi/10.1093/bioinformatics/bts199}, +url = {http://bioinformatics.oxfordjournals.org/content/28/12/1647}, volume = {28}, year = {2012} } @article{Kent2002, abstract = {Analyzing vertebrate genomes requires rapid mRNA/DNA and cross-species protein alignments. A new tool, BLAT, is more accurate and 500 times faster than popular existing tools for mRNA/DNA alignments and 50 times faster for protein alignments at sensitivity settings typically used when comparing vertebrate sequences. BLAT’s speed stems from an index of all nonoverlapping K-mers in the genome. This index fits inside the RAM of inexpensive computers, and need only be computed once for each genome assembly. BLAT has several major stages. It uses the index to find regions in the genome likely to be homologous to the query sequence. It performs an alignment between homologous regions. 
It stitches together these aligned regions (often exons) into larger alignments (typically genes). Finally, BLAT revisits small internal exons possibly missed at the first stage and adjusts large gap boundaries that have canonical splice sites where feasible. This paper describes how BLAT was optimized. Effects on speed and sensitivity are explored for various K-mer sizes, mismatch schemes, and number of required index matches. BLAT is compared with other alignment programs on various test sets and then used in several genome-wide applications. http://genome.ucsc.edu hosts a web-based BLAT server for the human genome.}, -author = {Kent, W James}, +author = {Kent, W. James}, doi = {10.1101/gr.229202.}, file = {:home/vojta/dokumenty/clanky/pdf/Kent\_2002\_BLAT-BLAST-Like\_Alignment\_Tool.pdf:pdf}, issn = {1088-9051}, @@ -30,7 +30,8 @@ @article{Kent2002 @article{Langmead2012, abstract = {As the rate of sequencing increases, greater throughput is demanded from read aligners. The full-text minute index is often used to make alignment very fast and memory-efficient, but the approach is ill-suited to finding longer, gapped alignments. Bowtie 2 combines the strengths of the full-text minute index with the flexibility and speed of hardware-accelerated dynamic programming algorithms to achieve a combination of high speed, sensitivity and accuracy.}, -author = {Langmead, Ben and Salzberg, Steven L}, +author = {Langmead, Ben and Salzberg, Steven L.}, +doi={10.1038/nmeth.1923}, issn = {1548-7091}, journal = {Nature Methods}, month = apr, @@ -38,7 +39,7 @@ @article{Langmead2012 pages = {357--359}, publisher = {Nature Publishing Group, a division of Macmillan Publishers Limited. 
All Rights Reserved.}, title = {{Fast gapped-read alignment with Bowtie 2}}, -url = {http://dx.doi.org/10.1038/nmeth.1923 10.1038/nmeth.1923 http://www.nature.com/nmeth/journal/v9/n4/abs/nmeth.1923.html\#supplementary-information}, +url = {http://www.nature.com/nmeth/journal/v9/n4/abs/nmeth.1923.html}, volume = {9}, year = {2012} } @@ -111,7 +112,7 @@ @article{Fu2012 @article{Huang2010, abstract = {Summary: CD-HIT is a widely used program for clustering and comparing large biological sequence datasets. In order to further assist the CD-HIT users, we significantly improved this program with more functions and better accuracy, scalability and flexibility. Most importantly, we developed a new web server, CD-HIT Suite, for clustering a user-uploaded sequence dataset or comparing it to another dataset at different identity levels. Users can now interactively explore the clusters within web browsers. We also provide downloadable clusters for several public databases (NCBI NR, Swissprot and PDB) at different identity levels. Availability: Free access at http://cd-hit.org Contact: liwz@sdsc.edu Supplementary information: Supplementary data are available at Bioinformatics online. Received}, -author = {Huang, Y. and Niu, B. and Gao, Y. and Fu, L. 
and Li, W.}, +author = {Huang, Ying and Niu, Beifang and Gao, Ying and Fu, Limin and Li, Weizhong}, doi = {10.1093/bioinformatics/btq003}, file = {:home/vojta/dokumenty/clanky/pdf/Huang\_etal\_2010\_CD-HIT\_Suite-web\_server\_for\_clustering\_and\_comparing\_biological\_sequences.pdf:pdf}, issn = {1367-4803}, @@ -119,7 +120,7 @@ @sdsc.edu number = {5}, pages = {680--682}, title = {{CD-HIT Suite: a web server for clustering and comparing biological sequences}}, -url = {http://bioinformatics.oxfordjournals.org/cgi/doi/10.1093/bioinformatics/btq003}, +url = {http://bioinformatics.oxfordjournals.org/content/26/5/680}, volume = {26}, year = {2010} } @@ -134,14 +135,15 @@ @article{Niu2010 journal = {BMC bioinformatics}, pages = {187}, pmid = {20388221}, -title = {{Artificial and natural duplicates in pyrosequencing reads of metagenomic data.}}, +title = {{Artificial and natural duplicates in pyrosequencing reads of metagenomic data}}, +url = {http://www.biomedcentral.com/1471-2105/11/187/}, volume = {11}, year = {2010} } @article{Li2012b, abstract = {The rapid advances of high-throughput sequencing technologies dramatically prompted metagenomic studies of microbial communities that exist at various environments. Fundamental questions in metagenomics include the identities, composition and dynamics of microbial populations and their functions and interactions. However, the massive quantity and the comprehensive complexity of these sequence data pose tremendous challenges in data ana- lysis. These challenges include but are not limited to ever-increasing computational demand, biased sequence sam- pling, sequence errors, sequence artifacts and novel sequences. Sequence clustering methods can directly answer many of the fundamental questions by grouping similar sequences into families. In addition, clustering analysis also addresses the challenges in metagenomics. 
Thus, a large redundant data set can be represented with a small non-redundant set, where each cluster can be represented by a single entry or a consensus. Artifacts can be rapidly detected through clustering. Errors can be identified, filtered or corrected by using consensus from sequences within clusters.}, -author = {Li, W. and Fu, L. and Niu, B. and Wu, S. and Wooley, J.}, +author = {Li, Weizhong and Fu, Limin and Niu, Beifang and Wu, Sitao and Wooley, John}, doi = {10.1093/bib/bbs035}, file = {:home/vojta/dokumenty/clanky/pdf/Li\_etal\_2012\_Ultrafast\_clustering\_algorithms\_for\_metagenomic\_sequence\_analysis.pdf:pdf}, issn = {1467-5463}, @@ -151,7 +153,7 @@ @article{Li2012b number = {6}, pages = {656--668}, title = {{Ultrafast clustering algorithms for metagenomic sequence analysis}}, -url = {http://bib.oxfordjournals.org/cgi/doi/10.1093/bib/bbs035}, +url = {http://bib.oxfordjournals.org/content/13/6/656}, volume = {13}, year = {2012} } @@ -166,7 +168,7 @@ @misc{Gordon2010 @article{Magoc2011, abstract = {Motivation: Next-generation sequencing technologies generate very large numbers of short reads. Even with very deep genome coverage, short read lengths cause problems in de novo assemblies. The use of paired-end libraries with a fragment size shorter than twice the read length provides an opportunity to generate much longer reads by overlapping and merging read pairs before assembling a genome. Results: We present FLASH, a fast computational tool to extend the length of short reads by overlapping paired-end reads from fragment libraries that are sufficiently short. We tested the correctness of the tool on one million simulated read pairs, and we then applied it as a pre-processor for genome assemblies of Illumina reads from the bacterium Staphylococcus aureus and human chromosome 14. FLASH correctly extended and merged reads >99\% of the time on simulated reads with an error rate of <1\%. 
With adequately set parameters, FLASH correctly merged reads over 90\% of the time even when the reads contained up to 5\% errors. When FLASH was used to extend reads prior to assembly, the resulting assemblies had substantially greater N50 lengths for both contigs and scaffolds. Availability and Implementation: The FLASH system is implemented in C and is freely available as open-source code at http://www.cbcb.umd.edu/software/flash. Contact: t.magoc@gmail.com}, -author = {Magoc, T. and Salzberg, S. L.}, +author = {Magoč, Tanja and Salzberg, Steven L.}, doi = {10.1093/bioinformatics/btr507}, file = {:home/vojta/dokumenty/clanky/pdf/Magoc\_et\_Salzberg\_2011\_FLASH-fast\_length\_adjustment\_of\_short\_reads\_to\_improve\_genome\_assemblies.pdf:pdf}, issn = {1367-4803}, @@ -174,7 +176,7 @@ @gmail.com number = {21}, pages = {2957--2963}, title = {{FLASH: fast length adjustment of short reads to improve genome assemblies}}, -url = {http://bioinformatics.oxfordjournals.org/cgi/doi/10.1093/bioinformatics/btr507}, +url = {http://bioinformatics.oxfordjournals.org/content/27/21/2957}, volume = {27}, year = {2011} } @@ -191,14 +193,14 @@ @article{Li2009 pages = {2078--2079}, pmid = {19505943}, title = {{The Sequence Alignment/Map format and SAMtools}}, -url = {http://bioinformatics.oxfordjournals.org/cgi/content/full/25/16/2078}, +url = {http://bioinformatics.oxfordjournals.org/content/25/16/2078.full}, volume = {25}, year = {2009} } @article{Li2011, -abstract = {Motivation: Most existing methods for DNA sequence analysis rely on accurate sequences or genotypes. However, in applications of the next-generation sequencing (NGS), accurate genotypes may not be easily obtained (e.g. multi-sample low-coverage sequencing or somatic mutation discovery). These applications press for the development of new methods for analyzing sequence data with uncertainty. 
Results: We present a statistical framework for calling SNPs, discovering somatic mutations, inferring population genetical parameters and performing association tests directly based on sequencing data without explicit genotyping or linkage-based imputation. On real data, we demonstrate that our method achieves comparable accuracy to alternative methods for estimating site allele count, for inferring allele frequency spectrum and for association mapping. We also highlight the necessity of using symmetric datasets for finding somatic mutations and confirm that for discovering rare events, mismapping is frequently the leading source of errors. Availability: http://samtools.sourceforge.net Contact: hengli@broadinstitute.org}, -author = {Li, H.}, +abstract = {Motivation: Most existing methods for DNA sequence analysis rely on accurate sequences or genotypes. However, in applications of the next-generation sequencing (NGS), accurate genotypes may not be easily obtained (e.g. multi-sample low-coverage sequencing or somatic mutation discovery). These applications press for the development of new methods for analyzing sequence data with uncertainty. Results: We present a statistical framework for calling SNPs, discovering somatic mutations, inferring population genetical parameters and performing association tests directly based on sequencing data without explicit genotyping or linkage-based imputation. On real data, we demonstrate that our method achieves comparable accuracy to alternative methods for estimating site allele count, for inferring allele frequency spectrum and for association mapping. 
We also highlight the necessity of using symmetric datasets for finding somatic mutations and confirm that for discovering rare events, mismapping is frequently the leading source of errors.}, +author = {Li, Heng}, doi = {10.1093/bioinformatics/btr509}, file = {:home/vojta/dokumenty/clanky/pdf/Li\_2009\_statistical\_framework\_for\_SNP\_calling\_mutation\_discovery\_association\_mapping\_and\_population\_genetical\_parameter\_estimation\_from\_sequencing\_data.pdf:pdf}, issn = {1367-4803}, @@ -206,14 +208,14 @@ @broadinstitute.org number = {21}, pages = {2987--2993}, title = {{A statistical framework for SNP calling, mutation discovery, association mapping and population genetical parameter estimation from sequencing data}}, -url = {http://bioinformatics.oxfordjournals.org/cgi/doi/10.1093/bioinformatics/btr509}, +url = {http://bioinformatics.oxfordjournals.org/content/27/21/2987}, volume = {27}, year = {2011} } @article{Li2011a, abstract = {Summary: I propose a new application of profile Hidden Markov Models in the area of SNP discovery from resequencing data, to greatly reduce false SNP calls caused by misalignments around insertions and deletions (indels). The central concept is per-Base Alignment Quality, which accurately measures the probability of a read base being wrongly aligned. The effectiveness of BAQ has been positively confirmed on large datasets by the 1000 Genomes Project analysis subgroup. 
Availability: http://samtools.sourceforge.net Contact: hengli@broadinstitute.org}, -author = {Li, H.}, +author = {Li, Heng}, doi = {10.1093/bioinformatics/btr076}, file = {:home/vojta/dokumenty/clanky/pdf/Li\_2011\_Improving\_SNP\_discovery\_by\_base\_alignment\_quality.pdf:pdf}, issn = {1367-4803}, @@ -221,14 +223,14 @@ @broadinstitute.org number = {8}, pages = {1157--1158}, title = {{Improving SNP discovery by base alignment quality}}, -url = {http://bioinformatics.oxfordjournals.org/cgi/doi/10.1093/bioinformatics/btr076}, +url = {http://bioinformatics.oxfordjournals.org/content/27/8/1157}, volume = {27}, year = {2011} } @article{Straub2012, abstract = {Premise of the study: Just as Sanger sequencing did more than 20 years ago, next-generation sequencing (NGS) is poised to revolutionize plant systematics. By combining multiplexing approaches with NGS throughput, systematists may no longer need to choose between more taxa or more characters. Here we describe a genome skimming (shallow sequencing) approach for plant systematics. • Methods: Through simulations, we evaluated optimal sequencing depth and performance of single-end and paired-end short read sequences for assembly of nuclear ribosomal DNA (rDNA) and plastomes and addressed the effect of divergence on reference-guided plastome assembly. We also used simulations to identify potential phylogenetic markers from low-copy nuclear loci at different sequencing depths. We demonstrated the utility of genome skimming through phylogenetic analysis of the Sonoran Desert clade (SDC) of Asclepias (Apocynaceae). • Key results: Paired-end reads performed better than single-end reads. Minimum sequencing depths for high quality rDNA and plastome assemblies were 40 × and 30 × , respectively. Divergence from the reference signifi cantly affected plastome assembly, but relatively similar references are available for most seed plants. Deeper rDNA sequencing is necessary to characterize intragenomic polymorphism. 
The low-copy fraction of the nuclear genome was readily surveyed, even at low sequencing depths. Nearly 160 000 bp of sequence from three organelles provided evidence of phylogenetic incongruence in the SDC. • Conclusions: Adoption of NGS will facilitate progress in plant systematics, as whole plastome and rDNA cistrons, partial mitochondrial genomes, and low-copy nuclear markers can now be effi ciently obtained for molecular phylogenetics studies.}, -author = {Straub, Shannon C K and Parks, Matthew and Weitemier, Kevin and Fishbein, Mark and Cronn, Richard C and Liston, Aaron}, +author = {Straub, Shannon C. K. and Parks, Matthew and Weitemier, Kevin and Fishbein, Mark and Cronn, Richard C. and Liston, Aaron}, doi = {10.3732/ajb.1100335}, file = {:home/vojta/dokumenty/clanky/pdf/Straub\_etal\_2012\_Navigating\_tip\_of\_genomic\_iceberg-next-gen\_seq\_for\_plant\_syst.pdf:pdf}, issn = {1537-2197}, @@ -247,7 +249,7 @@ @article{Straub2012 @article{Matasci2014, abstract = {The 1,000 plants (1KP) project is an international multi-disciplinary consortium that has generated transcriptome data from over 1,000 plant species, with exemplars for all of the major lineages across the Viridiplantae (green plants) clade. Here, we describe how to access the data used in a phylogenomics analysis of the first 85 species, and how to visualize our gene and species trees. Users can develop computational pipelines to analyse these data, in conjunction with data of their own that they can upload. Computationally estimated protein-protein interactions and biochemical pathways can be visualized at another site. 
Finally, we comment on our future plans and how they fit within this scalable system for the dissemination, visualization, and analysis of large multi-species data sets.}, -author = {Matasci, Naim and Hung, Ling-Hong and Yan, Zhixiang and Carpenter, Eric J and Wickett, Norman J and Mirarab, Siavash and Nguyen, Nam and Warnow, Tandy and Ayyampalayam, Saravanaraj and Barker, Michael and Burleigh, J and Gitzendanner, Matthew a and Wafula, Eric and Der, Joshua P and DePamphilis, Claude W and Roure, B\'{e}atrice and Philippe, Herv\'{e} and Ruhfel, Brad R and Miles, Nicholas W and Graham, Sean W and Mathews, Sarah and Surek, Barbara and Melkonian, Michael and Soltis, Douglas E and Soltis, Pamela S and Rothfels, Carl and Pokorny, Lisa and Shaw, Jonathan a and DeGironimo, Lisa and Stevenson, Dennis W and Villarreal, Juan and Chen, Tao and Kutchan, Toni M and Rolf, Megan and Baucom, Regina S and Deyholos, Michael K and Samudrala, Ram and Tian, Zhijian and Wu, Xiaolei and Sun, Xiao and Zhang, Yong and Wang, Jun and Leebens-Mack, Jim and Wong, Gane Ka-Shu}, +author = {Matasci, Naim and Hung, Ling-Hong and Yan, Zhixiang and Carpenter, Eric J. and Wickett, Norman J. and Mirarab, Siavash and Nguyen, Nam and Warnow, Tandy and Ayyampalayam, Saravanaraj and Barker, Michael and Burleigh, J. and Gitzendanner, Matthew A. and Wafula, Eric and Der, Joshua P. and DePamphilis, Claude W. and Roure, B\'{e}atrice and Philippe, Herv\'{e} and Ruhfel, Brad R. and Miles, Nicholas W. and Graham, Sean W. and Mathews, Sarah and Surek, Barbara and Melkonian, Michael and Soltis, Douglas E. and Soltis, Pamela S. and Rothfels, Carl and Pokorny, Lisa and Shaw, Jonathan A. and DeGironimo, Lisa and Stevenson, Dennis W. and Villarreal, Juan and Chen, Tao and Kutchan, Toni M. and Rolf, Megan and Baucom, Regina S. and Deyholos, Michael K. 
and Samudrala, Ram and Tian, Zhijian and Wu, Xiaolei and Sun, Xiao and Zhang, Yong and Wang, Jun and Leebens-Mack, Jim and Wong, Gane Ka-Shu}, doi = {10.1186/2047-217X-3-17}, file = {:home/vojta/dokumenty/clanky/pdf/Matasci\_etal\_2014\_Data\_access\_for\_1000\_Plants-1KP-project.pdf:pdf}, issn = {2047-217X}, diff --git a/manual/sondovac_manual.pdf b/manual/sondovac_manual.pdf index 10e9821..fabefce 100644 Binary files a/manual/sondovac_manual.pdf and b/manual/sondovac_manual.pdf differ diff --git a/manual/sondovac_manual.tex b/manual/sondovac_manual.tex index 82e22c6..5c6577f 100644 --- a/manual/sondovac_manual.tex +++ b/manual/sondovac_manual.tex @@ -4,20 +4,13 @@ % * Geneious screenshots % * More BASH examples (installation of packages in Linux), some screenshot % * Help for Mac OS X - install build tools, homebrew -% * Final formatting \usepackage{xltxtra} \usepackage{xunicode} \usepackage{polyglossia} % Modify page margins -\addtolength{\hoffset}{-0.5cm} % Enlarge page to left -\setlength{\evensidemargin}{0cm} % Move left margin of even pages left -\setlength{\oddsidemargin}{1cm} % Move right margin of odd pages right -\setlength{\marginparwidth}{2cm} % Left margin -\setlength{\textwidth}{16cm} % Text width (width of A4 is 21 cm - maximal width of graphics) -\addtolength{\voffset}{-1cm} % Move text up on a page -\setlength{\textheight}{23cm} % Text height (height of A4 is 29.7 cm - maximal height of graphics) +\usepackage[a4paper, text={160mm, 250mm}]{geometry} % Better captions of tables and figures \usepackage[small, bf]{caption} @@ -26,7 +19,9 @@ \usepackage{enumitem} % Better bibliography -\usepackage{natbib} +\usepackage[round, semicolon, authoryear]{natbib} +% \citet{} > Author (year); \citep{} > (Author, year); natbib: \citet/p[text][p.~1]{} > text Author year, p. 
1; with * lists all authors +\bibliographystyle{plainnat} % Various logos \usepackage{dtklogos} @@ -40,15 +35,19 @@ \setdefaultlanguage{english} % Opening -\title{Manual for Sondovač 0.7} +\title{Manual for Sondovač 0.8} \author{Roswitha Schmickl, Aaron Liston, Vojtěch Zeisek and others} +% Allow line breaks within URLs +\usepackage[hyphens]{url} + +% Clickable navigation within document \usepackage[ bookmarks=true, unicode=true, colorlinks=true, pagebackref=true, - pdftitle={Sondovac manual 0.7}, + pdftitle={Sondovac manual 0.8}, plainpages=false, pdfauthor={Roswitha Schmickl, Aaron Liston, Vojtech Zeisek and others}, pdfsubject={Software manual}, @@ -67,19 +66,21 @@ % Title page \maketitle -Sondovač is a script to create orthologous low-copy nuclear probes from transcriptome and genome skim data for target enrichment. +Sondovač\footnote{English pronunciation is "Sondovach".} is a script to create orthologous low-copy nuclear probes from transcriptome and genome skim data for target enrichment. + +\begin{abstract} +Phylogenetics benefits from using a large number of putatively independent nuclear loci and the combination with other datasets such as the plastid and mitochondrial genome. Selecting such orthologous low-copy nuclear (LCN) loci is still a challenge for non-model organisms. Meanwhile it is common to select LCN genes for phylogenies based on a comparison of transcriptomes, but automated bioinformatic pipelines for the selection of those genes are largely absent. We created a user-friendly, automated and interactive script named Sondovač to LCN loci by a comparison between transcriptome and genome skim data. The script is licensed under open-source license GPL v.3 allowing further modifications. The script runs on major Linux distributions and Mac OS X. Strong bioinformatic skills and access to high-performance computer clusters are not required; it runs on a standard desktop computer equipped with modern CPU like Intel i5 or i7 within several hours. 
+\end{abstract} + +% Contents \tableofcontents \listoffigures \listoftables \vskip 1cm -% Document body starts +% Document main text starts -\framebox{\parbox{15cm}{Sondovač is a script to create orthologous low-copy nuclear probes from transcriptome and genome skim data for target enrichment. For information and download see \href{https://github.com/V-Z/sondovac/wiki}{https://github.com/V-Z/sondovac/wiki}. For paper introducing Sondovač see Schmickl et al.}} - -\begin{abstract} -Phylogenetics benefits from using a large number of putatively independent nuclear loci and the combination with other datasets such as the plastid and mitochondrial genome. Selecting such orthologous low-copy nuclear (LCN) loci is still a challenge for non-model organisms. Meanwhile it is common to select LCN genes for phylogenies based on a comparison of transcriptomes, but automated bioinformatic pipelines for the selection of those genes are largely absent. We created a user-friendly, automated and interactive script named Sondovač to LCN loci by a comparison between transcriptome and genome skim data. The script is licensed under open-source license GPL v.3 allowing further modifications. The script runs on major Linux distributions and Mac OS X. Strong bioinformatic skills and access to high-performance computer clusters are not required; it runs on a standard desktop computer equipped with modern CPU like Intel i5 or i7 within several hours. -\end{abstract} +\framebox{\parbox{15cm}{Sondovač is a script to create orthologous low-copy nuclear probes from transcriptome and genome skim data for target enrichment. 
For information and download see \href{https://github.com/V-Z/sondovac/wiki}{https://github.com/V-Z/sondovac/wiki}.}} \section{Introduction} @@ -99,7 +100,7 @@ \subsection{Pipeline -- how the data are processed} \begin{figure}[p] \begin{center} -\includegraphics[width=15cm]{pipeline_workflow_grayscale.png} +\includegraphics[width=14cm]{pipeline_workflow_grayscale.png} \end{center} \caption[Workflow of the probe design script Sondovač]{Workflow of the probe design script Sondovač. An overview on the main steps of Hyb-Seq are given in the top part of the figure; probe design is the first one. Each step of Sondovač is numbered and illustrated by three boxes each: Software is highlighted in dark gray, a summary of each step is given in medium gray, and input/output of each step is depicted in light gray. Optional removal of reads of mitochondrial origin from the genome skim data is marked by decoloration of the text. The required input files of Sondovač are highlighted in bold. The direction of the workflow is indicated by arrows.} \label{pipeline-workflow} @@ -274,9 +275,9 @@ \subsection{Help for usage of terminal} \subsection{Geneious} \label{geneious} -For the part (2) user must have Geneious \citep{Meintjes2012}. Geneious is a DNA alignment, assembly, and analysis software and one of the most common software platforms used in genomics. It is utilized for de novo assembly in Sondovač. We plan to replace it by some free open-source command line tool in some future release of Sondovač. Visit \href{http://www.geneious.com/}{http://www.geneious.com/} for download, purchase, installation and usage of Geneious. It is very feature-rich application. The input data are processed (interactively or not) by \texttt{sondovac$\_$part$\_$a.sh} and then user must process its output manually by Geneious according instructions below. Output of Geneious is then processed by \texttt{sondovac$\_$part$\_$b.sh}, which produce final probe set. Geneious was tested with versions 6, 7 and 8. 
+For part (2) the user must have Geneious \citep{Kearse2012}. Geneious is a DNA alignment, assembly, and analysis software and one of the most common software platforms used in genomics. It is utilized for de novo assembly in Sondovač. We plan to replace it by some free open-source command line tool in some future release of Sondovač. Visit \href{http://www.geneious.com/}{http://www.geneious.com/} for download, purchase, installation and usage of Geneious. It is a very feature-rich application. The input data are processed (interactively or not) by \texttt{sondovac$\_$part$\_$a.sh} and then the user must process its output manually by Geneious according to the instructions below. Output of Geneious is then processed by \texttt{sondovac$\_$part$\_$b.sh}, which produces the final probe set. Geneious was tested with versions 6, 7 and 8. -Import output file of part A of the script (\texttt{sondovac$\_$part$\_$a.sh}): go to menu \textbf{File | Import | From File\ldots} This file is named as:\\YourInputFile$\_$blat$\_$unique$\_$transcripts$\_$versus$\_$genome$\_$skim$\_$data-no$\_$missing$\_$fin.fsa +Import the output file of part A of the script (\texttt{sondovac$\_$part$\_$a.sh}): go to menu \textbf{File | Import | From File\ldots} This file is named as: \texttt{YourInputFile$\_$\allowbreak blat$\_$\allowbreak unique$\_$\allowbreak transcripts$\_$\allowbreak versus$\_$\allowbreak genome$\_$\allowbreak skim$\_$\allowbreak data-no$\_$\allowbreak missing$\_$\allowbreak fin.fsa} Select the file and go to menu \textbf{Tools | Align / Assemble | De Novo Assemble}. In \textbf{Data} frame select \textbf{Assembe by 1st (\ldots) Underscore}. In \textbf{Method} frame select \textbf{Geneious Assembler} (if you don't have other assemblers, this option might be missing) and \textbf{Medium Sensitivity / Fast Sensitivity} @@ -312,7 +313,7 @@ \subsection{Software used by Sondovač} libpng & 1.6.X & \href{http://www.libpng.org/}{http://www.libpng.org/}\\ Picard & v. 
> 1.137 & \href{https://broadinstitute.github.io/picard/}{https://broadinstitute.github.io/picard/}\\ SAMtools, htsjdk & 1.2 & \href{http://www.htslib.org/}{http://www.htslib.org/}\\ -Sondovač & 0.7 & \href{https://github.com/V-Z/sondovac/wiki}{https://github.com/V-Z/sondovac/wiki}\\ +Sondovač & 0.8 & \href{https://github.com/V-Z/sondovac/wiki}{https://github.com/V-Z/sondovac/wiki}\\ zlib & 1.2.8 & \href{http://zlib.net/}{http://zlib.net/} \end{tabular} \label{software-links} @@ -336,29 +337,31 @@ \subsection{Software used by Sondovač} \item BLAT \end{itemize} +Papers describing the software used by Sondovač: + \begin{description} - \item[BLAT] \citet{Kent2002}: BLAT -- the BLAST-like alignment tool, \href{http://genome.cshlp.org/content/12/4/656.short}{http://genome.cshlp.org/content/12/4/656.short} - \item[Bowtie2] \citet{Langmead2012}: Fast gapped-read alignment with Bowtie 2, \href{http://www.nature.com/nmeth/journal/v9/n4/full/nmeth.1923.html}{http://www.nature.com/nmeth/journal/v9/n4/full/nmeth.1923.html} + \item[BLAT] \citet{Kent2002}: BLAT -- the BLAST-like alignment tool. + \item[Bowtie2] \citet{Langmead2012}: Fast gapped-read alignment with Bowtie 2. 
\item[CD-HIT] There are several papers describing CD-HIT: \begin{itemize} - \item \citet{Li2001}: Clustering of highly homologous sequences to reduce the size of large protein databases, \href{http://bioinformatics.oxfordjournals.org/content/17/3/282.short}{http://bioinformatics.oxfordjournals.org/content/17/3/282.short} - \item \citet{Li2002}: Tolerating some redundancy significantly speeds up clustering of large protein databases, \href{http://bioinformatics.oxfordjournals.org/content/18/1/77.short}{http://bioinformatics.oxfordjournals.org/content/18/1/77.short} - \item \citet{Li2006}: Cd-hit: a fast program for clustering and comparing large sets of protein or nucleotide sequences, \href{http://bioinformatics.oxfordjournals.org/content/22/13/1658.short}{http://bioinformatics.oxfordjournals.org/content/22/13/1658.short} - \item \citet{Fu2012}: CD-HIT: accelerated for clustering the next generation sequencing data, \href{http://bioinformatics.oxfordjournals.org/content/28/23/3150.short}{http://bioinformatics.oxfordjournals.org/content/28/23/3150.short} - \item \citet{Huang2010}: CD-HIT Suite: a web server for clustering and comparing biological sequences, \href{http://bioinformatics.oxfordjournals.org/content/26/5/680.short}{http://bioinformatics.oxfordjournals.org/content/26/5/680.short} - \item \citet{Niu2010}: Artificial and natural duplicates in pyrosequencing reads of metagenomic data, \href{http://www.biomedcentral.com/1471-2105/11/187}{http://www.biomedcentral.com/1471-2105/11/187} - \item \citet{Li2012b}: Ultrafast clustering algorithms for metagenomic sequence analysis, \href{http://bib.oxfordjournals.org/content/13/6/656.abstract}{http://bib.oxfordjournals.org/content/13/6/656.abstract} + \item \citet{Li2001}: Clustering of highly homologous sequences to reduce the size of large protein databases. + \item \citet{Li2002}: Tolerating some redundancy significantly speeds up clustering of large protein databases. 
+ \item \citet{Li2006}: Cd-hit: a fast program for clustering and comparing large sets of protein or nucleotide sequences. + \item \citet{Fu2012}: CD-HIT: accelerated for clustering the next generation sequencing data. + \item \citet{Huang2010}: CD-HIT Suite: a web server for clustering and comparing biological sequences. + \item \citet{Niu2010}: Artificial and natural duplicates in pyrosequencing reads of metagenomic data. + \item \citet{Li2012b}: Ultrafast clustering algorithms for metagenomic sequence analysis. \end{itemize} - \item[FASTX toolkit] \citet{Gordon2010}: FASTX-Toolkit. FASTQ/A short-reads pre-processing tools, \href{http://hannonlab.cshl.edu/fastx_toolkit/}{http://hannonlab.cshl.edu/fastx$\_$toolkit/} - \item[FLASH] \citet{Magoc2011}: FLASH: fast length adjustment of short reads to improve genome assemblies, \href{http://bioinformatics.oxfordjournals.org/content/27/21/2957.abstract}{http://bioinformatics.oxfordjournals.org/content/27/21/2957.abstract} - \item[Geneious] \citet{Meintjes2012}: Geneious Basic: An integrated and extendable desktop software platform for the organization and analysis of sequence data, \href{http://bioinformatics.oxfordjournals.org/cgi/doi/10.1093/bioinformatics/bts199}{http://bioinformatics.oxfordjournals.org/cgi/doi/10.1093/bioinformatics/bts199} + \item[FASTX toolkit] \citet{Gordon2010}: FASTX-Toolkit. FASTQ/A short-reads pre-processing tools. + \item[FLASH] \citet{Magoc2011}: FLASH: fast length adjustment of short reads to improve genome assemblies. + \item[Geneious] \citet{Kearse2012}: Geneious Basic: An integrated and extendable desktop software platform for the organization and analysis of sequence data. 
\item[SAMtools] There are several papers describing SAMtools: \begin{itemize} - \item \citet{Li2009}: The Sequence alignment/map (SAM) format and SAMtools, \href{http://bioinformatics.oxfordjournals.org/content/25/16/2078.abstract}{http://bioinformatics.oxfordjournals.org/content/25/16/2078.abstract} - \item \citet{Li2011}: A statistical framework for SNP calling, mutation discovery, association mapping and population genetical parameter estimation from sequencing data, \href{http://bioinformatics.oxfordjournals.org/content/27/21/2987.abstract}{http://bioinformatics.oxfordjournals.org/content/27/21/2987.abstract} - \item \citet{Li2011a}: Improving SNP discovery by base alignment quality, \href{http://bioinformatics.oxfordjournals.org/content/27/8/1157.short}{http://bioinformatics.oxfordjournals.org/content/27/8/1157.short} + \item \citet{Li2009}: The Sequence alignment/map (SAM) format and SAMtools. + \item \citet{Li2011}: A statistical framework for SNP calling, mutation discovery, association mapping and population genetical parameter estimation from sequencing data. + \item \citet{Li2011a}: Improving SNP discovery by base alignment quality. \end{itemize} - \item[Sondovač] Schmickl et al + \item[Sondovač] XXX \end{description} \subsection{Vocabulary} @@ -366,20 +369,20 @@ \subsection{Vocabulary} \begin{description} \item[Binary] An application in form understandable by the computer, but usually not transferable among operating systems and/or hardware platforms. Binaries in Windows usually have extension *.exe, in UNIX there use to be no extension. \item[BASH] "The command line" -- fully featured programming scripting language accessible through terminal of any UNIX-based operating system (any Linux, Mac OS X, Solaris, any variant of BSD and more). BASH scripts usually have extension *.sh. -\item[BSD] Group of popular UNIX-based operating systems. 
See \href{https://en.wikipedia.org/wiki/Berkeley_Software_Distribution}{https://en.wikipedia.org/wiki/Berkeley$\_$Software$\_$Distribution}. -\item[compilation] "Translation" of software application from the source code (text readable by human programmer) into binary form launchable by the computer. It requires special tools, and it usually must be done for every operating system and hardware platform. +\item[BSD] Group of popular UNIX-based operating systems. See \href{https://en.wikipedia.org/wiki/Berkeley_Software_Distribution}{https://en.wikipedia.org/wiki/\\Berkeley$\_$Software$\_$Distribution}. +\item[Compilation] "Translation" of software application from the source code (text readable by human programmer) into binary form launchable by the computer. It requires special tools, and it usually must be done for every operating system and hardware platform. \item[Console] See "Shell". -\item[GNU] Major project providing free software widely used in many operating systems, see \href{https://gnu.org/}{https://gnu.org/}. +\item[GNU] Major project providing free software widely used in many operating systems, see \href{https://gnu.org/}{https://\\gnu.org/}. \item[Library] Pack of software tools and functions used by another applications. \item[Linux] One of the most common variants of UNIX-based operating systems. Linux kernel is used by many developers, so that there are plenty of Linux distributions ("flavors") from various sources (e.g. Ubuntu and derivatives, openSUSE, SLE, Debian, Linux Mint, Fedora, Centos, RedHat etc.). They share many features although on the first look they can look differently. See \href{https://en.wikipedia.org/wiki/Linux}{https://en.wikipedia.org/wiki/Linux}. \item[Mac OS X] Popular operating system produced by Apple. The system kernel is based on UNIX, see \href{https://www.apple.com/osx/}{https://www.apple.com/osx/}. \item[openSUSE] Popular Linux distribution, see \href{https://www.opensuse.org/}{https://www.opensuse.org/}. 
\item[Operating system] Basic system running on your computer -- typically MS Windows (not supported by Sondovač, although it might be working), Mac OS X or some Linux distribution (Ubuntu and derivatives, openSUSE, SLE, Debian, Linux Mint, Fedora, Centos, RedHat etc.). -\item[Parameter(s)] Option(s) passed to any function/command line application to modify its usage. Some can be required, some are optional and some can be used only in particular cases. In case of shell applications, parameters are usually given in way like "application -X", "application -\-parameter", "application -Param SomeValue" and so on. See manual for particular application (e.g. "man application"), in case of Sondovač see page~\pageref{script-usage}. +\item[Parameter(s)] Option(s) passed to any function/command line application to modify its usage. Some can be required, some are optional and some can be used only in particular cases. In case of shell applications, parameters are usually given in way like "application -X", "application -parameter", "application -Param SomeValue" and so on. See manual for particular application (e.g. "man application"), in case of Sondovač see page~\pageref{script-usage}. \item[PATH] Directories in the computer where the system is looking for installed software (in a UNIX-based system you can view it by the command "\texttt{echo \$PATH}"). If you need to modify it manually, see the documentation for your operating system. \item[Script] Software application. It requires an interpreter (application installed on the computer that is able to launch scripts written in a particular language), but the application itself is portable among operating systems and hardware architectures, and it is written in plain text, so that developers can easily modify it. Common examples are Python, Perl or BASH. \item[Shell] "The command line" -- the interface to interact with software using commands typed into the terminal window (See Figure~\ref{terminal}). 
-\item[Solaris] Popular (mainly on servers) UNIX-based operating system, now developed by Oracle and including several independent clones. See \href{http://distrowatch.com/table.php?distribution=solaris}{http://distrowatch.com/table.php?distribution=solaris}. +\item[Solaris] Popular (mainly on servers) UNIX-based operating system, now developed by Oracle and including several independent clones. See \href{http://distrowatch.com/table.php?distribution=solaris}{http://distrowatch.com/table.php?distribu\\tion=solaris}. \item[Terminal] see "Shell". \item[Ubuntu] Popular Linux distribution, see \href{http://www.ubuntu.com/}{http://www.ubuntu.com/}. \item[Upstream] Developers usually support (e.g. by fixing of bugs) only newer versions of an application. If you use an older version and you encounter problems, no one will probably help you. Moreover, using old versions of software can be a security risk because of security issues fixed in newer versions. @@ -429,7 +432,7 @@ \subsubsection{Input files} \item[\texttt{-c FILE}] Plastome reference sequence input file in FASTA format. \begin{itemize} \item \texttt{sondovac$\_$part$\_$a.sh}, \texttt{sondovac$\_$part$\_$b.sh} - \item Plastome reference sequences from taxa up to the same order of the studied plant group are suitable. See \citet{Straub2012}, \href{http://www.amjbot.org/content/99/2/349.short}{http://www.amjbot.org/content/99/2/349.short}. + \item Plastome reference sequences from taxa up to the same order of the studied plant group are suitable. See \citet{Straub2012}. \end{itemize} \item[\texttt{-m FILE}] Mitochondriome reference sequence input file in FASTA format (optional). \begin{itemize} @@ -514,17 +517,17 @@ \subsection{Input and output files} \item \texttt{*$\_$unique$\_$transcripts.fasta} -- Unique transcripts in FASTA format. \item \texttt{*$\_$genome$\_$skim$\_$data$\_$no$\_$cp$\_$reads.bam} -- SAM converted to BAM (removal of reads of plastid origin). 
\item \texttt{*$\_$genome$\_$skim$\_$data$\_$no$\_$cp$\_$reads} -- Genome skim data without cpDNA reads. - \item \texttt{*$\_$genome$\_$skim$\_$data$\_$no$\_$cp$\_$no$\_$mt$\_$reads.bam} -- SAM converted to BAM (removal of reads of mitochondrial origin). - \item \texttt{*$\_$genome$\_$skim$\_$data$\_$no$\_$cp$\_$no$\_$mt$\_$reads} -- Genome skim data without mtDNA reads. + \item \texttt{*$\_$genome$\_$skim$\_$data$\_$no$\_$cp$\_$no$\_$mt$\_$reads.bam} -- SAM converted to BAM (removal of reads of mitochondrial origin) -- only if mitochondriome reference sequence was used. + \item \texttt{*$\_$genome$\_$skim$\_$data$\_$no$\_$cp$\_$no$\_$mt$\_$reads} -- Genome skim data without mtDNA reads -- only if mitochondriome reference sequence was used. \item \texttt{*$\_$combined$\_$reads$\_$co$\_$cp$\_$no$\_$mt$\_$reads} -- Combined paired-end genome skim reads. - \item \texttt{*$\_$blat$\_$unique$\_$transcripts$\_$versus$\_$genome$\_$skim$\_$data.pslx} -- Output of BLAT (matching of the unique transcripts and the filtered, combined genome skim reads sharing $\geq$85\% sequence similarity). + \item \texttt{*$\_$blat$\_$unique$\_$transcripts$\_$versus$\_$genome$\_$skim$\_$data.pslx} -- Output of BLAT (ma\-tching of the unique transcripts and the filtered, combined genome skim reads sharing $\geq$85\% sequence similarity). \item \texttt{*$\_$blat$\_$unique$\_$transcripts$\_$versus$\_$genome$\_$skim$\_$data.fasta} -- Matching sequences in FASTA. \item \textbf{\texttt{*$\_$blat$\_$unique$\_$transcripts$\_$versus$\_$genome$\_$skim$\_$data-no$\_$missing$\_$fin.fsa} -- Final FASTA sequences for usage in Geneious.} \end{enumerate} Files 1-9 are not necessary for further processing by this pipeline, but may be useful for the user. The last file (10) is used as input file for Geneious in the next step. 
-\textbf{Geneious} requires as input the last output file of \texttt{sondovac$\_$part$\_$a.sh} (file 10: \texttt{*$\_$blat$\_$uni\-que$\_$tra\-nscri\-pts$\_$ve\-rsu\-s$\_$ge\-no\-me$\_$skim$\_$data-no$\_$missing$\_$fin.fsa}). Output of Geneious are two exported files (see page~\pageref{geneious}): +\textbf{Geneious} requires as input the last output file of \texttt{sondovac$\_$part$\_$a.sh} (file 10: \texttt{*$\_$blat$\_$\allowbreak unique$\_$\allowbreak transcripts$\_$\allowbreak versus$\_$\allowbreak genome$\_$\allowbreak skim$\_$\allowbreak data-no$\_$\allowbreak missing$\_$\allowbreak fin.fsa}). The output of Geneious consists of two exported files (see page~\pageref{geneious}): \begin{enumerate} \item Final assembled sequences exported as TSV. @@ -548,6 +551,20 @@ \subsection{Input and output files} \item \textbf{\texttt{*.target$\_$enrichment$\_$probe$\_$sequences$\_$final.pslx} -- Final probes ready for bait synthesis.} \end{enumerate} +\subsection{Record output of Sondovač} + +To record the whole output of the Sondovač script (regardless of the parameters used) use the utility \texttt{tee}. It will produce plain text output with everything printed to the screen. It can be useful for reference or for exploration if something went wrong. Use it as follows: + +\begin{bashcode} + ./sondovac_part_a.sh | tee records.log + man tee # See more options how tee can record script's output + # "|" is a pipe passing output of 1st command as input for 2nd command + less records.log # See the record. Quit viewing by "Q" + rm records.log # Delete the log file +\end{bashcode} + +You can use any command line arguments; the script will behave as usual. The plain text file \texttt{records.log} will then contain all its output. + \section{Sample data} Together with the script, we provide the ZIP archive (1.8 GB) that contains exemplary input files for running the script: \textit{Oxalis} genome skim data as well as the \textit{Ricinus} cpDNA and mtDNA reference sequences. 
See \href{https://github.com/V-Z/sondovac/wiki/Sample-data}{https://github.com/V-Z/sondovac/wiki/Sample-data} for download of sample data. @@ -555,16 +572,16 @@ \section{Sample data} The package contains: \begin{enumerate} - \item \texttt{input2$\_$Ricinus$\_$communis$\_$reference$\_$plastid$\_$genome.fsa} -- cpDNA reference (parameter \texttt{-c}), GenBank reference number \href{http://www.ncbi.nlm.nih.gov/nuccore/372450118/}{NC$\_$016736} - \item \texttt{input3$\_$J12$\_$Oxalis$\_$obtusa$\_$J12$\_$genome$\_$skim$\_$data$\_$R1.fastq} -- paired-end genome skim data, file 1 (parameter \texttt{-t}) - \item \texttt{input4$\_$J12$\_$Oxalis$\_$obtusa$\_$J12$\_$genome$\_$skim$\_$data$\_$R2.fastq} -- paired-end genome skim data, file 2 (parameter \texttt{-q}) - \item \texttt{input5$\_$Ricinus$\_$communis$\_$reference$\_$mitochondrial$\_$genome.fasta} -- mtDNA reference (parameter \texttt{-m}), GenBank reference number \href{http://www.ncbi.nlm.nih.gov/nuccore/323649872/}{NC$\_$015141} + \item \texttt{input2$\_$Ricinus$\_$communis$\_$reference$\_$plastid$\_$genome.fsa} -- cpDNA reference (pa\-ra\-meter \texttt{-c}), GenBank reference number \href{http://www.ncbi.nlm.nih.gov/nuccore/372450118/}{NC$\_$016736}. + \item \texttt{input3$\_$J12$\_$Oxalis$\_$obtusa$\_$J12$\_$genome$\_$skim$\_$data$\_$R1.fastq} -- paired-end genome skim data, file 1 (parameter \texttt{-t}). + \item \texttt{input4$\_$J12$\_$Oxalis$\_$obtusa$\_$J12$\_$genome$\_$skim$\_$data$\_$R2.fastq} -- paired-end genome skim data, file 2 (parameter \texttt{-q}). + \item \texttt{input5$\_$Ricinus$\_$communis$\_$reference$\_$mitochondrial$\_$genome.fasta} -- mtDNA reference (parameter \texttt{-m}), GenBank reference number \href{http://www.ncbi.nlm.nih.gov/nuccore/323649872/}{NC$\_$015141}. \end{enumerate} The transcriptome input file is unpublished data from G. K.-S. Wong et al. (hopefully it will be published soon). 
\begin{enumerate} - \item \texttt{input1$\_$JHCN$\_$Oxalis$\_$corniculata$\_$transcriptome$\_$data.fa} -- transcriptome data (parameter \texttt{-f}) + \item \texttt{input1$\_$JHCN$\_$Oxalis$\_$corniculata$\_$transcriptome$\_$data.fa} -- transcriptome data (parameter \texttt{-f}). \end{enumerate} Data can be found under @@ -575,7 +592,7 @@ \section{Sample data} \item \href{http://www.onekp.com/samples/single.php?id=JHCN}{http://www.onekp.com/samples/single.php?id=JHCN} \end{itemize} -The transcriptome FASTA file used for the probe design is named JHCN-SOAPdenovo-Trans-assembly.dnas.out and can be found under JHCN/Assembly/JHCN-SOAPdenovo-Trans-translated/. Information about how to get access to data download is given in \citet{Matasci2014}, \href{http://www.gigasciencejournal.com/content/3/1/17/}{http://www.gigasciencejournal.com/content/3/1/17/}. +The transcriptome FASTA file used for the probe design is named JHCN-SOAPdenovo-Trans-assembly.dnas.out and can be found under JHCN/Assembly/JHCN-SOAPdenovo-Trans-translated/. Information about how to get access to data download is given in \citet{Matasci2014}. Explanation of command line parameters is at page~\pageref{script-usage}. @@ -607,7 +624,7 @@ \section{Licenses} \fontsize{7pt}{8pt} \selectfont -\subsection{GNU GENERAL PUBLIC LICENSE, Version 3, 29 June 2007} +\subsection{GNU General Public License, Version 3, 29 June 2007} Copyright © 2007 Free Software Foundation, Inc. \href{http://fsf.org/}{http://fsf.org/} @@ -635,7 +652,7 @@ \subsubsection{Preamble} The precise terms and conditions for copying, distribution and modification follow. -\subsubsection{TERMS AND CONDITIONS} +\subsubsection{Terms and Conditions} \subsubsection{0. Definitions} @@ -820,7 +837,7 @@ \subsubsection{17. 
Interpretation of Sections 15 and 16} If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. -\subsection{GNU GENERAL PUBLIC LICENSE, Version 2, June 1991} +\subsection{GNU General Public License, Version 2, June 1991} Copyright (C) 1989, 1991 Free Software Foundation, Inc. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA @@ -844,7 +861,7 @@ \subsubsection{Preamble} The precise terms and conditions for copying, distribution and modification follow. -\subsubsection{TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION} +\subsubsection{Terms and Conditions for Copying, Distribution and Modification} 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The “Program“, below, refers to any such program or work, and a “work based on the Program“ means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term “modification“.) Each licensee is addressed as “you“. @@ -902,13 +919,13 @@ \subsubsection{TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION} 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. 
Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. -\subsubsection{NO WARRANTY} +\subsubsection{No Warranty} 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM “AS IS“ WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. -\subsection{GNU AFFERO GENERAL PUBLIC LICENSE, Version 3, 19 November 2007} +\subsection{GNU Affero General Public License, Version 3, 19 November 2007} Copyright © 2007 Free Software Foundation, Inc. \href{http://fsf.org/}{http://fsf.org/} @@ -932,7 +949,7 @@ \subsubsection{Preamble} The precise terms and conditions for copying, distribution and modification follow. -\subsubsection{TERMS AND CONDITIONS} +\subsubsection{Terms and Conditions} \subsubsection{0. Definitions} @@ -1190,7 +1207,7 @@ \subsubsection{9. 
Accepting Warranty or Additional Liability} END OF TERMS AND CONDITIONS -\subsection{MIT LICENSE} +\subsection{MIT License} Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software“), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: @@ -1201,9 +1218,6 @@ \subsection{MIT LICENSE} \endgroup % Reference list -% Package natbib, style apalike: \cite{} as well as \citet{} > Author (year); \citep{} > (Author, year) -% natbib: \citet/p[text][p.~1]{} > text Author year, p. 1; with * lists all authors -\bibliographystyle{apalike} \bibliography{sondovac_manual} \addcontentsline{toc}{section}{References} diff --git a/sondovac_functions b/sondovac_functions index 91373bf..2b8da2a 100644 --- a/sondovac_functions +++ b/sondovac_functions @@ -2,7 +2,7 @@ # Version of the script SCRIPTVERSION=0.8 -RELEASEDATE=2015-09-09 +RELEASEDATE=2015-10-09 # Web page of the script WEB="https://github.com/V-Z/sondovac/" diff --git a/sondovac_part_a.sh b/sondovac_part_a.sh index 3b2718d..4768d8a 100755 --- a/sondovac_part_a.sh +++ b/sondovac_part_a.sh @@ -29,7 +29,7 @@ STARTINI="I" FLASHM=250 # BLAT -minIdentity between the unique transcriptomes and the genome skim data BLATIDENT=85 -# Remove transcripts with >1000 BLAT scores +# Remove transcripts with >1000 BLAT scores (or another value selected by user) BLATSCORE=1000 # Create empty variables for file names INPUTFILE="" @@ -1104,7 +1104,7 @@ echo "Sorting unique transcripts" { awk '{$1=sprintf("%05d", $1); print $0}' $INPUTTAB | sort > $SORTEDINPUT; } || { echo echo "${BOLD}Error!${NORM} Sorting of unique transcripts failed. Aborting. Check if" - echo "$INPUTFILE is correct FASTA file and check file $INPUTTAB." 
+ echo "$INPUTFILE is correct FASTA file and check if file $INPUTTAB is correct." echo exit 1 } @@ -1133,7 +1133,7 @@ echo awk '{print ">"$1"\n"$2}' $JOINEDTABS > $JOINEDFA || { echo echo "${BOLD}Error!${NORM} Conversion of $JOINEDTS to FASTA failed." - echo "Aborting. Check file $JOINEDTABS." + echo "Aborting. Check if file $JOINEDTABS is correct." echo exit 1 } @@ -1185,7 +1185,7 @@ echo "Converting SAM to BAM. This may take longer time." samtools view -bT $REFERENCECP $BOWTIE2CP > $CPBAM || { echo echo "${BOLD}Error!${NORM} Conversion of SAM to BAM with samtools failed. Aborting." - echo "Check files $REFERENCECP and $BOWTIE2CP." + echo "Check if files $REFERENCECP and $BOWTIE2CP are correct." echo exit 1 } @@ -1247,8 +1247,8 @@ if [ -n "$REFERENCEMT" ]; then echo "Converting SAM to BAM. This may take longer time." samtools view -bT $REFERENCEMT $BOWTIE2MT > $MTBAM || { echo - echo "${BOLD}Error!${NORM} Conversion of SAM to BAM failed. Aborting. Check files" - echo "$REFERENCEMT and $BOWTIE2MT." + echo "${BOLD}Error!${NORM} Conversion of SAM to BAM failed. Aborting. Check if files" + echo "$REFERENCEMT and $BOWTIE2MT are correct." echo exit 1 } @@ -1276,8 +1276,8 @@ if [ -n "$REFERENCEMT" ]; then echo flash -o $FLASHOUT -M $FLASHM `echo $FASTQNOMT`_1.fq `echo $FASTQNOMT`_2.fq || { echo - echo "${BOLD}Error!${NORM} Combining paired-end reads failed. Aborting. Check files" - echo "$REFERENCEMT, `echo $FASTQNOMT`_1.fq and `echo $FASTQNOMT`_2.fq." + echo "${BOLD}Error!${NORM} Combining paired-end reads failed. Aborting. Check if files" + echo "$REFERENCEMT, `echo $FASTQNOMT`_1.fq and `echo $FASTQNOMT`_2.fq are correct." echo exit 1 } @@ -1291,8 +1291,8 @@ if [ -n "$REFERENCEMT" ]; then echo flash -o $FLASHOUT -M $FLASHM `echo $FASTQNOCP`_1.fq `echo $FASTQNOCP`_2.fq || { echo - echo "${BOLD}Error!${NORM} Combining paired-end reads failed. Aborting. Check files" - echo "$REFERENCECP, `echo $FASTQNOCP`_1.fq and `echo $FASTQNOCP`_2.fq." 
+ echo "${BOLD}Error!${NORM} Combining paired-end reads failed. Aborting. Check if files" + echo "$REFERENCECP, `echo $FASTQNOCP`_1.fq and `echo $FASTQNOCP`_2.fq are correct." echo exit 1 } @@ -1371,7 +1371,7 @@ echo exit 1 } -# Remove transcripts with >1000 BLAT scores (very likely that these are repetitive elements) +# Remove transcripts with >1000 BLAT scores (or another value selected by user; very likely that these are repetitive elements) # Count the number of times each transcript hit a genome skim read echo @@ -1384,7 +1384,7 @@ echo "Counting number of times each transcript hit a genom skim read" exit 1 } -# List of the transcripts with >1000 BLAT scores +# List of the transcripts with >1000 BLAT scores (or another value selected by user) echo echo "Listing transcripts with >$BLATSCORE BLAT scores" { awk '$1>'"$BLATSCORE"'' $TABLIST | awk '{print $2}' > $TABBLAT; } || { @@ -1428,7 +1428,7 @@ echo "Converting TAB to FASTA and removing sequences with \"n\"" grep -v n $TABREMOVED | awk '{print $1"\t"length($2)}' | awk '{s+=$2;a++}END{print s}' || { echo echo "${BOLD}Error!${NORM} Removing of transcripts with >$BLATSCORE BLAT score failed. Aborting." - echo "Check file $TABREMOVED." + echo "Check if file $TABREMOVED is correct." 
echo exit 1 } @@ -1449,31 +1449,49 @@ rm $UNIQUELIST $INPUTTAB $SORTEDINPUT $JOINEDTS $JOINEDTABS $REFERENCECP2* $BOWT # List kept files which user can use for another analysis echo echo "Following files are kept for possible later usage (see manual for details):" -echo "1) Output of BLAT (removal of transcripts sharing ≥90% sequence similarity):" -echo "$BLATOUT" -echo "2) Unique transcripts in FASTA format:" -echo "$JOINEDFA" -echo "3) SAM converted to BAM (removal of reads of plastid origin):" -echo "$CPBAM" -echo "4) Genome skim data without cpDNA reads:" -ls $FASTQNOCP* +echo "================================================================================" if [ -n "$REFERENCEMT" ]; then + echo "1) Output of BLAT (removal of transcripts sharing ≥90% sequence similarity):" + echo "$BLATOUT" + echo "2) Unique transcripts in FASTA format:" + echo "$JOINEDFA" + echo "3) SAM converted to BAM (removal of reads of plastid origin):" + echo "$CPBAM" + echo "4) Genome skim data without cpDNA reads:" + ls $FASTQNOCP* echo "5) SAM converted to BAM (removal of reads of mitochondrial origin):" echo "$MTBAM" echo "6) Genome skim data without mtDNA reads:" ls $FASTQNOMT* + echo "7) Combined paired-end genome skim reads:" + echo "$FLASHOUT.extendedFrags.fa" + echo "8) Output of BLAT (matching of the unique transcripts and the filtered," + echo " combined genome skim reads sharing ≥85% sequence similarity):" + echo "$BLATOUTFIN" + echo "9) Matching sequences in FASTA:" + echo "$BLATOUTFIN2" + echo "10) Final FASTA sequences for usage in Geneious:" + echo "$FINALA" else - echo "(Some files are not available as there was no mitochondriome reference sequence)" + echo "1) Output of BLAT (removal of transcripts sharing ≥90% sequence similarity):" + echo "$BLATOUT" + echo "2) Unique transcripts in FASTA format:" + echo "$JOINEDFA" + echo "3) SAM converted to BAM (removal of reads of plastid origin):" + echo "$CPBAM" + echo "4) Genome skim data without cpDNA reads:" + ls $FASTQNOCP* + 
echo "5) Combined paired-end genome skim reads:" + echo "$FLASHOUT.extendedFrags.fa" + echo "6) Output of BLAT (matching of the unique transcripts and the filtered," + echo " combined genome skim reads sharing ≥85% sequence similarity):" + echo "$BLATOUTFIN" + echo "7) Matching sequences in FASTA:" + echo "$BLATOUTFIN2" + echo "8) Final FASTA sequences for usage in Geneious:" + echo "$FINALA" fi -echo "7) Combined paired-end genome skim reads:" -echo "$FLASHOUT.extendedFrags.fa" -echo "8) Output of BLAT (matching of the unique transcripts and the filtered," -echo " combined genome skim reads sharing ≥85% sequence similarity):" -echo "$BLATOUTFIN" -echo "9) Matching sequences in FASTA:" -echo "$BLATOUTFIN2" -echo "10) Final FASTA sequences for usage in Geneious:" -echo "$FINALA" +echo "================================================================================" confirmgo echo @@ -1482,6 +1500,7 @@ echo echo "Resulting FASTA was saved as" echo "${BOLD}$FINALA${NORM} for usage in Geneious (step 7 of the pipeline)." echo "Use this file in next step of the pipeline. See README and manual for details." +echo "================================================================================" confirmgo echo "Run Geneious (tested with versions 6, 7 and 8). See README and manual for details." diff --git a/sondovac_part_b.sh b/sondovac_part_b.sh index 4b95c36..27c3109 100755 --- a/sondovac_part_b.sh +++ b/sondovac_part_b.sh @@ -44,7 +44,7 @@ while getopts "hvulrpeinc:x:z:b:d:" START; do echo -e "\t later overwritten." echo echo -e "\tOptions required for running in non-interactive mode:" - echo -e "\t-c\Plastome reference sequence input file in FASTA format." + echo -e "\t-c\tPlastome reference sequence input file in FASTA format." echo -e "\t-x\tInput file in TSV format (output of Geneious assembly)." echo -e "\t-z\tInput file in FASTA format (output of Geneious assembly)." 
echo @@ -154,9 +154,15 @@ checktools grep # Check if egrep is available checktools egrep +# Check if cut is available +checktools cut + # Check if awk is available checktools awk +# Check if wc is available +checktools wc + # Check if sed is available checktools sed @@ -304,7 +310,9 @@ SEQUENCESPROBES120600FIN="${SEQUENCES%.*}_probes_120-600bp_fin.tab" SEQUENCESPROBES120600MODIF="${SEQUENCES%.*}_probes_120-600bp_modified_fin.tab" SEQUENCESPROBES120600ASSEM="${SEQUENCES%.*}_probes_120-600bp_assembled_fin.fasta" SEQUENCESPROBES120600CONTIG="${SEQUENCES%.*}_probes_120-600bp_contig_fin.fasta" -# Preliminary probe sequences +# Preliminary probe sequences - temporary file - will be deleted +PROBEPRELIM0="${SEQUENCES%.*}_prelim_probe_seq0.fasta" +# Preliminary probe sequences - corrected labels PROBEPRELIM="${SEQUENCES%.*}_prelim_probe_seq.fasta" # Sequence similarity checked by CD-HIT - temporary file - will be deleted PROBEPRELIMCDHIT="${SEQUENCES%.*}_similarity_test" @@ -318,6 +326,8 @@ PROBEPRELIMSORT="${SEQUENCES%.*}_similarity_test_assemblies_sort.tab" PROBEPRELIMFIN="${SEQUENCES%.*}_similarity_test_assemblies_fin.tab" # Probes in FASTA PROBESEQUENCES="${SEQUENCES%.*}_target_enrichment_probe_sequences.fasta" +# Probes in tab - temporary file - will be deleted +PROBESEQUENCESNUM="${SEQUENCES%.*}_target_enrichment_probe_sequences.tab" # Part 3: Assemble the obtained sequences in contigs (part B) @@ -395,16 +405,34 @@ echo # Check the statistics # Check total number of bp echo "Total number of base pairs:" -{ cut -f6 $TSVLIST2 | awk '$1>119' | awk '{s+=$1}END{print s}'; } || { echo && echo "${BOLD}Error!${NORM} Checking statistics failed. Aborting." && echo && exit 1; } +{ cut -f6 $TSVLIST2 | awk '$1>119' | awk '{s+=$1}END{print s}'; } || { + echo + echo "${BOLD}Error!${NORM} Checking statistics failed. Aborting. 
Check if file" + echo "$TSVLIST2 is correct TSV file containing all required columns:" + echo -e "$REQUIREDCOLS" + echo + exit 1 + } +confirmgo +echo + # Check number of contigs echo "Number of contigs:" -{ cut -f6 $TSVLIST2 | awk '$1>119' | wc -l; } || { echo && echo "${BOLD}Error!${NORM} Checking number of contigs failed. Aborting." && echo && exit 1; } +{ cut -f6 $TSVLIST2 | awk '$1>119' | wc -l; } || { + echo + echo "${BOLD}Error!${NORM} Checking number of contigs failed. Aborting. Check if file" + echo "$TSVLIST2 is correct TSV file containing all required columns" + echo -e "$REQUIREDCOLS" + echo + exit 1 + } +confirmgo echo # Convert FASTA to TSV echo "Converting FASTA to TAB" fasta2tab $SEQUENCES $SEQUENCESTAB || { - echo + echo echo "${BOLD}Error!${NORM} Conversion of FASTA into TAB failed. Aborting." echo "Check if file $SEQUENCES is correct FASTA file." echo @@ -414,47 +442,52 @@ echo # Separate the assembled sequences echo "Separating assembled sequences" -grep 'Contig' $SEQUENCESTAB > $SEQUENCESTABASSE +grep '[Aa]ssembly' $SEQUENCESTAB > $SEQUENCESTABASSE echo # Separate the unassembled sequences echo "Separating unassembled sequences" -grep -v 'Contig' $SEQUENCESTAB > $SEQUENCESTABUNAS +grep -v '[Aa]ssembly' $SEQUENCESTAB > $SEQUENCESTABUNAS echo # Filter the file with the assembled sequences – count the assemblies (the ones indicated with "Contig") making up genes of ≥960 bp / ≥600 bp, comprised of putative exons ≥120 bp -echo "Counting assembled sequences:" -awk '{print $1"\t"length($2)}' $SEQUENCESTABASSE | sed s/_/\\t/g | cut -f6,9 | awk '$2>119' | awk '{a[$1]++;b[$1]+=$2}END{for (i in a) print i,a[i],b[i]}' | awk '$3>959' | awk '{s+=$3;c++}END{print s}' -awk '{print $1"\t"length($2)}' $SEQUENCESTABASSE | sed s/_/\\t/g | cut -f6,9 | awk '$2>119' | awk '{a[$1]++;b[$1]+=$2}END{for (i in a) print i,a[i],b[i]}' | awk '$3>959' | wc -l +echo "Number of assembled sequences:" +awk '{print $1"\t"length($2)}' $SEQUENCESTABASSE | sed 's/_/\t/g' | cut -f6,9 | 
awk '$2>119' | awk '{a[$1]++;b[$1]+=$2}END{for (i in a) print i,a[i],b[i]}' | awk '$3>959' | awk '{s+=$3;c++}END{print s}' +awk '{print $1"\t"length($2)}' $SEQUENCESTABASSE | sed 's/_/\t/g' | cut -f6,9 | awk '$2>119' | awk '{a[$1]++;b[$1]+=$2}END{for (i in a) print i,a[i],b[i]}' | awk '$3>959' | wc -l +confirmgo echo # Genes of ≥960 bp (exons ≥120 bp), total bp echo "Genes of ≥960 bp (exons ≥120 bp), total bp:" -awk '{print $1"\t"length($2)}' $SEQUENCESTABASSE | sed s/_/\\t/g | cut -f6,9 | awk '$2>119' | awk '{a[$1]++;b[$1]+=$2}END{for (i in a) print i,a[i],b[i]}' | awk '$3>599' | awk '{s+=$3;c++}END{print s}' -awk '{print $1"\t"length($2)}' $SEQUENCESTABASSE | sed s/_/\\t/g | cut -f6,9 | awk '$2>119' | awk '{a[$1]++;b[$1]+=$2}END{for (i in a) print i,a[i],b[i]}' | awk '$3>599' | wc -l +awk '{print $1"\t"length($2)}' $SEQUENCESTABASSE | sed 's/_/\t/g' | cut -f6,9 | awk '$2>119' | awk '{a[$1]++;b[$1]+=$2}END{for (i in a) print i,a[i],b[i]}' | awk '$3>599' | awk '{s+=$3;c++}END{print s}' +awk '{print $1"\t"length($2)}' $SEQUENCESTABASSE | sed 's/_/\t/g' | cut -f6,9 | awk '$2>119' | awk '{a[$1]++;b[$1]+=$2}END{for (i in a) print i,a[i],b[i]}' | awk '$3>599' | wc -l +confirmgo echo # Genes of ≥600 bp (exons ≥120 bp), total bp echo "Genes of ≥600 bp (exons ≥120 bp), total bp:" -awk '{print $1"\t"length($2)}' $SEQUENCESTABASSE | sed s/_/\\t/g | cut -f6,9 | awk '$2>119' | awk '{a[$1]++;b[$1]+=$2}END{for (i in a) print i,a[i],b[i]}' | awk '$3>599' > $SEQUENCESPROBES600 +awk '{print $1"\t"length($2)}' $SEQUENCESTABASSE | sed 's/_/\t/g' | cut -f6,9 | awk '$2>119' | awk '{a[$1]++;b[$1]+=$2}END{for (i in a) print i,a[i],b[i]}' | awk '$3>599' > $SEQUENCESPROBES600 +confirmgo echo # Filter the file with the unassembled sequences echo "Filtering the file with the unassembled sequences:" -awk '{print $1"\t"length($2)}' $SEQUENCESTABUNAS | sed s/_/\\t/g | cut -f1,4 | awk '$2>119' | awk '{a[$1]++;b[$1]+=$2}END{for (i in a) print i,a[i],b[i]}' | awk '$3>399' | wc -l +awk '{print 
$1"\t"length($2)}' $SEQUENCESTABUNAS | sed 's/_/\t/g' | cut -f1,4 | awk '$2>119' | awk '{a[$1]++;b[$1]+=$2}END{for (i in a) print i,a[i],b[i]}' | awk '$3>399' | wc -l +confirmgo echo # Unassembled sequences making up genes of ≥400 bp echo "Unassembled sequences making up genes of ≥400 bp:" -awk '{print $1"\t"length($2)}' $SEQUENCESTABUNAS | sed s/_/\\t/g | cut -f1,4 | awk '$2>119' | awk '{a[$1]++;b[$1]+=$2}END{for (i in a) print i,a[i],b[i]}' | awk '$3>599' | wc -l -awk '{print $1"\t"length($2)}' $SEQUENCESTABUNAS | sed s/_/\\t/g | cut -f1,4 | awk '$2>119' | awk '{a[$1]++;b[$1]+=$2}END{for (i in a) print i,a[i],b[i]}' | awk '$3>599' | awk '{s+=$3;c++}END{print s}' +awk '{print $1"\t"length($2)}' $SEQUENCESTABUNAS | sed 's/_/\t/g' | cut -f1,4 | awk '$2>119' | awk '{a[$1]++;b[$1]+=$2}END{for (i in a) print i,a[i],b[i]}' | awk '$3>599' | wc -l +awk '{print $1"\t"length($2)}' $SEQUENCESTABUNAS | sed 's/_/\t/g' | cut -f1,4 | awk '$2>119' | awk '{a[$1]++;b[$1]+=$2}END{for (i in a) print i,a[i],b[i]}' | awk '$3>599' | awk '{s+=$3;c++}END{print s}' +confirmgo echo # Part 4: Create the final FASTA file for the Hyb-Seq probes # Extract and sort the assemblies making up genes of ≥600 bp echo "Extracting and sorting the assemblies making up genes of ≥600 bp" -sed s/^/Assembly_/ $SEQUENCESPROBES600 | cut -f1 -d' ' | sort -k1,1 > $SEQUENCESPROBES600FORJOIN +sed 's/^/Assembly_/' $SEQUENCESPROBES600 | cut -f1 -d " " | sort -k1,1 > $SEQUENCESPROBES600FORJOIN echo # Make a file with all exons ≥120 bp @@ -464,7 +497,7 @@ echo # Make the assembly number the first field and sort echo "Sorting exons ≥120 bp" -sed 's/^.*\(Assembly\)/\1/' $SEQUENCESTABASSE120 | sed s/_C/\\tC/ | sort -k1,1 > $SEQUENCESTABASSE120SORT +sed 's/^.*\(Assembly\)/\1/' $SEQUENCESTABASSE120 | sed 's/_C/\tC/' | sort -k1,1 > $SEQUENCESTABASSE120SORT echo # Make a file with all exons ≥120 bp and all assemblies making up genes of ≥600 bp @@ -475,17 +508,23 @@ echo # Convert TAB to FASTA echo "Converting TAB to FASTA" 
sed 's/ /_/' $SEQUENCESPROBES120600FIN | sed 's/ /_/' > $SEQUENCESPROBES120600MODIF -sed 's/^/>/' $SEQUENCESPROBES120600MODIF | tr " " "\n" > $SEQUENCESPROBES120600ASSEM +sed 's/^/>/' $SEQUENCESPROBES120600MODIF | sed 's/ /\n/' > $SEQUENCESPROBES120600ASSEM + # Remaining assemblies have to be selected and added to the .fasta file of the probes: -grep -v Contig $SEQUENCESTABASSE120 | awk '$2>599' | sed 's/^/>/' | sed 's/\\t/_/' | tr " " "\n" > $SEQUENCESPROBES120600CONTIG +grep -v '[Cc]ontig' $SEQUENCESTABASSE120 | awk '$2>599' | sed 's/^/>/' | sed 's/\t/_/' | sed 's/\t/\n/' > $SEQUENCESPROBES120600CONTIG echo # Combine the two FASTA files echo "Writing FASTA file with preliminary probe sequences" -cat $SEQUENCESPROBES120600ASSEM $SEQUENCESPROBES120600CONTIG > $PROBEPRELIM +cat $SEQUENCESPROBES120600ASSEM $SEQUENCESPROBES120600CONTIG > $PROBEPRELIM0 + +# Ensure all sequences have correct labeling +echo "Ensuring all sequences have correct labels" +sed 's/^>.\+\(Assembly_[0-9]\+_\)/>\1Contig_0_/' $PROBEPRELIM0 > $PROBEPRELIM echo echo "Preliminary probe sequences saved as $PROBEPRELIM for possible later usage" + # Part 5: Make the final quality control of the probe sequences before sending them to company for bait synthesis # Check for sequence similarity between the developed probe sequences with CD-HIT-EST @@ -505,27 +544,30 @@ fasta2tab $PROBEPRELIMCDHIT $PROBEPRELIMCDHIT.txt || { echo # Count all assemblies, comprised of putative exons ≥120 bp -echo "Counting all assemblies, comprised of putative exons ≥120 bp:" +echo "Number of all assemblies, comprised of putative exons ≥120 bp:" awk '{print $1"\t"length($2)}' $PROBEPRELIMCDHIT.txt | awk '{s+=$2;c++}END{print s}' echo +confirmgo # Count the assemblies making up genes of ≥600 bp, comprised of putative exons ≥120 bp -echo "Counting the assemblies making up genes of ≥600 bp, comprised of putative exons ≥120 bp:" -awk '{print $1"\t"length($2)}' $PROBEPRELIMCDHIT.txt | sed s/_/\\t/g | cut -f2,6 | awk 
'{a[$1]++;b[$1]+=$2}END{for (i in a) print i,a[i],b[i]}' | awk '$3>599' | awk '{s+=$3;c++}END{print s}' -awk '{print $1"\t"length($2)}' $PROBEPRELIMCDHIT.txt | sed s/_/\\t/g | cut -f2,6 | awk '{a[$1]++;b[$1]+=$2}END{for (i in a) print i,a[i],b[i]}' | awk '$3>599' | wc -l +echo "Number of the assemblies making up genes of ≥600 bp, comprised of putative exons ≥120 bp:" +awk '{print $1"\t"length($2)}' $PROBEPRELIMCDHIT.txt | sed 's/_/\t/g' | cut -f2,6 | awk '{a[$1]++;b[$1]+=$2}END{for (i in a) print i,a[i],b[i]}' | awk '$3>599' | awk '{s+=$3;c++}END{print s}' +awk '{print $1"\t"length($2)}' $PROBEPRELIMCDHIT.txt | sed 's/_/\t/g' | cut -f2,6 | awk '{a[$1]++;b[$1]+=$2}END{for (i in a) print i,a[i],b[i]}' | awk '$3>599' | wc -l echo -echo "Writing those assemblies into temporal file" -awk '{print $1"\t"length($2)}' $PROBEPRELIMCDHIT.txt | sed s/_/\\t/g | cut -f2,6 | awk '{a[$1]++;b[$1]+=$2}END{for (i in a) print i,a[i],b[i]}' | awk '$3>599' > $PROBEPRELIMCDHIT2 +confirmgo + +echo "Writing the assemblies into temporary file" +awk '{print $1"\t"length($2)}' $PROBEPRELIMCDHIT.txt | sed 's/_/\t/g' | cut -f2,6 | awk '{a[$1]++;b[$1]+=$2}END{for (i in a) print i,a[i],b[i]}' | awk '$3>599' > $PROBEPRELIMCDHIT2 echo # Extract and sort the assemblies making up genes of ≥600 bp echo "Extract and sort the assemblies making up genes of ≥600 bp:" -sed s/^/Assembly_/ $PROBEPRELIMCDHIT2 | cut -f1 -d' ' | sort -k1,1 > $PROBEPRELIMFORJOIN +sed 's/^/Assembly_/' $PROBEPRELIMCDHIT2 | cut -f1 -d " " | sort -k1,1 > $PROBEPRELIMFORJOIN echo # Modify the assembly number and sort echo "Modify the assembly number and sort" -sed s/_C/\\tC/ $PROBEPRELIMCDHIT.txt | sort -k1,1 > $PROBEPRELIMSORT +sed 's/_C/\tC/' $PROBEPRELIMCDHIT.txt | sort -k1,1 > $PROBEPRELIMSORT echo # Make a file with all exons ≥120 bp and all assemblies making up genes of ≥600 bp @@ -535,21 +577,43 @@ echo # Convert TAB to FASTA echo "Converting TAB to FASTA" -sed s/' C'/_/ $PROBEPRELIMFIN | sed s/ontig/Contig/ | sed s/^/'>'/ | 
sed s/' '/\\n/ > $PROBESEQUENCES +sed 's/^\(.\+\) \(Contig\)/>\1_\2/' $PROBEPRELIMFIN | sed 's/ /\n/' > $PROBESEQUENCES echo # Calculating of the total number of base pairs echo "Calculating of the total number of base pairs" echo "Converting FASTA to TAB" -fasta2tab $PROBESEQUENCES $PROBESEQUENCESNUM || { echo && echo "${BOLD}Error!${NORM} Conversion of FASTA into TAB failed. Aborting." && echo && exit 1; } +fasta2tab $PROBESEQUENCES $PROBESEQUENCESNUM || { + echo + echo "${BOLD}Error!${NORM} Conversion of FASTA into TAB failed. Aborting." + echo + exit 1 + } + echo echo "Total number of base pairs:" awk '{print $1"\t"length($2)}' $PROBESEQUENCESNUM | awk '{s+=$2;c++}END{print s}' +confirmgo +echo + +echo "Success!" +echo +echo "Final output file was written as $PROBESEQUENCES" +echo +echo "This file contains the probe sequences." +confirmgo echo # Remove remaining cp/mt genes from probe set echo "Removing remaining cp/mt genes from probe set" -blat -t=dna -q=dna -out=pslx $REFERENCECP $PROBESEQUENCES $PROBESEQUENCES.target_enrichment_probe_sequences_final.pslx +blat -t=dna -q=dna -out=pslx $REFERENCECP $PROBESEQUENCES $PROBESEQUENCES.possible_cp_dna_genes_in_probe_set.pslx +echo + +echo "File $PROBESEQUENCES.possible_cp_dna_genes_in_probe_set.pslx" +echo "contains possible plastid genes in final probe set." +echo "We recommend removing those genes from the final probe set in file" +echo "$PROBESEQUENCES." +confirmgo echo # Remove temporal files @@ -557,13 +621,6 @@ echo "Removing unneeded temporal files" rm $SEQUENCESTAB $SEQUENCESTABASSE $SEQUENCESTABUNAS $SEQUENCESPROBES600 $SEQUENCESPROBES600FORJOIN $SEQUENCESTABASSE120 $SEQUENCESTABASSE120SORT $SEQUENCESPROBES120600FIN $SEQUENCESPROBES120600MODIF $SEQUENCESPROBES120600ASSEM $SEQUENCESPROBES120600CONTIG $PROBEPRELIMCDHIT $PROBEPRELIMFORJOIN $PROBEPRELIMSORT $PROBEPRELIMFIN $PROBESEQUENCESNUM echo -echo "Success!" 
-echo -echo "Final output file was written as $PROBESEQUENCES.target_enrichment_probe_sequences_final.pslx" -echo -echo "This file contains the probe sequences" -echo - echo "Script exited successfully..." echo