-
Notifications
You must be signed in to change notification settings - Fork 1
/
metabarcodingtextbook2.en.bib
executable file
·445 lines (388 loc) · 42.4 KB
/
metabarcodingtextbook2.en.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
% This file was created with JabRef 2.10.
% Encoding: UTF8
% FastQC software by Simon Andrews, cited as a software distribution (no journal article).
@Misc{Andrews2010,
Title = {{FastQC}: a quality control tool for high throughput sequence data},
Author = {Andrews, Simon},
HowPublished = {Software distributed by the author at http://www.bioinformatics.babraham.ac.uk/projects/fastqc/},
Year = {2010},
Owner = {shimotsuki},
Timestamp = {2015.02.02},
Url = {http://www.bioinformatics.babraham.ac.uk/projects/fastqc/}
}
% Metaxa: extracts SSU rRNA sequences and assigns them to an archaeal, bacterial, nuclear eukaryote, mitochondrial or chloroplast origin.
@Article{Bengtsson2011,
Title = {{Metaxa}: a software tool for automated detection and discrimination among ribosomal small subunit ({12S/16S/18S}) sequences of archaea, bacteria, eukaryotes, mitochondria, and chloroplasts in metagenomes and environmental sequencing datasets},
Author = {Bengtsson, Johan and Eriksson, K. Martin and Hartmann, Martin and Wang, Zheng and Shenoy, Belle Damodara and Grelet, Gwen-A{\"{e}}lle and Abarenkov, Kessy and Petri, Anna and Rosenblad, Magnus Alm and Nilsson, R. Henrik},
Journal = {Antonie van Leeuwenhoek},
Year = {2011},
Month = oct,
Number = {3},
Pages = {471--475},
Volume = {100},
Abstract = {The ribosomal small subunit (SSU) rRNA gene has emerged as an important genetic marker for taxonomic identification in environmental sequencing datasets. In addition to being present in the nucleus of eukaryotes and the core genome of prokaryotes, the gene is also found in the mitochondria of eukaryotes and in the chloroplasts of photosynthetic eukaryotes. These three sets of genes are conceptually paralogous and should in most situations not be aligned and analyzed jointly. To identify the origin of SSU sequences in complex sequence datasets has hitherto been a time-consuming and largely manual undertaking. However, the present study introduces Metaxa ( http://microbiology.se/software/metaxa/ ), an automated software tool to extract full-length and partial SSU sequences from larger sequence datasets and assign them to an archaeal, bacterial, nuclear eukaryote, mitochondrial, or chloroplast origin. Using data from reference databases and from full-length organelle and organism genomes, we show that Metaxa detects and scores SSU sequences for origin with very low proportions of false positives and negatives. We believe that this tool will be useful in microbial and evolutionary ecology as well as in metagenomics.},
Doi = {10.1007/s10482-011-9598-6},
Institution = {Department of Plant and Environmental Sciences, University of Gothenburg, Göteborg, Sweden. johan@microbiology.se},
Keywords = {Archaea, genetics/isolation \& purification; Bacteria, genetics/isolation \& purification; Chloroplasts, genetics; Databases, Nucleic Acid; Eukaryota, genetics/isolation \& purification; Metagenome; Metagenomics, instrumentation/methods; Mitochondria, genetics; Phylogeny; Ribosome Subunits, Small, genetics; Sequence Alignment; Software},
Language = {eng},
Medline-pst = {ppublish},
Owner = {shimotsuki},
Pmid = {21674231},
Timestamp = {2015.01.30},
Url = {http://dx.doi.org/10.1007/s10482-011-9598-6}
}
% Software for detecting and extracting ITS1 and ITS2 from ribosomal ITS sequences of fungi and other eukaryotes.
@Article{Bengtsson2013,
Title = {Improved software detection and extraction of {ITS1} and {ITS2} from ribosomal {ITS} sequences of fungi and other eukaryotes for analysis of environmental sequencing data},
Author = {Bengtsson-Palme, Johan and Ryberg, Martin and Hartmann, Martin and Branco, Sara and Wang, Zheng and Godhe, Anna and De Wit, Pierre and S{\'a}nchez-Garc{\'\i}a, Marisol and Ebersberger, Ingo and de Sousa, Filipe and Amend, Anthony and Jumpponen, Ari and Unterseher, Martin and Kristiansson, Erik and Abarenkov, Kessy and Bertrand, Yann J. K. and Sanli, Kemal and Eriksson, K. Martin and Vik, Unni and Veldre, Vilmar and Nilsson, R. Henrik},
Journal = {Methods in Ecology and Evolution},
Year = {2013},
Number = {10},
Pages = {914--919},
Volume = {4},
Doi = {10.1111/2041-210X.12073},
ISSN = {2041-210X},
Keywords = {fungi, molecular ecology, next-generation sequencing, Perl, ribosomal DNA},
Url = {http://dx.doi.org/10.1111/2041-210X.12073}
}
% Expected-error read filtering, read-pair assembly and amplicon error correction, implemented in USEARCH.
@Article{Edgar2015,
Title = {Error filtering, pair assembly and error correction for next-generation sequencing reads},
Author = {Edgar, Robert C. and Flyvbjerg, Henrik},
Journal = {Bioinformatics},
Year = {2015},
Month = nov,
Number = {21},
Pages = {3476--3482},
Volume = {31},
__markedentry = {[shimotsuki:]},
Abstract = {Next-generation sequencing produces vast amounts of data with errors that are difficult to distinguish from true biological variation when coverage is low. We demonstrate large reductions in error frequencies, especially for high-error-rate reads, by three independent means: (i) filtering reads according to their expected number of errors, (ii) assembling overlapping read pairs and (iii) for amplicon reads, by exploiting unique sequence abundances to perform error correction. We also show that most published paired read assemblers calculate incorrect posterior quality scores. These methods are implemented in the USEARCH package. Binaries are freely available at http://drive5.com/usearch. Contact: robert@drive5.com. Supplementary data are available at Bioinformatics online.},
Doi = {10.1093/bioinformatics/btv401},
Institution = {Department of Micro- and Nanotechnology, Technical University of Denmark, DK-2800 Lyngby, Denmark.},
Language = {eng},
Medline-pst = {ppublish},
Owner = {shimotsuki},
Pii = {btv401},
Pmid = {26139637},
Timestamp = {2016.01.26},
Url = {http://dx.doi.org/10.1093/bioinformatics/btv401}
}
% UCHIME: chimera detection using either a reference database or de novo abundance data.
@Article{Edgar2011,
Title = {{UCHIME} improves sensitivity and speed of chimera detection},
Author = {Edgar, Robert C. and Haas, Brian J. and Clemente, Jose C. and Quince, Christopher and Knight, Rob},
Journal = {Bioinformatics},
Year = {2011},
Month = aug,
Number = {16},
Pages = {2194--2200},
Volume = {27},
Abstract = {Chimeric DNA sequences often form during polymerase chain reaction amplification, especially when sequencing single regions (e.g. 16S rRNA or fungal Internal Transcribed Spacer) to assess diversity or compare populations. Undetected chimeras may be misinterpreted as novel species, causing inflated estimates of diversity and spurious inferences of differences between populations. Detection and removal of chimeras is therefore of critical importance in such experiments. We describe UCHIME, a new program that detects chimeric sequences with two or more segments. UCHIME either uses a database of chimera-free sequences or detects chimeras de novo by exploiting abundance data. UCHIME has better sensitivity than ChimeraSlayer (previously the most sensitive database method), especially with short, noisy sequences. In testing on artificial bacterial communities with known composition, UCHIME de novo sensitivity is shown to be comparable to Perseus. UCHIME is >100× faster than Perseus and >1000× faster than ChimeraSlayer. Contact: robert@drive5.com. Source, binaries and data: http://drive5.com/uchime. Supplementary data are available at Bioinformatics online.},
Doi = {10.1093/bioinformatics/btr381},
Institution = {Tiburon, CA, USA. robert@drive5.com},
Keywords = {Algorithms; Artifacts; Computational Biology; Polymerase Chain Reaction; Sequence Analysis, DNA, methods; Software},
Owner = {shimotsuki},
Pii = {btr381},
Pmid = {21700674},
Timestamp = {2012.05.29},
Url = {http://dx.doi.org/10.1093/bioinformatics/btr381}
}
% Dual-indexing 16S rRNA amplicon approach for Illumina MiSeq using 0 to 7 bp heterogeneity spacers.
@Article{Fadrosh2014,
Title = {An improved dual-indexing approach for multiplexed {16S} {rRNA} gene sequencing on the {Illumina} {MiSeq} platform},
Author = {Fadrosh, Douglas W. and Ma, Bing and Gajer, Pawel and Sengamalay, Naomi and Ott, Sandra and Brotman, Rebecca M. and Ravel, Jacques},
Journal = {Microbiome},
Year = {2014},
Number = {1},
Pages = {6},
Volume = {2},
Abstract = {To take advantage of affordable high-throughput next-generation sequencing technologies to characterize microbial community composition often requires the development of improved methods to overcome technical limitations inherent to the sequencing platforms. Sequencing low sequence diversity libraries such as 16S rRNA amplicons has been problematic on the Illumina MiSeq platform and often generates sequences of suboptimal quality. Here we present an improved dual-indexing amplification and sequencing approach to assess the composition of microbial communities from clinical samples using the V3-V4 region of the 16S rRNA gene on the Illumina MiSeq platform. We introduced a 0 to 7 bp "heterogeneity spacer" to the index sequence that allows an equal proportion of samples to be sequenced out of phase. Our approach yields high quality sequence data from 16S rRNA gene amplicons using both 250 bp and 300 bp paired-end MiSeq protocols and provides a flexible and cost-effective sequencing option.},
Doi = {10.1186/2049-2618-2-6},
Institution = {Institute for Genome Sciences, Department of Microbiology and Immunology, University of Maryland School of Medicine, 801 W, Baltimore Street, Baltimore, MD 21201, USA. jravel@som.umaryland.edu.},
Language = {eng},
Medline-pst = {epublish},
Owner = {shimotsuki},
Pii = {2049-2618-2-6},
Pmid = {24558975},
Timestamp = {2015.11.13},
Url = {http://dx.doi.org/10.1186/2049-2618-2-6}
}
% Error-correcting DNA barcodes for multiplexed pyrosequencing of many samples in one run.
@Article{Hamady2008,
Title = {Error-correcting barcoded primers for pyrosequencing hundreds of samples in multiplex},
Author = {Hamady, Micah and Walker, Jeffrey J. and Harris, J. Kirk and Gold, Nicholas J. and Knight, Rob},
Journal = {Nature Methods},
Year = {2008},
Month = mar,
Number = {3},
Pages = {235--237},
Volume = {5},
Abstract = {We constructed error-correcting DNA barcodes that allow one run of a massively parallel pyrosequencer to process up to 1,544 samples simultaneously. Using these barcodes we processed bacterial 16S rRNA gene sequences representing microbial communities in 286 environmental samples, corrected 92\% of sample assignment errors, and thus characterized nearly as many 16S rRNA genes as have been sequenced to date by Sanger sequencing.},
Doi = {10.1038/nmeth.1184},
Institution = {Department of Computer Science, UCB 430, University of Colorado, Boulder, Colorado 80309, USA.},
Keywords = {DNA Primers, chemistry; Genetic Code; RNA, Bacterial, chemistry; RNA, Ribosomal, 16S, chemistry; Sequence Analysis, DNA, methods},
Owner = {shimotsuki},
Pii = {nmeth.1184},
Pmid = {18264105},
Timestamp = {2012.05.29},
Url = {http://dx.doi.org/10.1038/nmeth.1184}
}
% MEGAN, software for the analysis of metagenomic data.
@Article{Huson2007,
Title = {{MEGAN} analysis of metagenomic data},
Author = {Huson, Daniel H. and Auch, Alexander F. and Qi, Ji and Schuster, Stephan C.},
Journal = {Genome Research},
Year = {2007},
Month = mar,
Number = {3},
Pages = {377--386},
Volume = {17},
Doi = {10.1101/gr.5969107},
Owner = {shimotsuki},
Pii = {gr.5969107},
Pmid = {17255551},
Timestamp = {2012.05.30},
Url = {http://dx.doi.org/10.1101/gr.5969107}
}
% Illumina document on 16S metagenomic sequencing library preparation (corporate author).
@Misc{Illumina2013,
Title = {{16S} metagenomic sequencing library preparation},
Author = {{Illumina~corporation}},
Year = {2013},
Owner = {shimotsuki},
Timestamp = {2015.01.30}
}
% MAFFT version 7 multiple sequence alignment program.
@Article{Katoh2013,
Title = {{MAFFT} multiple sequence alignment software version 7: improvements in performance and usability},
Author = {Katoh, Kazutaka and Standley, Daron M.},
Journal = {Molecular Biology and Evolution},
Year = {2013},
Month = apr,
Number = {4},
Pages = {772--780},
Volume = {30},
Abstract = {We report a major update of the MAFFT multiple sequence alignment program. This version has several new features, including options for adding unaligned sequences into an existing alignment, adjustment of direction in nucleotide alignment, constrained alignment and parallel processing, which were implemented after the previous major update. This report shows actual examples to explain how these features work, alone and in combination. Some examples incorrectly aligned by MAFFT are also shown to clarify its limitations. We discuss how to avoid misalignments, and our ongoing efforts to overcome such limitations.},
Doi = {10.1093/molbev/mst010},
Institution = {Immunology Frontier Research Center, Osaka University, Suita, Osaka, Japan. kazutaka.katoh@aist.go.jp},
Keywords = {Algorithms; Amino Acid Sequence; Base Sequence; DNA, Fungal, genetics; DNA, Ribosomal Spacer, genetics; DNA, Ribosomal, genetics; Fungi, genetics; Humans; Models, Genetic; Molecular Sequence Data; Phylogeny; Protein Structure, Tertiary; Quality Improvement; RNA, Bacterial, genetics; Ribonucleases, chemistry/genetics; Ribosome Subunits, Small, Bacterial, genetics; Sequence Alignment, methods; Software},
Language = {eng},
Medline-pst = {ppublish},
Owner = {shimotsuki},
Pii = {mst010},
Pmid = {23329690},
Timestamp = {2015.02.05},
Url = {http://dx.doi.org/10.1093/molbev/mst010}
}
% Pyrosequencing errors inflate 16S diversity estimates; recommends stringent quality trimming and clustering at no more than 97 percent identity.
@Article{Kunin2010,
Title = {Wrinkles in the rare biosphere: pyrosequencing errors can lead to artificial inflation of diversity estimates},
Author = {Kunin, Victor and Engelbrektson, Anna and Ochman, Howard and Hugenholtz, Philip},
Journal = {Environmental Microbiology},
Year = {2010},
Month = jan,
Number = {1},
Pages = {118--123},
Volume = {12},
Abstract = {Massively parallel pyrosequencing of the small subunit (16S) ribosomal RNA gene has revealed that the extent of rare microbial populations in several environments, the 'rare biosphere', is orders of magnitude higher than previously thought. One important caveat with this method is that sequencing error could artificially inflate diversity estimates. Although the per-base error of 16S rDNA amplicon pyrosequencing has been shown to be as good as or lower than Sanger sequencing, no direct assessments of pyrosequencing errors on diversity estimates have been reported. Using only Escherichia coli MG1655 as a reference template, we find that 16S rDNA diversity is grossly overestimated unless relatively stringent read quality filtering and low clustering thresholds are applied. In particular, the common practice of removing reads with unresolved bases and anomalous read lengths is insufficient to ensure accurate estimates of microbial diversity. Furthermore, common and reproducible homopolymer length errors can result in relatively abundant spurious phylotypes further confounding data interpretation. We suggest that stringent quality-based trimming of 16S pyrotags and clustering thresholds no greater than 97\% identity should be used to avoid overestimates of the rare biosphere.},
Doi = {10.1111/j.1462-2920.2009.02051.x},
Institution = {Microbial Ecology Program, DOE Joint Genome Institute, Walnut Creek, CA 94598, USA.},
Keywords = {Biodiversity; Cluster Analysis; DNA, Bacterial, genetics; Escherichia coli, genetics; Genes, Bacterial; Genetic Variation; RNA, Ribosomal, 16S, genetics; Sequence Alignment; Sequence Analysis, DNA, methods},
Owner = {shimotsuki},
Pii = {EMI2051},
Pmid = {19725865},
Timestamp = {2012.05.29},
Url = {http://dx.doi.org/10.1111/j.1462-2920.2009.02051.x}
}
% AmpliconDuo split-sample filter: discards sequences not found in both independent PCR and sequencing branches.
@Article{Lange2015,
Title = {{AmpliconDuo}: A Split-Sample Filtering Protocol for High-Throughput Amplicon Sequencing of Microbial Communities},
Author = {Lange, Anja and Jost, Steffen and Heider, Dominik and Bock, Christina and Budeus, Bettina and Schilling, Elmar and Strittmatter, Axel and Boenigk, Jens and Hoffmann, Daniel},
Journal = {PLoS One},
Year = {2015},
Number = {11},
Pages = {e0141590},
Volume = {10},
Abstract = {High throughput sequencing (HTSeq) of small ribosomal subunit amplicons has the potential for a comprehensive characterization of microbial community compositions, down to rare species. However, the error-prone nature of the multi-step experimental process requires that the resulting raw sequences are subjected to quality control procedures. These procedures often involve an abundance cutoff for rare sequences or clustering of sequences, both of which limit genetic resolution. Here we propose a simple experimental protocol that retains the high genetic resolution granted by HTSeq methods while effectively removing many low abundance sequences that are likely due to PCR and sequencing errors. According to this protocol, we split samples and submit both halves to independent PCR and sequencing runs. The resulting sequence data is graphically and quantitatively characterized by the discordance between the two experimental branches, allowing for a quick identification of problematic samples. Further, we discard sequences that are not found in both branches ("AmpliconDuo filter"). We show that the majority of sequences removed in this way, mostly low abundance but also some higher abundance sequences, show features expected from random modifications of true sequences as introduced by PCR and sequencing errors. On the other hand, the filter retains many low abundance sequences observed in both branches and thus provides a more reliable census of the rare biosphere. We find that the AmpliconDuo filter increases biological resolution as it increases apparent community similarity between biologically similar communities, while it does not affect apparent community similarities between biologically dissimilar communities. The filter does not distort overall apparent community compositions. Finally, we quantitatively explain the effect of the AmpliconDuo filter by a simple mathematical model.},
Doi = {10.1371/journal.pone.0141590},
Institution = {Research Group Bioinformatics, Faculty of Biology, University of Duisburg-Essen, Essen, Germany.},
Language = {eng},
Medline-pst = {epublish},
Owner = {shimotsuki},
Pii = {PONE-D-15-28442},
Pmid = {26523925},
Timestamp = {2015.11.13},
Url = {http://dx.doi.org/10.1371/journal.pone.0141590}
}
% Clustal W and Clustal X 2.0 multiple sequence alignment programs (rewritten in C++).
@Article{Larkin2007,
Title = {{Clustal W} and {Clustal X} version 2.0},
Author = {Larkin, M. A. and Blackshields, G. and Brown, N. P. and Chenna, R. and McGettigan, P. A. and McWilliam, H. and Valentin, F. and Wallace, I. M. and Wilm, A. and Lopez, R. and Thompson, J. D. and Gibson, T. J. and Higgins, D. G.},
Journal = {Bioinformatics},
Year = {2007},
Month = nov,
Number = {21},
Pages = {2947--2948},
Volume = {23},
Abstract = {The Clustal W and Clustal X multiple sequence alignment programs have been completely rewritten in C++. This will facilitate the further development of the alignment algorithms in the future and has allowed proper porting of the programs to the latest versions of Linux, Macintosh and Windows operating systems. The programs can be run on-line from the EBI web server: http://www.ebi.ac.uk/tools/clustalw2. The source code and executables for Windows, Linux and Macintosh computers are available from the EBI ftp site ftp://ftp.ebi.ac.uk/pub/software/clustalw2/},
Doi = {10.1093/bioinformatics/btm404},
Institution = {The Conway Institute of Biomolecular and Biomedical Research, University College Dublin, Belfield, Dublin 4, Ireland.},
Keywords = {Algorithms; Amino Acid Sequence; Cluster Analysis; Computer Graphics; Molecular Sequence Data; Programming Languages; Sequence Alignment, methods; Sequence Analysis, Protein, methods; Software; User-Computer Interface},
Language = {eng},
Medline-pst = {ppublish},
Owner = {shimotsuki},
Pii = {btm404},
Pmid = {17846036},
Timestamp = {2015.02.05},
Url = {http://dx.doi.org/10.1093/bioinformatics/btm404}
}
% Sequence clustering algorithms for metagenomic sequence analysis.
@Article{Li2012,
Title = {Ultrafast clustering algorithms for metagenomic sequence analysis},
Author = {Li, Weizhong and Fu, Limin and Niu, Beifang and Wu, Sitao and Wooley, John},
Journal = {Briefings in Bioinformatics},
Year = {2012},
Month = nov,
Number = {6},
Pages = {656--668},
Volume = {13},
Abstract = {The rapid advances of high-throughput sequencing technologies dramatically prompted metagenomic studies of microbial communities that exist at various environments. Fundamental questions in metagenomics include the identities, composition and dynamics of microbial populations and their functions and interactions. However, the massive quantity and the comprehensive complexity of these sequence data pose tremendous challenges in data analysis. These challenges include but are not limited to ever-increasing computational demand, biased sequence sampling, sequence errors, sequence artifacts and novel sequences. Sequence clustering methods can directly answer many of the fundamental questions by grouping similar sequences into families. In addition, clustering analysis also addresses the challenges in metagenomics. Thus, a large redundant data set can be represented with a small non-redundant set, where each cluster can be represented by a single entry or a consensus. Artifacts can be rapidly detected through clustering. Errors can be identified, filtered or corrected by using consensus from sequences within clusters.},
Doi = {10.1093/bib/bbs035},
Institution = {Center for Research in Biological Systems, University of California San Diego, USA. liwz@sdsc.edu},
Keywords = {Algorithms; Cluster Analysis; Metagenome; Metagenomics; Sequence Analysis, DNA},
Language = {eng},
Medline-pst = {ppublish},
Owner = {shimotsuki},
Pii = {bbs035},
Pmid = {22772836},
Timestamp = {2014.09.11},
Url = {http://dx.doi.org/10.1093/bib/bbs035}
}
% FLASH: extends short reads by overlapping and merging paired-end reads.
@Article{Magoc2011,
Title = {{FLASH}: fast length adjustment of short reads to improve genome assemblies},
Author = {Mago{\v{c}}, Tanja and Salzberg, Steven L.},
Journal = {Bioinformatics},
Year = {2011},
Month = nov,
Number = {21},
Pages = {2957--2963},
Volume = {27},
__markedentry = {[shimotsuki:6]},
Abstract = {Next-generation sequencing technologies generate very large numbers of short reads. Even with very deep genome coverage, short read lengths cause problems in de novo assemblies. The use of paired-end libraries with a fragment size shorter than twice the read length provides an opportunity to generate much longer reads by overlapping and merging read pairs before assembling a genome. We present FLASH, a fast computational tool to extend the length of short reads by overlapping paired-end reads from fragment libraries that are sufficiently short. We tested the correctness of the tool on one million simulated read pairs, and we then applied it as a pre-processor for genome assemblies of Illumina reads from the bacterium Staphylococcus aureus and human chromosome 14. FLASH correctly extended and merged reads >99\% of the time on simulated reads with an error rate of <1\%. With adequately set parameters, FLASH correctly merged reads over 90\% of the time even when the reads contained up to 5\% errors. When FLASH was used to extend reads prior to assembly, the resulting assemblies had substantially greater N50 lengths for both contigs and scaffolds. The FLASH system is implemented in C and is freely available as open-source code at http://www.cbcb.umd.edu/software/flash. Contact: t.magoc@gmail.com.},
Doi = {10.1093/bioinformatics/btr507},
Institution = {McKusick-Nathans Institute of Genetic Medicine, Johns Hopkins University School of Medicine, Baltimore, MD 21205, USA. t.magoc@gmail.com},
Keywords = {Chromosomes, Human, Pair 14; Genome; Genomics, methods; Humans; Sequence Analysis, DNA; Software; Staphylococcus aureus, genetics},
Language = {eng},
Medline-pst = {ppublish},
Owner = {shimotsuki},
Pii = {btr507},
Pmid = {21903629},
Timestamp = {2016.01.26},
Url = {http://dx.doi.org/10.1093/bioinformatics/btr507}
}
% PANDAseq: assembler for overlapping Illumina paired-end reads with error correction.
@Article{Masella2012,
Title = {{PANDAseq}: paired-end assembler for illumina sequences},
Author = {Masella, Andre P. and Bartram, Andrea K. and Truszkowski, Jakub M. and Brown, Daniel G. and Neufeld, Josh D.},
Journal = {BMC Bioinformatics},
Year = {2012},
Pages = {31},
Volume = {13},
Abstract = {Illumina paired-end reads are used to analyse microbial communities by targeting amplicons of the 16S rRNA gene. Publicly available tools are needed to assemble overlapping paired-end reads while correcting mismatches and uncalled bases; many errors could be corrected to obtain higher sequence yields using quality information. PANDAseq assembles paired-end reads rapidly and with the correction of most errors. Uncertain error corrections come from reads with many low-quality bases identified by upstream processing. Benchmarks were done using real error masks on simulated data, a pure source template, and a pooled template of genomic DNA from known organisms. PANDAseq assembled reads more rapidly and with reduced error incorporation compared to alternative methods. PANDAseq rapidly assembles sequences and scales to billions of paired-end reads. Assembly of control libraries showed a 4-50\% increase in the number of assembled sequences over naive assembly with negligible loss of "good" sequence.},
Doi = {10.1186/1471-2105-13-31},
Institution = {Department of Biology, University of Waterloo, Waterloo, Ontario, Canada.},
Keywords = {Bacteria, genetics/isolation \& purification; Metagenomics; RNA, Bacterial, genetics; RNA, Ribosomal, 16S, genetics; Software},
Language = {eng},
Medline-pst = {epublish},
Owner = {shimotsuki},
Pii = {1471-2105-13-31},
Pmid = {22333067},
Timestamp = {2013.05.14},
Url = {http://dx.doi.org/10.1186/1471-2105-13-31}
}
% Comparison of 454 and Illumina MiSeq 16S rRNA amplicon sequencing, with a recommended analysis pipeline.
@Article{Nelson2014,
Title = {Analysis, optimization and verification of {Illumina}-generated {16S} {rRNA} gene amplicon surveys},
Author = {Nelson, Michael C. and Morrison, Hilary G. and Benjamino, Jacquelynn and Grim, Sharon L. and Graf, Joerg},
Journal = {PLoS One},
Year = {2014},
Number = {4},
Pages = {e94249},
Volume = {9},
Abstract = {The exploration of microbial communities by sequencing 16S rRNA genes has expanded with low-cost, high-throughput sequencing instruments. Illumina-based 16S rRNA gene sequencing has recently gained popularity over 454 pyrosequencing due to its lower costs, higher accuracy and greater throughput. Although recent reports suggest that Illumina and 454 pyrosequencing provide similar beta diversity measures, it remains to be demonstrated that pre-existing 454 pyrosequencing workflows can transfer directly from 454 to Illumina MiSeq sequencing by simply changing the sequencing adapters of the primers. In this study, we modified 454 pyrosequencing primers targeting the V4-V5 hyper-variable regions of the 16S rRNA gene to be compatible with Illumina sequencers. Microbial communities from cows, humans, leeches, mice, sewage, and termites and a mock community were analyzed by 454 and MiSeq sequencing of the V4-V5 region and MiSeq sequencing of the V4 region. Our analysis revealed that reference-based OTU clustering alone introduced biases compared to de novo clustering, preventing certain taxa from being observed in some samples. Based on this we devised and recommend an analysis pipeline that includes read merging, contaminant filtering, and reference-based clustering followed by de novo OTU clustering, which produces diversity measures consistent with de novo OTU clustering analysis. Low levels of dataset contamination with Illumina sequencing were discovered that could affect analyses that require highly sensitive approaches. While moving to Illumina-based sequencing platforms promises to provide deeper insights into the breadth and function of microbial diversity, our results show that care must be taken to ensure that sequencing and processing artifacts do not obscure true microbial diversity.},
Doi = {10.1371/journal.pone.0094249},
Institution = {Department of Molecular and Cell Biology, University of Connecticut, Storrs, Connecticut, United States of America.},
Keywords = {Animals; Artifacts; Bacteria, genetics; Cattle; Cluster Analysis; Computational Biology; DNA, Bacterial, genetics; Databases, Factual; Gene Library; Genes, rRNA; High-Throughput Nucleotide Sequencing, methods; Humans; Isoptera; Leeches; Mice; Principal Component Analysis; RNA, Ribosomal, 16S, genetics; Reproducibility of Results; Sequence Analysis, RNA, methods},
Language = {eng},
Medline-pst = {epublish},
Owner = {shimotsuki},
Pii = {PONE-D-13-49866},
Pmid = {24722003},
Timestamp = {2015.11.13},
Url = {http://dx.doi.org/10.1371/journal.pone.0094249}
}
% PRINSEQ: quality control and preprocessing of genomic and metagenomic datasets.
@Article{Schmieder2011,
Title = {Quality control and preprocessing of metagenomic datasets},
Author = {Schmieder, Robert and Edwards, Robert},
Journal = {Bioinformatics},
Year = {2011},
Month = mar,
Number = {6},
Pages = {863--864},
Volume = {27},
Abstract = {Here, we present PRINSEQ for easy and rapid quality control and data preprocessing of genomic and metagenomic datasets. Summary statistics of FASTA (and QUAL) or FASTQ files are generated in tabular and graphical form and sequences can be filtered, reformatted and trimmed by a variety of options to improve downstream analysis. This open-source application was implemented in Perl and can be used as a stand alone version or accessed online through a user-friendly web interface. The source code, user help and additional information are available at http://prinseq.sourceforge.net/.},
Doi = {10.1093/bioinformatics/btr026},
Institution = {Department of Computer Science, Computational Science Research Center, San Diego State University, San Diego, CA 92182, USA. rschmied@sciences.sdsu.edu},
Keywords = {Computer Graphics; Information Storage and Retrieval, methods; Internet; Metagenomics; Programming Languages; Quality Control; Sequence Analysis, DNA, methods; Software},
Language = {eng},
Medline-pst = {ppublish},
Owner = {shimotsuki},
Pii = {btr026},
Pmid = {21278185},
Timestamp = {2015.02.02},
Url = {http://dx.doi.org/10.1093/bioinformatics/btr026}
}
% Reports that a slower thermocycler ramp speed reduces PCR chimera formation.
@Article{Stevens2013,
Title = {Slowing {PCR} ramp speed reduces chimera formation from environmental samples},
Author = {Stevens, Julia L. and Jackson, Ronneshia L. and Olson, Julie B.},
Journal = {Journal of Microbiological Methods},
Year = {2013},
Month = jun,
Number = {3},
Pages = {203--205},
Volume = {93},
Abstract = {Chimeric sequences falsely increase the apparent diversity within samples. To examine chimera formation in PCR products from environmental DNA, clone libraries were prepared using different ramp speeds to reach the designated temperatures for each step of the PCR program. Slowing the thermocycler ramp speed to 1 °C s$^{-1}$ reduced chimera formation.},
Doi = {10.1016/j.mimet.2013.03.013},
Institution = {Department of Biological Sciences, Campus Box 870344, University of Alabama, Tuscaloosa, AL 35487, United States.},
Keywords = {Artifacts; Environmental Microbiology; Molecular Biology, methods; Molecular Sequence Data; Polymerase Chain Reaction, methods; Sequence Analysis, DNA; Specimen Handling, methods; Temperature; Time Factors},
Language = {eng},
Medline-pst = {ppublish},
Owner = {shimotsuki},
Pii = {S0167-7012(13)00107-3},
Pmid = {23541956},
Timestamp = {2015.01.29},
Url = {http://dx.doi.org/10.1016/j.mimet.2013.03.013}
}
% Tanabe and Toju (2013), PLoS One 8(10): benchmark of computational DNA barcoding methods (1-NN, QCauto).
% The acronym in Title is brace-protected so sentence-casing styles keep its capitals;
% the final period is left to the bibliography style.
@Article{Tanabe2013,
  Title = {Two new computational methods for universal {DNA} barcoding: a benchmark using barcode sequences of bacteria, archaea, animals, fungi, and land plants},
  Author = {Tanabe, Akifumi S. and Toju, Hirokazu},
  Journal = {PLoS One},
  Year = {2013},
  Number = {10},
  Pages = {e76910},
  Volume = {8},
  Abstract = {Taxonomic identification of biological specimens based on DNA sequence information (a.k.a. DNA barcoding) is becoming increasingly common in biodiversity science. Although several methods have been proposed, many of them are not universally applicable due to the need for prerequisite phylogenetic/machine-learning analyses, the need for huge computational resources, or the lack of a firm theoretical background. Here, we propose two new computational methods of DNA barcoding and show a benchmark for bacterial/archeal 16S, animal COX1, fungal internal transcribed spacer, and three plant chloroplast (rbcL, matK, and trnH-psbA) barcode loci that can be used to compare the performance of existing and new methods. The benchmark was performed under two alternative situations: query sequences were available in the corresponding reference sequence databases in one, but were not available in the other. In the former situation, the commonly used "1-nearest-neighbor" (1-NN) method, which assigns the taxonomic information of the most similar sequences in a reference database (i.e., BLAST-top-hit reference sequence) to a query, displays the highest rate and highest precision of successful taxonomic identification. However, in the latter situation, the 1-NN method produced extremely high rates of misidentification for all the barcode loci examined. In contrast, one of our new methods, the query-centric auto-k-nearest-neighbor (QCauto) method, consistently produced low rates of misidentification for all the loci examined in both situations. These results indicate that the 1-NN method is most suitable if the reference sequences of all potentially observable species are available in databases; otherwise, the QCauto method returns the most reliable identification results. The benchmark results also indicated that the taxon coverage of reference sequences is far from complete for genus or species level identification in all the barcode loci examined. Therefore, we need to accelerate the registration of reference barcode sequences to apply high-throughput DNA barcoding to genus or species level identification in biodiversity research.},
  Doi = {10.1371/journal.pone.0076910},
  Institution = {Graduate School of Global Environmental Studies, Kyoto University, Kyoto, Kyoto, Japan ; Research Center for Aquatic Genomics, National Research Institute of Fisheries Science, Fisheries Research Agency, Yokohama, Kanagawa, Japan.},
  Keywords = {Animals; Archaea, classification/genetics; Bacteria, classification/genetics; Benchmarking, methods; Computational Biology, methods; DNA Barcoding, Taxonomic, methods; DNA, Ribosomal Spacer, genetics; DNA, analysis/genetics; Electron Transport Complex IV, genetics; Embryophyta, classification/genetics; Fungi, classification/genetics; Genes, Chloroplast, genetics; RNA, Ribosomal, 16S, genetics; Reproducibility of Results},
  Language = {eng},
  Medline-pst = {epublish},
  Owner = {shimotsuki},
  Pii = {PONE-D-13-21549},
  Pmid = {24204702},
  Timestamp = {2015.01.30},
  Url = {https://doi.org/10.1371/journal.pone.0076910}
}
% Zhang, Kobert, Flouri and Stamatakis (2014), Bioinformatics 30(5): the PEAR paired-end read merger.
% The title's capitalisation spells out the acronym (Paired-End reAd mergeR), so every
% mixed-case word is brace-protected against sentence-casing styles; the final period
% is left to the bibliography style.
@Article{Zhang2014,
  Title = {{PEAR}: a fast and accurate {Illumina} {Paired-End} {reAd} {mergeR}},
  Author = {Zhang, Jiajie and Kobert, Kassian and Flouri, Tom{\'{a}}{\v{s}} and Stamatakis, Alexandros},
  Journal = {Bioinformatics},
  Year = {2014},
  Month = mar,
  Number = {5},
  Pages = {614--620},
  Volume = {30},
  Abstract = {The Illumina paired-end sequencing technology can generate reads from both ends of target DNA fragments, which can subsequently be merged to increase the overall read length. There already exist tools for merging these paired-end reads when the target fragments are equally long. However, when fragment lengths vary and, in particular, when either the fragment size is shorter than a single-end read, or longer than twice the size of a single-end read, most state-of-the-art mergers fail to generate reliable results. Therefore, a robust tool is needed to merge paired-end reads that exhibit varying overlap lengths because of varying target fragment lengths. We present the PEAR software for merging raw Illumina paired-end reads from target fragments of varying length. The program evaluates all possible paired-end read overlaps and does not require the target fragment size as input. It also implements a statistical test for minimizing false-positive results. Tests on simulated and empirical data show that PEAR consistently generates highly accurate merged paired-end reads. A highly optimized implementation allows for merging millions of paired-end reads within a few minutes on a standard desktop computer. On multi-core architectures, the parallel version of PEAR shows linear speedups compared with the sequential version of PEAR. PEAR is implemented in C and uses POSIX threads. It is freely available at http://www.exelixis-lab.org/web/software/pear.},
  Doi = {10.1093/bioinformatics/btt593},
  Institution = {The Exelixis Lab, Scientific Computing Group, Heidelberg Institute for Theoretical Studies, Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Graduate School for Computing in Medicine and Life Sciences, Institut für Neuro- und Bioinformatik, University of Lübeck, 23538 Lübeck and Karlsruhe Institute of Technology, Institute for Theoretical Informatics, Postfach 6980, 76128 Karlsruhe, Germany.},
  Keywords = {Genomics; High-Throughput Nucleotide Sequencing, methods; Sequence Alignment; Sequence Analysis, DNA, methods; Software; Staphylococcus aureus, genetics},
  Language = {eng},
  Medline-pst = {ppublish},
  Owner = {shimotsuki},
  Pii = {btt593},
  Pmid = {24142950},
  Timestamp = {2015.01.30},
  Url = {https://doi.org/10.1093/bioinformatics/btt593}
}