diff --git a/2024/ebaiin1/chip-seq/README.md b/2024/ebaiin1/chip-seq/README.md
new file mode 100644
index 0000000..df293f7
--- /dev/null
+++ b/2024/ebaiin1/chip-seq/README.md
@@ -0,0 +1,38 @@
+## EBAII 2024 - ChIP-seq course
+
+### Hands-on
+Go to [hands-on](hands-on/hands-on.html)
+
+### Planning
+
+#### Tuesday
+
+
+| **Start** | **End** | **Duration** | **Topics** | **Teacher** |
+| -------- | --------- | --------- | ----------- | ----------- |
+| 10:45 | 12:45 | 02:00 | Experimental design | Stéphanie Le Gras |
+| 12:45 | 14:30 | 01:45 | **Lunch Break** | |
+| 14:30 | 16:00 | 01:30 | Mapping | Stéphanie Le Gras |
+| 16:00 | 16:30 | 00:30 | **Break** | |
+| 16:30 | 17:45 | 01:15 | Mapping QC, Visualization | Tao Ye |
+| 17:45 | 19:00 | 01:15 | Mapping QC, Visualization, Peak calling | Tao Ye |
+
+
+#### Wednesday
+
+
+| **Start** | **End** | **Duration** | **Topics** | **Teacher** |
+| -------- | --------- | --------- | ----------- | ----------- |
+| 10:45 | 12:45 | 02:00 | Motif analysis | Morgane THOMAS-CHOLLIER |
+| 12:45 | 14:30 | 01:45 | **Lunch Break** | |
+| 14:30 | 16:00 | 01:30 | Free time | |
+| 16:00 | 16:30 | 00:30 | **Break** | |
+| 16:30 | 17:45 | 01:15 | Peak annotation | Elodie Darbo |
+
+
+#### Thursday
+
+
+| **Start** | **End** | **Duration** | **Topics** | **Teacher** |
+| -------- | --------- | --------- | ----------- | ----------- |
+| 08:30 | 10:15 | 01:45 | Scripting / Workflow | Elodie Darbo |
diff --git a/2024/ebaiin1/chip-seq/_config.yml b/2024/ebaiin1/chip-seq/_config.yml
new file mode 100644
index 0000000..c419263
--- /dev/null
+++ b/2024/ebaiin1/chip-seq/_config.yml
@@ -0,0 +1 @@
+theme: jekyll-theme-cayman
\ No newline at end of file
diff --git a/2024/ebaiin1/chip-seq/hands-on/07-PeakAnnotation-bonus/GSM348064_p300_peaks.txt.gz b/2024/ebaiin1/chip-seq/hands-on/07-PeakAnnotation-bonus/GSM348064_p300_peaks.txt.gz
new file mode 100644
index 0000000..7babcc1
Binary files /dev/null and 
b/2024/ebaiin1/chip-seq/hands-on/07-PeakAnnotation-bonus/GSM348064_p300_peaks.txt.gz differ
diff --git a/2024/ebaiin1/chip-seq/hands-on/07-PeakAnnotation-bonus/GSM348065_p300_peaks.txt.gz b/2024/ebaiin1/chip-seq/hands-on/07-PeakAnnotation-bonus/GSM348065_p300_peaks.txt.gz
new file mode 100644
index 0000000..7dcf5b9
Binary files /dev/null and b/2024/ebaiin1/chip-seq/hands-on/07-PeakAnnotation-bonus/GSM348065_p300_peaks.txt.gz differ
diff --git a/2024/ebaiin1/chip-seq/hands-on/07-PeakAnnotation-bonus/GSM348066_p300_peaks.txt.gz b/2024/ebaiin1/chip-seq/hands-on/07-PeakAnnotation-bonus/GSM348066_p300_peaks.txt.gz
new file mode 100644
index 0000000..1f4123f
Binary files /dev/null and b/2024/ebaiin1/chip-seq/hands-on/07-PeakAnnotation-bonus/GSM348066_p300_peaks.txt.gz differ
diff --git a/2024/ebaiin1/chip-seq/hands-on/cmd.sh b/2024/ebaiin1/chip-seq/hands-on/cmd.sh
new file mode 100644
index 0000000..b71c64d
--- /dev/null
+++ b/2024/ebaiin1/chip-seq/hands-on/cmd.sh
@@ -0,0 +1,139 @@
+## Set up the working directory and copy the course data into it
+mkdir M2.2-BIMS-epigenomique
+cd M2.2-BIMS-epigenomique
+cp -r ../EBAII2021_chipseq/data .
+
+## Quality control of the raw reads (FastQC, one report per FASTQ file)
+module add fastqc/0.11.9
+
+mkdir 01-QualityControl
+cd 01-QualityControl
+fastqc ../data/SRR576933.fastq.gz -o .
+fastqc ../data/SRR576934.fastq.gz -o .
+fastqc ../data/SRR576938.fastq.gz -o .
+cd ..
+
+## Mapping: build the bowtie index from the reference FASTA
+module add bowtie/1.2.3
+
+mkdir 02-Mapping
+cd 02-Mapping
+mkdir index
+cd index
+bowtie-build ../../data/Escherichia_coli_K12.fasta Escherichia_coli_K12
+cd ..
+mkdir bam
+cd bam  # alignments run as Slurm jobs; --wait blocks until each job ends so the SAM files exist before sorting
+sbatch --wait --cpus-per-task 10 --wrap="bowtie -p 10 ../index/Escherichia_coli_K12 ../../data/SRR576938.fastq.gz -v 2 -m 1 -3 1 -S 2> SRR576938.out > SRR576938.sam"
+sbatch --wait --cpus-per-task 10 --wrap="bowtie -p 10 ../index/Escherichia_coli_K12 ../../data/SRR576934.fastq.gz -v 2 -m 1 -3 1 -S 2> SRR576934.out > SRR576934.sam"
+sbatch --wait --cpus-per-task 10 --wrap="bowtie -p 10 ../index/Escherichia_coli_K12 ../../data/SRR576933.fastq.gz -v 2 -m 1 -3 1 -S 2> SRR576933.out > SRR576933.sam"
+
+# Create sorted BAM files from the SAM alignments
+module add samtools/1.10
+samtools sort -o SRR576933.bam SRR576933.sam
+samtools sort -o SRR576934.bam SRR576934.sam
+samtools sort -o SRR576938.bam SRR576938.sam
+
+# Index the BAM files
+samtools index SRR576934.bam
+samtools index SRR576933.bam
+samtools index SRR576938.bam
+
+# Compress the SAM files (in the background; they are no longer read below)
+gzip SRR576933.sam &
+gzip SRR576934.sam &
+gzip SRR576938.sam &
+
+# Mark duplicates (one metrics file per sample so the runs do not overwrite each other)
+module add picard/2.22.0
+
+picard MarkDuplicates CREATE_INDEX=true INPUT=SRR576933.bam OUTPUT=Marked_SRR576933.bam METRICS_FILE=SRR576933.metrics VALIDATION_STRINGENCY=STRICT
+picard MarkDuplicates CREATE_INDEX=true INPUT=SRR576934.bam OUTPUT=Marked_SRR576934.bam METRICS_FILE=SRR576934.metrics VALIDATION_STRINGENCY=STRICT
+picard MarkDuplicates CREATE_INDEX=true INPUT=SRR576938.bam OUTPUT=Marked_SRR576938.bam METRICS_FILE=SRR576938.metrics VALIDATION_STRINGENCY=STRICT
+
+cd ../..
+
+## ChIP quality control (fingerprint plots, with and without an explicit sample count)
+module add deeptools/3.2.0
+
+mkdir 03-ChIPQualityControls
+cd 03-ChIPQualityControls
+plotFingerprint --numberOfSamples 10000 -b ../02-Mapping/bam/SRR576933.bam ../02-Mapping/bam/SRR576934.bam ../02-Mapping/bam/SRR576938.bam -plot fingerprint_10000.png &
+plotFingerprint -b ../02-Mapping/bam/SRR576933.bam ../02-Mapping/bam/SRR576934.bam ../02-Mapping/bam/SRR576938.bam -plot fingerprint.png &
+
+cd ..
+
+## Generate bigWig coverage files from the duplicate-marked BAM files
+module add deeptools/3.2.0
+
+mkdir 04-Visualization
+cd 04-Visualization/
+bamCoverage --bam ../02-Mapping/bam/Marked_SRR576933.bam --outFileName SRR576933_nodup.bw --outFileFormat bigwig --effectiveGenomeSize 4639675 --normalizeUsing RPGC --skipNonCoveredRegions --extendReads 200 --ignoreDuplicates
+bamCoverage --bam ../02-Mapping/bam/Marked_SRR576934.bam --outFileName SRR576934_nodup.bw --outFileFormat bigwig --effectiveGenomeSize 4639675 --normalizeUsing RPGC --skipNonCoveredRegions --extendReads 200 --ignoreDuplicates
+bamCoverage --bam ../02-Mapping/bam/Marked_SRR576938.bam --outFileName SRR576938_nodup.bw --outFileFormat bigwig --effectiveGenomeSize 4639675 --normalizeUsing RPGC --skipNonCoveredRegions --extendReads 200 --ignoreDuplicates
+
+cd ..
+
+## Peak calling
+module add macs2/2.2.7.1
+
+mkdir 05-PeakCalling
+# Peak calling on each replicate (SRR576938 is passed as the control in every call)
+mkdir 05-PeakCalling/replicates
+cd 05-PeakCalling/replicates
+macs2 callpeak -t ../../02-Mapping/bam/SRR576933.bam -c ../../02-Mapping/bam/SRR576938.bam --format BAM --gsize 4639675 --name 'FNR_Anaerobic_A' --bw 400 --fix-bimodal -p 1e-2 &> repA_MACS.out
+macs2 callpeak -t ../../02-Mapping/bam/SRR576934.bam -c ../../02-Mapping/bam/SRR576938.bam --format BAM --gsize 4639675 --name 'FNR_Anaerobic_B' --bw 400 --fix-bimodal -p 1e-2 &> repB_MACS.out
+cd ..
+
+# Peak calling on the pooled replicates
+mkdir pool
+cd pool
+macs2 callpeak -t ../../02-Mapping/bam/SRR576933.bam ../../02-Mapping/bam/SRR576934.bam -c ../../02-Mapping/bam/SRR576938.bam --format BAM --gsize 4639675 --name 'FNR_Anaerobic_pool' --bw 400 --fix-bimodal -p 1e-2 &> pool_MACS.out
+cd ..
+
+# IDR analysis (both replicate peak sets, with the pooled peaks as the peak list)
+mkdir idr
+cd idr
+idr --samples ../replicates/FNR_Anaerobic_A_peaks.narrowPeak ../replicates/FNR_Anaerobic_B_peaks.narrowPeak --peak-list ../pool/FNR_Anaerobic_pool_peaks.narrowPeak \
+--input-file-type narrowPeak --output-file FNR_anaerobic_idr_peaks.bed --plot
+cd ../..
+
+## Prepare the files for motif analysis (extract the IDR peak sequences as FASTA)
+module add bedtools/2.29.2
+
+mkdir 06-MotifAnalysis
+cd 06-MotifAnalysis
+samtools faidx ../data/Escherichia_coli_K12.fasta
+bedtools getfasta -fi ../data/Escherichia_coli_K12.fasta \
+-bed ../05-PeakCalling/idr/FNR_anaerobic_idr_peaks.bed -fo FNR_anaerobic_idr_peaks.fa
+
+cd ..
+
+## Peak annotation
+module add homer/4.10
+
+mkdir 07-PeakAnnotation
+cd 07-PeakAnnotation
+gunzip ../data/Escherichia_coli_K_12_MG1655.annotation.fixed.gtf.gz
+gunzip ../data/Escherichia_coli_K12.fasta.gz
+
+# Put the peak file in the expected format: keep columns 1-5 and append a "+" strand column
+cut -f1-5 ../05-PeakCalling/idr/FNR_anaerobic_idr_peaks.bed | awk -F "\t" '{print $0"\t+"}' > FNR_anaerobic_idr_peaks.bed
+
+# Run the annotation; the output name must match the file read by the statistics commands below
+annotatePeaks.pl FNR_anaerobic_idr_peaks.bed ../data/Escherichia_coli_K12.fasta -gtf ../data/Escherichia_coli_K_12_MG1655.annotation.fixed.gtf > FNR_anaerobic_idr_final_peaks_annotation.tsv
+
+# Compute statistics on the results (awk splits on whitespace here, not on tabs)
+tail -n 2 FNR_anaerobic_idr_final_peaks_annotation.tsv | awk '{print $11}'
+head FNR_anaerobic_idr_final_peaks_annotation.tsv
+tail -n +2 FNR_anaerobic_idr_final_peaks_annotation.tsv | awk '{print $11}'
+tail -n +2 FNR_anaerobic_idr_final_peaks_annotation.tsv | awk '{print $8}' | sort | uniq -c
+tail -n +2 FNR_anaerobic_idr_final_peaks_annotation.tsv | awk '{if ($8=="promoter-TSS") print $11}'
+tail -n +2 FNR_anaerobic_idr_final_peaks_annotation.tsv | awk '{if ($8=="promoter-TSS") print $11}' | wc -l
+tail -n +2 FNR_anaerobic_idr_final_peaks_annotation.tsv | awk '{if ($8=="promoter-TSS") print $11}' > FNR_anaerobic_idr_final_peaks_annotation_officialGeneSymbols.tsv
+
+# Compress the reference files again
+gzip ../data/Escherichia_coli_K_12_MG1655.annotation.fixed.gtf
+gzip ../data/Escherichia_coli_K12.fasta
+
+cd ..
diff --git a/2024/ebaiin1/chip-seq/hands-on/data/FNR_anaerobic_idr_peaks.fa b/2024/ebaiin1/chip-seq/hands-on/data/FNR_anaerobic_idr_peaks.fa new file mode 100644 index 0000000..7ab94ef --- /dev/null +++ b/2024/ebaiin1/chip-seq/hands-on/data/FNR_anaerobic_idr_peaks.fa @@ -0,0 +1,424 @@ +>gi|49175990|ref|NC_000913.2|:1406753-1407515 +GCAGGATATTGCCGATCGCCTTGGGCGCGGCCTTGACGAAATCGACGCCTGTATAGCACGGACTGGCGTGATGGCGGATGAAATCGCTCAGGTGATGCAGGAAAATTTAGCTCGTCGTACCTATACAATGTCGTTGATGGCAATGGTCTTTTTACCCAGTACCTTTCTGACCGGGTTATTTGGCGTCAACCTTGGTGGGATCCCTGGCGGCGGGTGGCAATTCGGATTTTCAATTTTTTGTATTCTGTTAGTTGTTCTTATTGGTGGTGTTGCTTTATGGTTGCATCGTAGTAAATGGTTGTAACAAAAGCAATTTTTCCGGCTGTCTGTATACAAAAACGCCGCAAAGTTTGAGCGAAGTCAATAAACTCTCTACCCATTCAGGGCAATATCTCTCTTGCAGGTGAATGCAACGTCAAGCGATGGGCGTTGCGCTCCATATTGTCTTACTTCCTTTTTTGAATTACTGCATAGCACAATTGATTCGTACGACGCCGACTTTGATGAGTCGGCTTTTTTTTGCCTGTTATTTATCAGCGTCTACCCTTTAAGAGTCCACCCAATGACCAGAGGGAAATATGACGACACTTATTTATTTGCAAATTCCTGTCCCTGAACCGATTCCTGGCGATCCTGTTCCAGTGCCCGATCCGATCCCTCGCCCGCAACCCATGCCTGACCCACCACCCGATGAAGAACCGATTAAATTGTCGCATCGTGAGCGTAGATCTGCGAGGATACGCGCCTGCTAACTTTGCGTCG +>gi|49175990|ref|NC_000913.2|:1626885-1627560 +TTCGACCACGTTTCTCCACCAGAAATGCTGTTACGCCAGCATCTTGATATTTTCTCTGCCCTGCAAAAACGTGATGGCGATGCGGTAGAACGTGCAATGACGCAACATTTGCAGGAAATCAGCGAATCCGTGCGCCAGATCCGCCAGGAAAACAGCGACTGGTTTAGCGAAGAGTAATTCATTTCCTCTCATCCCATCCGGGGTGAGAGTCTTTTCCCCCGCCTTATGGCTCATGCATGCATCAAAAAAGATGTGAGCTTGATCAAAAACAAAAAATATTTCACTCGACAGGAGTATTTATATTGCGCCCGTTACGTGGGCTTCGACTGTAAATCAGAAAGGAGAAAACACCTATGACGACCTACGATCGTAACCGTAACGCAATCACCACTGGCAGCCGTGTTATGGTTAGCGGCACCGGTCACACTGGCAAGATCCTGTCGATTGATACTGAAGGTCTGACCGCTGAGCAAATCCGCCGCGGAAAAACCGTAGTTGTTGAAGGTTGTGAAGAGAAACTGGCACCACTGGACCTGATTCGTCTCGGCATGAACTAAGCGTGTGAATGCCGCCGATGGCGGCATTGCTTTTTTACTTCACGGAATATTTTGCCACGGTCGCTTTCGCGCCATGCGCTAATAAAGACAAGTACGTTTCCGTCACTCTTGCAGTAAA +>gi|49175990|ref|NC_000913.2|:1718694-1719311 
+GTTGAATTTGCGACTGGTCTGGAGGTAACTGATGGATATTGTGTAACGTAACCCAATGGGTTTGTGTTAACTCCAGCGGTTTCAGGCGATGGTCTATCAGAGCACGCCATATGCGCACCAACCGTGCCAGATCAGAACCTAGTGGCGATTCCAATTTCATCTCCTTATAATTAGCTTGCTAAGATATTATGCGGCTTTTAGAATAGTGTGCAGCAATTGTATTGCTAAAACAAATGTATTGCTGCATTTGGTTACCGTCAGACATATTTTTCAGAAATTGCGCGTAAATTTTTCGCACTTAAAGAATATTTATTAATCTAACGCAATATATTCGGTCGTAAAGGAATCTACTTTGTGAAGTTTATGCTCAATGCAACAGGATTGCCCTTACAAGATCTGGTGTTCGGTGCGTCCGTCTACTTTCCTCCGTTTTTCAAAGCATTCGCGTTTGGATTCGTCATCTGGCTTGTCGTACACCGCCTGCTTCGTGGCTGGATCTACGCCGGTGACATCTGGCATCCCTTGTTAATGGATTTATCGCTGTTTGCGATTTGCGTTTGCCTTGCTCTGGCAATACTGATTGCGTGGTAACTATGTCAATTAAAACAATTAAGTAT +>gi|49175990|ref|NC_000913.2|:1514675-1515936 +TACTATGCGCCGACGCTGCTGGCTGGCGCATTACAGGACGATGCCATCGTGCAAAAAGAGGTATTTGGTCCAGTAGTGAGTGTTACGCCCTTCGACAACGAAGAACAGGTGGTGAACTGGGCGAATGACAGCCAGTACGGACTTGCATCTTCGGTATGGACGAAAGATGTGGGCAGGGCGCATCGCGTCAGCGCACGGCTGCAATATGGTTGTACCTGGGTCAATACCCATTTCATGCTGGTAAGTGAAATGCCGCACGGTGGGCAGAAACTTTCTGGTTACGGCAAGGATATGTCACTTTATGGGCTGGAGGATTACACCGTCGTCCGCCACGTCATGGTTAAACATTAAGGATAATATTGCAGATCGTAAGAGTATGAGATGATCTTGCGTACTGTCAGAGCACTGATTTCTCTGACAGTACGTGCTGCGTGCAAAATCACTACATCAAATAAACCAGCCAAATCTCATAGAGATAAGAGTAAACAAACCAACAGCAGCAAAGAAATTTATCAGTACTACAGTTCTACTGGAAACATTCATTTTTGCCACCTTTTATCATTGCCCCTATGTGTATAGTACGGGTTAAGAAAATCCGCAATAATACTCATTCTGAAGATGGATAAGGGCAAGTTGCTGTTTGATGATTTTTTCTCAAACTGGTGAAAAAACTTGATGCACGTCAAAAAATGACGCATATTTGCGCGCGTTTTATTCATCTGGCTGGACGCCCGTACATGTCTCTCTATCAACACATGCTTGTTTTTTATGCGGTTATGGCAGCAATCGCATTTCTTATCACCTGGTTTCTTTCTCACGATAAGAAACGCATCCGTTTCTTAAGCGCCTTTCTGGTGGGGGCAACATGGCCGATGAGTTTCCCGGTGGCGCTGTTGTTTTCACTGTTTTAAATCTTACTCACTGGCTTGACGGCACCACGCAGACTTATATCATTTGGATGAATCGATAAATTTCACAAGTGGCTAAGGAGAAAGTATGTCGCATCTGGATGAAGTCATCGCGCGCGTGGATGCCGCGATTGAAGAGAGCGTCATTGCCCATATGAACGAATTATTAATCGCCCTGAGCGATGACGCGGAGTTAAGTCGGGAAGATCGCTACACCCAGCAGCAACGTCTGCGCACAGCGATCGCCCATCACGGTCGCAAGCATAAAGAAGATATGGAAGCGCGCCACGAACAGTTAACCAAAGGCGGCACCATCCTCTGATTAAAATGAGCGTCTGGCTACCAACCAGGCG +>gi|49175990|ref|NC_000913.2|:2562182-2562740 
+TGGGTCGCGACTACGCATGGAAACATGTAAAAGAAGACAGGTTGCCATGTCTGTTTGCCGCCGGGGTTGTTGCTGATAACATCCTGGTAAAATACCATTTCAGCAATGAAACTGAGTTCAAACTTAAAGCGGCAGCGGTCAGAGAATGCGCCAACGTAGCAGAAAACAACGGCTTTGTTTTGGTAGGTAGAGTATTTGAAAAACGTAATTGATTGCGAATAGCAACAACTAAACAACCAATACACCCGGCCCCTCGCCGGGTTTTTTGTGATCTGCGTCAATATTCCCTTCCGTTACGCTTACCCTCCACCAATACGCGTGTTAACCTCCCAATGGATTCTATGAGATGGGAGATAAAAATCATGAGTTACGAAATCAAAATTTGCGACATCCTCAAAGGCGCTGCGATGGAAGGACAATATAAGGGCGCTCAACGTGGGGCCAAATGCGAGGAGATCGCAAACGAGTTAACTCGCCGGGGAGTTAAAAACAACAAGGGAGAGGTTATTACTAAGGGCGGCGTTAGCCATTGGCTGGAAGGTAGACGGGAGCCAAATT +>gi|49175990|ref|NC_000913.2|:1935218-1935800 +ATTTCTCACACTGGAAGAACAAAAAATCTGGTCGAGCTGGCGCAGCTGGCACGCGAAAACGACGCCATGGTGATTGCCCTCACCTCTGCGGGTACCCCGCTCGCCCGGGAAGCAACGCTGGCAATTACCCTCGACGTACCGGAAGATACTGACATTTATATGCCCATGGTTTCTCGACTTGCACAGCTGACCGTGATAGATGTGCTGGCGACAGGATTTACTTTGCGACGCGGTGCAAAATTCAGAGATAACTTGAAGCGGGTCAAAGAAGCGCTGAAGGAATCGCGTTTTGATAAGCAGTTACTTAATTTAAGTGACGATCGCTAAAAACGACTGTCACTGTCCTAATCTTATACGACATCCGAATGAGATTAATTTATCGCCATCGCGGCGTTATTTCATTCGGATTTCATGTTCAAGCAACACCTGGTTGTTTCAGTCAACGGAGTATTACATGTCCAGAAGGCTTCGCAGAACAAAAATCGTTACCACGTTAGGCCCAGCAACAGATCGCGATAATAATCTTGAAAAAGTTATCGCGGCGGGTGCCAACGTTGTACGTATGAACTTTTCTCACGGCTC +>gi|49175990|ref|NC_000913.2|:769870-770756 
+ATTGTTGACAAAGGGCGCTTTGTTCATGCCGGATACGGCATGAACGCTTTATTCGGTCTACAAAAGCAGGCAAATTCAATATATTGCAGAGATGATGTAGACACTGGCAAGCGTAGCGCATCAGGTAATTTTGCGTTTATCTTCACTCTCAAGCCACGTATATGTGGCTTTATTTTTAACAAAATAATAACCCTGGGTGAGTTAATTATAATATAATTATAAGTTAACTAAATGTTAATATTGGCGGGGTGGATTTATGCCTTTATTAGTAATCCTGAAACTCTGCGTCGTATTAGCCAGTGACCAAAAAAAGAATTAAGGTCAACCGTGCTGTTTTTGCTTCGTCTCTTTTTATCTTTAATTGCCAACCGAAACTAATTTCAGCCTTATAACTCACACATTTTAAACATAAATGTCACTAAAGTTACCTTATTGAAACATGATTAACATAATTTGTAGGAATTGATATTTATCAATGTATAAGTCTTGGAAATGGGCATCAAAAAGAGATAAATTGTTCTCGATCAAATTGGCTGAAAGGCGGTAATTTAGCTATAAATTGATCACCGTCGAAAAATGCAAATTTGCTTCAACAAAAACCTGTTTATTGTAAGGATTTTGCGGCGTAATATATACGTGGGATCAATTTGAGTTTTTATTAACATGTTTGCAACCTTTCTTTACGCCGTTTTTGTGTGCATTCACATGGTATGATGAAAGTGTTCAAACAAATTTCTATTGGGGCATGCGTGTGACCCTTTCTAACGGGGTTCACTCTCGGAGTCTTCATGCGATGAGCAAGGAGTCATGATGTTAGATATAGTCGAACTGTCGCGCTTACAGTTTGCCTTGACCGCGATGTACCACTTCCTTTTTGTGCCACTGA +>gi|49175990|ref|NC_000913.2|:1664981-1665532 +AACTGTGCGCCACGCTACTTTCTTCTTCGCTTAACGCGATAGGATTAACTGCTTCATAAGGCAGTTCGATGGTTGAAACACTCTGCAACACCAGGGCATCTTTATTACGCAGCCCTTCGGGTGTCTCTTTGCTCCCCTTCGCTACGGGTTTATATCCCGCAACCGTTTTTCCCTGGGAGGCTAACGCTTGTAGCAATGCGCGGGAAACCACCGTTTTCCCTACAGAAGTGTCTGTACCGGTAATAAAGAAACGCTTCAGCATCACTAACTCCACCGTTATGCTTCACAAATATAAACCAGGAAAATAATTAACCTTGAAAGTCTAAGTTATGCTTTCCTGGCCCAAATTGAGATAGCGCAAATTTTGGTAGAACAGTTAAAAAATGTTAACCCTGCAACAGACGAATCAACAAAGAACCGTTATACATCGCGTCTTTTACCAGTGCAGCGCCTGCCATCGTGCCCTGGTTAGAAAACTGAGTACTCTCAACGCTGATGTGCTGACTATACGCAGGAAGGGCCTGCTGACGGATGCTGTCTGAGATGACCGG +>gi|49175990|ref|NC_000913.2|:2713967-2714175 +GTCGCCAGTTAAATATCGTGCTTGCCCTGTTGCTGGTCTATTGCGCGGTACGCATTTTCTATTAACGAAAAAAAGCGGAAGAGGTCGCCCTCTTCCGCTTAGTAACTTGCTACTTAAGCCTTACAGGCTTTCAGTAAAGGTACGAGCGATAACGTCGCGCTGCTGTTCCGGAGTCAGAGAGTTAAAGCGAACTGCATAACCGGATACA +>gi|49175990|ref|NC_000913.2|:2714220-2715584 
+TCCAGAGTTTCGCGACGCAGAACGTTAACGTTCAGGTGTTGACCACCTTCAACGCGAACTTCTGGTTTCACTTCTACTGGAACTTCACGGTATTCAATGTCACCCAGTTTGCTTACTGCAACCACTTCATCTTCTGCATAACCTGCTTTTGCAACGATGCAACGCGCTTCGCCTTTTTCGCTGTCCAGCAGCCAGAAAGAGTTCAGCAGATCGTCGTTAGCGGCTTTAGTAATCTGGATACCTGTAATCATGTGATGCCTCCCCGGCAAAATTATTTGATTTGTTCAGCCTGTCGCGGCCAATTGGTAAAACCATTGTTGCTTGAGTGTATATATACTCCTCAAACACCCTTGAATCTTTGATTTAAATCAATAAAAACCACACATCAAGTATGGTCGCAAATGGATTTTATTGTTTTACATCAACTTATGCGGGTGTGAAATTTTACCAATTTACATTTTTTTGCACTCGTTTAAGTCTAAAAAATGAGCATGATTTTGTTCTGTAGAAAGAAGCAGTTAAGCTAGGCGGATTGAAGATTCGCAGGAGAGCGAGATGGCTAACGAATTAACCTGGCATGACGTGCTGGCTGAAGAGAAGCAGCAACCCTATTTTCTTAATACCCTTCAGACCGTCGCCAGCGAGCGGCAGTCCGGCGTCACTATCTACCCACCACAAAAAGATGTCTTTAACGCGTTCCGCTTTACAGAGTTGGGTGACGTTAAAGTGGTGATTCTCGGCCAGGATCCTTATCACGGACCGGGACAGGCGCATGGTCTGGCATTTTCCGTTCGTCCCGGCATTGCCATTCCTCCGTCATTATTGAATATGTATAAAGAGCTGGAAAATACTATTCCGGGCTTCACCCGCCCTAATCATGGTTATCTTGAAAGCTGGGCGCGTCAGGGCGTTCTGCTACTCAATACTGTGTTGACGGTACGCGCAGGTCAGGCGCATTCCCACGCCAGCCTCGGCTGGGAAACCTTCACCGATAAAGTGATCAGCCTGATTAACCAGCATCGCGAAGGCGTGGTGTTTTTGTTGTGGGGATCGCATGCGCAAAAGAAAGGGGCGATTATAGATAAGCAACGCCATCATGTACTGAAAGCACCGCATCCGTCGCCGCTTTCGGCGCATCGTGGATTCTTTGGCTGCAACCATTTTGTGCTGGCAAATCAGTGGCTGGAACAACGTGGCGAGACGCCGATTGACTGGATGCCAGTATTACCGGCAGAGAGTGAGTAAATTTGCGGGGAAATGCCGGATGGCAGAGTTGCCACCCGGCTGATTTATCAGGCTTTATTCTGACGCCACCATTCACCAAGCAAAACGCCGGTTGCGACAGAGATATTCAGCCCGGCAAC +>gi|49175990|ref|NC_000913.2|:1163945-1164589 
+TGCCTGCGCATGTGATCTACAGTGATGTTGATCCGCGTCCGGCGAGCGGTTCTCCCTACTGGCTGAAAACCGTTTTGCGTCAGGAACTGGGTTTTGACGGCGTGATTTTCTCTGACGATTTATCGATGGAAGGTGCCGCGATTATGGGCAGTTATGCCGAACGCGGGCAGGCTTCACTGGATGCGGGTTGCGATATGATCCTGGTCTGCAATAATCGTAAAGGGGCCGTCAGCGTGTTAGATAATCTGTCACCGATCAAGGCAGAACGTGTTACACGTTTGTATCATAAAGGTTCATTTTCGCGACAGGAACTGATGGACTCGGCTCGCTGGAAAGCGATCAGCACCCGTCTGAATCAGTTACATGAACGCTGGCAGGAAGAGAAAGCAGGTCACTAACCCTGGCTTATGTGAGGAAGCGATGATTATCTATTTACACGGTTTTGACTCTAACAGTCCGGGTAACCACGAGAAAGTCTTACAATTGCAGTTTATTGACCCGGATGTACGCTTGATAAGCTACAGTACGCGGCATCCGAAACATGATATGCAGCATCTGCTTAAAGAAGTGGACAAAATGTTGCAACTGAACGTTGACGAGCGTCCGCTAATTTGCGGCGTTGGCTTGGGCGGATACTGGGCGGA +>gi|49175990|ref|NC_000913.2|:2618750-2619203 +TTCGAGGTCGGCGGTCGCTTCGTAAGTCAGCAGGCTACCCACTTCGGAAGCGAGTTCGCGAAAGCGCTTGGTGCTGATATCTTGCTCACGCATCAGTCCCAGCTTGTGTTTGACGAGTGGGTGTTTGACTTCCACGATCTTCATACTCTTTCTCCTTTGAGGGGCAGCCACAAAAAAAATCGACGGATTATACCTCCTTTCTTCAAGGCGGCAATATTCTTTTCGTTGACTTTAGTCAAAATGATAACGGTTTGAGATAAAGTTATTTTATATTCAGATGGTTATGAAAGAAGATTATTCCATCCGAAAACTAACCTTTACCCTGGCACAAGTCTTCTTTCGCCGCGCGCCTGGGGAAAAGACGTGCAAAAAGGTTGTGTAAAGCAGTCTCGCAAACGTTTGCTTTCCCTGTTAGAATTGCGCCGAATTTTATTTTTCTACCGCAAGTAACGC +>gi|49175990|ref|NC_000913.2|:952876-954233 
+CCCAACCAACAAACCACCACCGATAATGTTGCCGATCGTAACCGGAATCAGGTTATCAGTGATGAAATTCATCACGGTCAGGTGAGAAAAATTTTCCGGTGCAGAACCGACTGCGGTCCAAAATTCCGGGGATGCGAAGTCGCGGATTACAATACCCATCGGGATCATAAACATGTTTGCGATACTGTGCTCAAAACCGCTGGCAACAAACATCGCGACCGGCAGCACCATAATGAACGCTTTGTCCATCAGGCTGCGGCCAGAATAACTCATCCATACTGCCAGACATACCATCAGGTTTGCCAGGATACCAAGACAGACGGCCTCAATAAAAGTATGGTGCACTTTGTGGTCGGCGGTTTGTAGGACGTTTAGTCCCCATTGACCATTTGCGGTCATATACTCGCCGGAAAGCCACATTAAAAGTACAAACAGCAGTGCGCCGACCAGGTTGCCAAAATAGACATTTAGCCAGTTTTTCGCCAACTGACCCCAGGTGATGCGCCCACTCGCCTTAGCAACAACAATCAACACGGTGGAAGTAAAGAGATCGGCTCCGCAGACAACACAAAGAATCAGCCCCAGAGAGAAGCAAATGCCGCCAACCAGTTTTGCCATGCCGAAGGGCATTGTGCCTGTGCCAGTGGTTGCTGTGATATAGAAGACGAATGCGATTGAGATGAAAACACCGGCGGTAATCGCCAGATAGAAAGTCTTAAGCGGATGTTTCGTTGCTTTATAGACACCCGCCTCTTCGGCCACTTTGGCCATTGCAGCAGGAAGTAAAAGATCAAAAGGGTTGTCAGCTTTCACACTAACTCTCTCTTTATTAAGTCGGCGACGAGATACTAACAAAGCATTATAGATGAGAAATTGATATAGATCATATCTCGCCTGGCTTATAGGCCCGTAACTCGCATGGTTTTTATGCAAATACGGAGTAAATATTTGATTATCCAAATAAAAATAAATTTTAAAAATTAACAAATGAGTTGAATTTTTTCCGCATCCTCCGCTAAAACAGTTAATTAAAAGGGAGCATCAGGCGAATAAAGTAACAATATCGATCGTATTTATTAAATACAAATTACCGATATTTAACCTTATAATTACAATTATTTTATTAATGCAAATATATGTAAAGCGGGGCATTAAAAAAACGCCCCGTAATATAACTCAGACTAATCATTAAGCCTACATTGCGTAGGCTATTTGATTTTATTTTGCCCAGAATGCTGCTTTGGCGCGCTGCAGCTTTTCGTAGGCCTTCAACAACGACTGATGTGCAGCAAACGCGTGCAGATCGCTATCTACCGGTTGCAGGCCGTAAAACGCCGCTTCGCCGCTCATTGCCGCA +>gi|49175990|ref|NC_000913.2|:1987282-1987786 +GGCAAATCTTACTCTGATCGGTCGAAAAACCGTCTTTCGAAGGTATAAATTTGCCTTTTGCTGCCAGAAACGCTACCAGCTCTCCCGCTGTCATCCCTTCTGCCGAGCAGGTGTGAAAACGTGCCTGTTCACCAAAACGCGCTTTAATCGCAGCTTCCAGACTGGCATGCGTATATTGCTCGCCTGATTCAATCATCATATTTAACACTTCATGACCGTGAATAGAGTCCATCGTCCCTCCTCAAAAAAAGCCTAGCGTAGCGATTGCCGCTTATGAAGACTTTGCGCCAGCGCAGGACTGAATGCTTTTTATTGTACATTTATATTTACACCATATGTAACGTCGGTTTGACGAAGCAGCCGTTATGCCTTAACCTGCGCCGCAGATATCACTCATAAAGATCGTCAGGACAGAAGAAAGCGTGAAAAACAGAACCCTGGGAAGTGTTTTTATCGTGGCGGGAACCACAATTGGCGCAGGCATGCTGGCAATGCCGCTGGCTG +>gi|49175990|ref|NC_000913.2|:1296634-1297977 
+GACATCAGTACAGATGCAACTGCACGTTTGATATCAGCAGTTTCATCGATAACAACTGGAGTGTTGCCCGCGCCTACACCGATAGCTGGTTTACCGGAGCTGTATGCGGCTTTAACCATGCCCGGACCACCAGTCGCGAGGATCAGGTTGATGTCTGGGTGGTGCATCAGTGCGTTAGACAGTTCAACAGAAGGTTGATCGATCCAGCCGATCAGATCTTTCGGAGCACCGGCAGCGATAGCAGCCTGCAGAACGATATCAGCCGCTTTGTTGGTGGCATCTTTTGCACGCGGGTGCGGGGAGAAGATAATGGCGTTACGGGTCTTCAGACTGATCAGCGATTTGAAGATAGCAGTTGAAGTCGGGTTAGTGGTCGGAACGATACCGCAAATAATACCGATTGGTTCAGCGATAGTGATGGTACCAAAAGTGTCGTCTTCAGACAGAACACCACAGGTTTTTTCATCTTTATAGGCGTTGTAGATATATTCAGAAGCAAAGTGGTTTTTGATCACTTTATCTTCGACGATACCCATGCCGGATTCGGCAACGGCCATTTTCGCGAGTGGGATTCGAGCATCTGCAGCAGCCAGAGCGGCGGCGCGGAAGATTTTGTCTACTTGCTCTTGAGTGAAACTGGCATATTCACGCTGGGCTTTTTTTACACGCTCTACGAGTGCGTTAAGTTCAGCGACATTAGTAACAGCCATAATGCTCTCCTGATAATGTTAAACTTTTTTAGTAAATCATCTGCTCGAATACGAGAGTATAGTCAGTGCGGTGATGATTTGCTTAACCTATGAAAATCAAAAGCTTACTCGCGCTCACACTCACTGTGATTTACTAAAAGAGTTTAAACATTAGAGTTATTATCTCTAATGCGTCACTTCCAGGTGGCGTAAGCAAGATTACTCACTTCTGGGTACTGATTACGTGATCCAAATCAAATTTTTGCAAAGCTGACACCTTTCAGCATCGCTTTTCGCCATTATAGCTAACAGTTAATAAATTGTAGTATGATTTGGTGGCTACATTAGCATGTTTTGCACAACTAGATAACAATAACGAATGATAGCAATTTTAAGTAGTTAGGAGGTGAAAAATGCTGTCAAAAGGCGTATTGTCAGCGCGTCTTTTCAACCTTATTTATGGCTAACATTATCCGGCTTTTGCTTCGGAGCTAACCGTGATTCAGACCTTTTTTGATTTTCCCGTTTACTTCAAATTTTTCATCGGGTTATTTGCGCTGGTCAACCCGGTAGGGATTATTCCCGTCTTTATCAGCATGACCAGTTATCAGACAGCGGCAGCGCGAAACAAAACTAACCTTACAGCCAACCTGT +>gi|49175990|ref|NC_000913.2|:1164699-1165554 
+CCACTAAGTGTGTGACCAACTTCCGTGAGAAGAATCGCGATCGTTGCCTGGTGATTTTGTCGCGTAATGATGAAGCGCTTAACAGCCAGCGGACATCTGAAGAGTTGCATCATTATTACGAGATTGTCTGGGACGAAGAGCAGACGCACAAATTCAAGAATATCTCCCCGCATTTACAGCGCATTAAAGCGTTCAAAACCCTCGGGTAAATGCCCTCGTCGCATCAGGTAACCTTGCCGGTACCTGATGCGCTCCGAATTCTGTGGGTCGGATAAGGCGTCCACGCCGCATCCGACAGTCGAGCATCAATGCCTGATGCGCTTCTTATCAGGCCTACCGAACGCCCTGCATACACCCCTCACTCTATATCACTCTCACAAATTCGCTCAAATAATAAACAATAAACTCTGTTTTTTGATCTCACCCGGTAAAGTCGCCTATCTTTTCAGCAACAAAACTTGATTAACATCAATTTTGGTATGACCAATGCACCATTCATGTTATTCTCAATAGCGAAGAACATTTTCATTGCTGTAACCTGTTGTTAATTAAGAGCTATGTTAATAACCATTAATTAACAATTGGTTAATAAATTTAAGGGGGTCACGTTGACTACGCCATTGAAAAAGATTGTGATTGTCGGCGGCGGTGCTGGTGGGCTGGAAATGGCAACACAGCTGGGGCATAAGCTGGGACGCAAGAAAAAAGCCAAAATTACGCTGGTCGATCGTAACCACAGCCACCTGTGGAAACCGCTGCTGCACGAAGTGGCGACTGGCTCGCTTGATGAAGGCGTCGATGCGTTGAGCTATCTGGCCCATGCGCGCAATCATGGTTTCCAGTTCCAGCTGGGTT +>gi|49175990|ref|NC_000913.2|:3242450-3243111 +GCGTCCAGTGGTATAACGGGTTGCCGATAGTGTGCGGAACAGTCGCCGCCCAGGCGTCAAATTTTTCACGGTCAGACGCATCACCGGTACACAGACGCTCGGCCACACCGTTGGTACGCATAGCGCGCCATTTGTAGTGATCGCCTTTCAGCCAGATGTCATACAGGTTTTTAAAACGATAGTCTTCCGCAATCTGCTGCGGCGGCAAATGGCAATGGTAATCGAAAATCGGCTGGTCTTTTGCGTAGTCGTGATACAGACGGCGGGCAAATTCGGTATCTAACAGGAAATCTTCAGTCATAAACGGAGTCATTTTCGTCTTCCTCTCAACGAGTGAGCTAGCTTGCTTATGGTGCGATGCTGACAAAGTTATCACACCAATTTCCAGAGTCCGAAGATATTTTCGTGAGTTAGATCAATAAACGTAGTTAAAAAAATTACTCTCAAAGTGGTAAATCTCGCTGCAGGCCGCGCCAGTACTGGCCTTGCTGTCGTCAGGTAATGTCCCTACAAATATTCCCACATTTGTGATGGCTCTCACCTTTTAAAGTTGTATGACAAGTTATCTTTCTGCCGTCGCAAATCATAAGTCGACGGAATGCAAATTGCCGATTCATTCATTTGTTAGATGAATCGGGTTAACCGGTACGGAAGCCGAATT +>gi|49175990|ref|NC_000913.2|:1735310-1735967 
+TCATGCCTTATCTCCACCTCTTCGCGTCATTACGCGATATTCATTAAAGTGGCGAAAGCATGACAGCAATCACAAAAAAATGAAAATAACAAAAAGAGAAAACACTTTTGCCATTTTGCTAACAAACAGGAAGGAGATGCGAGGGAGAACGCGCTCCCTCGAGAGGAAATCAGTGCAGCGCGGCAGTCAAACCCACGGCTACGATCAAACCGAGGACGATAATCGTTGTTACCAGTGAAAATTTAAGGTCGGTGCTCATCAAGTTTTCTCCTTTTTTATTACCACACAAAAAGTGATATTACGCATTTTTACACACTGTGATGAAAAAATCTCCCGTCATTTATAATGATAAGTGTTTTTACCACTTCCCCTTTTCGTCAAGATCGGCCAAAATTCCACGCTTACACTATTTGCGTACTGGCCATTGACCCCTTCCTGACGCTCCGTGTCGTTTTTCCGGCGTACCGCAACACTTTTGTTGTGCGTAAGGTGTGTAAAGGCAAACGTTTACCTTGCGATTTTGCAGGAGCTGAAGTTAGGGTCTGGAGTGAAATGGAATGGCAACAATAAAAGATGTAGCGAAACGAGCAAACGTTTCCACTACAACTGTGTCACACGTGATCAACAAAACACGTTTCGTCGCTGAAGAAACGCGCA +>gi|49175990|ref|NC_000913.2|:3144188-3144666 +CTTAACCCCATGGTGGCGGCTAATGCTGCACAAAGCTTCATGAAATCACGACGGTTAATGCCGTGAGAATGGATGAGGGTGTTATCTCCAGTCATTTATAGTTATTCCGTTGCGAAGACCTGGCATATATTTTGCCTCAATCGCAAAATCAATAATGCGATCGATGCGCCATTTACCACACATTTATTATGGTTATCGATATCATGATACTGCGGCGGGGCGCTAAAACGAAGGGAAGAAGCATTAGTGTAATTAATTAAAGCAAGATAATACGTATGTTTGATCAATTTTCGAGGCGATAACCGGCCATAAAAAAGCGCAACGTAGAACAGGAATTATTCTCACGAAAGAAATCCATTACCATCTCTTTATCCAGTCCATAGCGGCGCGTCAGAATACCCGCTTCCCAGGCGCTAAGCTGGCGATCTCCGGTCTTTTCGTACATCCGGTGTGAGTAGCCTAATACAAAACCGCGTTT +>gi|49175990|ref|NC_000913.2|:1311665-1312151 +CGGACTGCGCCGTGATAGACATGCACTCTCCTTGAGTTTTATGCAGGTGTTAATTAGCGGGCAATTGTACCCTGGTTAAGCCTCGGGTGGCAGCATCAGGTCTTGTATGGAAAAGCATATCTTTGTAAAGCGGAGGTAATTGCTGGCCTTCAGGCATGGGCAAATCGCGGCAAGTGGCGACCGAGTTAATATTTGCGTAGCGAAAATATTTAAAAATTGATTTAAATCACATTAACCAGGATTCTCAATGCAACTTCTAAATTAATCCAGATCAATAAAGGGTGAATTATCATATGTAATGTGATCTATGTAGGATCATTTGTTACTCCAATGTAGGTATATTCGTCACGTTTTTATAACCATAACGACGGAGCGGATATGAAAAAGTTAACAGTGGCGGCTTTGGCAGTAACAACTCTTCTCTCTGGCAGTGCCTTTGCGCATGAAGCAGGCGAATTTTTTATGCGTGCAGGTTCTGCAACCGTA +>gi|49175990|ref|NC_000913.2|:1003753-1004164 
+AGTTGTGCCGCCTAAAAGCCAGCGTCAGACAAGCTGGTGTCAGGCAATCGCACCGTGTCATGTCGCCTGGCGGGCGATAAATGATTACGGCGGGTTGAGTGCAAAGAAGGAGCAAAATCTGCCCTGAAACAGGTTCGGAAAACGTTTGCGTTTTTTTTGCCGCAGGTCAATTCCCTTTTGGTCCGAACTCGCACATAATACGCCCCCGGTTTGCACACCGGGAATCCAGGAGAGTTCATGTACTACCCCTTCGTTCGTAAAGCCCTTTTCCAGCTCGATCCAGAGCGCGCTCATGAGTTTACTTTTCAGCAATTACGCCGTATTACAGGAACGCCGTTTGAAGCACTGGTGCGGCAGAAAGTGCCTGCGAAACCTGTTAACTGCATGGGCCTGACGTTTAAAAATCCGCTT +>gi|49175990|ref|NC_000913.2|:2411209-2411727 +AGCGGGACAACGTTCAAAACATTTTGTCTTCCATACCCACTATCAGGTATCCTTTAGCAGCCTGAAGGCCTAAGTAGTACATATTCATTGAGTCGTCAAATTCATATACATTATGCCATTGGCTGAAAATTACGCAAAATGGCATAGACTCAAGATATTTCTTCCATCATGCAAAAAAAATTTGCAGTGCATGATGTTAATCATAAATGTCGGTGTCATCATGCGCTACGCTCTATGGCTCCCTGACGTTTTTTTAGCCACGTATCAATTATAGGTACTTCCATGTCGAGTAAGTTAGTACTGGTTCTGAACTGCGGTAGTTCTTCACTGAAATTTGCCATCATCGATGCAGTAAATGGTGAAGAGTACCTTTCTGGTTTAGCCGAATGTTTCCACCTGCCCGAAGCACGTATCAAATGGAAAATGGACGGCAATAAACAGGAAGCGGCTTTAGGTGCAGGCGCCGCTCACAGCGAAGCGCTCAACTTTATCGTTAATACTATTCTGGCACAAAAACC +>gi|49175990|ref|NC_000913.2|:2558188-2558592 +TGCCGGATTCGTCCGGCTTTTCTTTTTCGCTAATTATTCCATTATTCCAATTAAGTGGAATAAATTATTCAATATCAAGAACTTAAATCCATGTTTGCAACGAAAGATCCAGAGTTCGAAAATAGGATCAACACAAATAAAAGCCCCCGGAATGCTGCAACATGCCGGGGGCGGTATGAGAAACAAGCTAAAGGAGAATTTCTCATGTCTGATATGCTAGCAGTAGAACAAGAAACAAACAATGATGTTCGTCAATTTTTAAACAAAATTAACGAGTTACGGAATAAAGCACCCAAAAACGAAGAGACTAAACACGAAGAACACACGCCAGATAATCACGAGGAAACAGATCATCACGAGGCAAAACAGCAGGAGCAAGCCTGGCGTGGCAACCTGCGCTACCT +>gi|49175990|ref|NC_000913.2|:2403242-2403669 +ATAACCTTTTGTCAACTTTAACAAAAGTTTCTTCACATTAGTTTACATAATATCAACACCATTAGCATTTAATGCCCTTTCACCCCAGATCCTTGACGACTCCAGGATAATTAGATGTTGTTGAATCGTGTCCGTTGTGAAGCAATGGAAAAAATACGGGTCTATTTTGACAGGAATTTGTGTCGATTCCTCCCCCCAAAAGAGAGTATTTTCTTGATCTGTGACACGCTTTTGTCATTCCATAACAAAAACGCAGCAACAAATTTACGTATTTTTTAACATCATTGTAGCAGGTGATTTTTTTCAGGCGATTATTTGTGCGTTCGGGACGTGAATCTCTGGTGGTTGAAAAATGAACAGTTTTGTACGTTCTGCACTATGCGATGAAGGATTTTTACTAAAAAAAAGCCGCTGGGGTTTAAAACAC +>gi|49175990|ref|NC_000913.2|:3611231-3611845 
+CTGTTGAGTGATGAAGGCGAAGTCGGCTGCGATATCGAAGTGATTCGCCCGCGCGCCAACTGGCGCTGGCTGGCGAACGCCGTATTCAGCCTCGGGGAACACGCTGAGATGGACGCCGTGCATCCTGATCAGCAACTGGAAATGTTCTGGCGCATCTGGACGCGCAAAGAAGCCATCGTTAAACAGCGTGGCGGCAGCGCCTGGCAAATCGTCAGCGTAGACAGCACCTATCACTCCTCGCTGTCAGTCAGCCATTGTCAGCTTGAAAATTTAAGCCTTGCGATCTGCACCCCTACTCCCTTTACGCTCACCGCCGACAGTGTGCAATGGATCGATTCAGTTAACTGATCCGCCCACCCGACTGCCCATCTATTGATCCAGAACAGGTAATCAGTATGACGAATACTTAAAATCGTCATACTTATTTCCGCCATCTATTTTAATCCATTGGGGTTACCATGCTCTCCACACTCCGCCGCACTCTATTTGCGCTGCTGGCTTGTGCGTCTTTTATCGTCCATGCCGCTGCACCAGATGAAATCACCACCGCCTGGCCGGTGAATGTCGGGCCACTAAACCCGCACCTTTACACGCCTAACCAGATGTTCGCCCAG +>gi|49175990|ref|NC_000913.2|:121382-122203 +AACCAGCAATGGCGTAACCCAGGATAATCCCTGGCCCTGCGGACTGTATTACGGAGGCGCTACCCAGGAATAACCCGGTCCCTATCGCGCCACCCAGCGCGATAAGCTGAATATGGCGGTTTTTAAGGCCGCGCTTTAGCTGCTCGCCGTGCTGTTGACCTTCCATCATGAAACCTCGTGCGGTGGTTGTTTTTTTGATCTACGCAGTGATGCGTGTGTAAGTTTGCAATTCCGTTTGTTGTATTAATTTGTTTACATCAAAGAAGTTTGAATTGTTACAAAAAGACTTCCGTCAGATCAAGAATAATGGTATGCGGCAGCGAATGCACCCGCTTTATGCATGGTTGAAGATGAGTTGCTTAAAAAGAAACCGTTTGTAAAGCTCAGCCTCAACCCCTCTCAATATGTAGAATGAATTTAAATTCGTTTTAATTGAATTAAAAATCACAAAATTGGTAAGTGAATCGGTTCAATTCGGATTTTTATAGTTTAATAATCGTTAAAAAACTCCTTTCCTACGTAAAGTCTACATTTGTGCATAGTTACAACTTTGAAACGTTATATATGTCAAGTTGTTAAAATGTGCACAGTTTCATGATTTCAATCAAAACCTGTATGGACATAAGGTGAATACTTTGTTACTTTAGCGTCACAGACATGAAATTGGTAAGACCAATTGACTTCGGCAAGTGGCTTAAGACAGGAACTCATGGCCTACAGCAAAATCCGCCAACCAAAACTCTCCGATGTGATTGAGCAGCAACTGGAGTTTTTGATCCTCGAAGGCACTCTCCGCCCGGGCGAAAAACTCCCACCGGAAC +>gi|49175990|ref|NC_000913.2|:1860409-1861046 
+GACAAATTTTTTTTCAGTTCTTCTGCCGAAGGTTTATTAGCCATTTGCTCACATCTCACTTTAATCGTGCTCACATTACGTGACTGATTCTAACAAAACATTAACACCAACTGGCAAAATTTTGTCCTAAACTTGATCTCGACGAAATGGCTGCACCTAAATCGTGATGAAAATCACATTTTTATCGTAATTGCCCTTTAAAATTCGGGGCGCCGACCCCATGTGGTCTCAAGCCCAAAGGAAGAGTGAGGCGAGTCAGTCGCGTAATGCTTAGGCACAGGATTGATTTGTCGCAATGATTGACACGATTCCGCTTGACGCTGCGTAAGGTTTTTGTAATTTTACAGGCAACCTTTTATTCACTAACAAATAGCTGGTGGAATATATGACTATCAAAGTAGGTATCAACGGTTTTGGCCGTATCGGTCGCATTGTTTTCCGTGCTGCTCAGAAACGTTCTGACATCGAGATCGTTGCAATCAACGACCTGTTAGACGCTGATTACATGGCATACATGCTGAAATATGACTCCACTCACGGCCGTTTCGACGGTACCGTTGAAGTGAAAGACGGTCATCTGATCGTTAACGGTAAAAAAATCCGTGTTACCGCTGAACGTGATCCGGCTAACCTGAAA +>gi|49175990|ref|NC_000913.2|:1545113-1545519 +ACACCGCGAATCAATCCTACCATCGTGCTCCACAGGACGATCGCTATCAGCCCTATGAGCGTTGCTTTTTGTCGTGTCATGCTCGCTGTTTTGTCTCTCTTGCCGTTAAAAATTAAGCTGAATTTTATAGCATTTTTTTAACTGGCCTGTCAGGCAGTGGTGCGTTTTTCTACCGCTATTGAGGTAGGTCAATTTGCGAAGGCGGATTATTTTGTGGCAAACAGATGTTCTTTTTGATTTCGCGCAAAAAGATTCAGAATTTTACTGTTAGTTTCCTCGCGCAGTAATACCCCTGAAAAAAGAGGAAAGCAATGGACGTCAGTCGCAGACAATTTTTTAAAATCTGCGCGGGCGGTATGGCTGGAACAACAGTAGCGGCATTGGGCTTTGCCCCGAAGCAAGCACT +>gi|49175990|ref|NC_000913.2|:4368340-4368759 +AACGCAGGAACGGCAAACACGCCAGTGCCTAATAATGACGTCGATAGCAGGCCAATGCCCTGGGCCAGCCCCAGTTCTTGTTTGAGTCCACTCATGGGTTGATGTCCGATTGCGCCCAAATTTTGGGCAACTGCGTAGATTTTCGATGGTAGCACAATCAGATTCGCTTATGACGGCGATGAAGAAATTGCGATGAAATGTGAGGTGAATCAGGGTTTTCACCCGATTTTGTGCTGATCAGAATTTTTTTTCTTTTTCCCCCTTGAAGGGGCGAAGCCTCATCCCCATTTCTCTGGTCACCAGCCGGGAAACCACGTAAGCTCCGGCGTCACCCATAACAGATACGGACTTTCTCAAAGGAGAGTTATCAATGAATATTCGTCCATTGCATGATCGCGTGATCGTCAAGCGTAAAGAAG +>gi|49175990|ref|NC_000913.2|:2631807-2632465 
+AGTCACCACACCAGATTCGTGTTTTTTCACACGGCGAACTTCTTCTGCCTGGCGTTCAATGGACATGTTTTTGTGGATAAAGCCGATACCGCCTTCCTGAGCCAGAGCAATAGCCAGGCGCGCTTCCGTTACGGTATCCATTGCTGCGGAAAGCATAGGGATATTCAGACGAATAGTTTTCGTCAGCTGGGTGCTGAGGTCAGCAGTATTCGGCAGAACGGTAGAGTGAGCAGGAACGAGGAGAACGTCGTCAAACGTCAGAGCTTCTTTAGCGATACGTAGCATGGGCAATATCTCGACCAGAGTGGTTAATAAATATTGCCGCGGCATTATACAGAGCGTAACCGATTGCATCTACCCCTTTTTGCAAAAAATGCTTGCTATCCCCGAAGGGCGGGTTACTATCGACTGAATAACCTGCTGATTTAGAATTTGATCTCGCTCACATGTTACCTTCTCAATCCCCTGCAATTTTTACCGTTAGTCGCCTGAATCAAACGGTTCGTCTGCTGCTTGAGCATGAGATGGGACAGGTTTGGATCAGCGGCGAAATTTCTAATTTCACGCAACCAGCTTCCGGTCACTGGTACTTTACACTCAAAGACGACACCGCCCAGGTACGCTGCGCGATGTTCCGCAACAGCAACCGCCGGGTGAC +>gi|49175990|ref|NC_000913.2|:3151362-3151806 +CCTTGATTCGCCTGCATATTGGTCTGGAAGATGTCGACGATCTGATTGCCGATCTGGACGCCGGTTTTGCGCGAATTGTATAACATTGCCACTTTTGGACAATTTTGCAGACATTTTATTGTGAAAAGTCTTAAATTGTTGCGTCCGGGATCAAGGCGTCCCGGACGATTCAGGAGTACAATAGGCAGATAAAGGCTTAAACGCTGTTCCACAGGAAAGTCCATGGCTGTTATTCAAGATATCATCGCTGCGCTCTGGCAACACGACTTTGCCGCGCTGGCGGATCCTCATATTGTTAGCGTTGTTTACTTTGTCATGTTTGCCACGCTGTTTTTAGAAAACGGCCTGCTGCCCGCCTCATTTTTGCCAGGCGACAGCTTGTTGATACTGGCAGGCGCATTGATTGCCCAGGGGGTTATGGATTTTCTGCCTACGATTGCGATT +>gi|49175990|ref|NC_000913.2|:2907570-2908018 +TCATGATGGTCACATTGAGGCCACGGGCTTCAAGAATGGCTGCGAGGGAGGCTGCGGCAATGCCTTTACCCAGAGAGGATACGACCCCGCCGGTCACAAAAATATAGTTCGTTGTCATGCTGAACCTGAGAAGTTAGGTTGAAAAGACGATGGAATAACCAAGACGGGAAAGCAGTATACCCGAACATGACCTGTGCCACAAACTTTCATTATCCCTCCTCTTCGCCAGCGCACTATTGAAATCAGGAGTGAGAAAATAGCCCCTTTGGGGTAAATGTTTTTGACGCAAATCAAGCGCTTGTCATTTAAAAAATGACACAAATGGCGCTTGACCGCGTAATTCCCTTAGAGATCAATTTCCTGCCGTTTTACCTGTTGCCAGACTTCTTCCATTGTTTCGAGGTCAACACCTGTCATTTCCAGTCCACGCGCGGCAACAATACGCTCC +>gi|49175990|ref|NC_000913.2|:281111-281654 
+GGAACGGATGTTCGCCCCGTCCTGCGAGGCGAACAAAACAAACTCAGTACGTAATGACATGGTATCTCTCGCATCCCAGGGCATAAGCGACTCCATAAACGGGTTCTTATGCCTTAGTTGTAAGTGTCTACCATGTCCCCGAACAAGTGTTCACTATGTCCCCGGACCGTACACCCCAAAGGGGAGAGGGGACTGCACCGAGCCATCTTTTCCCCCTCGCCCCTTTGGGGAGAGGGCCGGGGTGAGGGGCAATATGTGATCCAGCTTAAATTTCCCGCACTCCCTCTTCCCTTCCGATTTACCTCTCCTTGTTCTGCGTCATAGTATGATCGTTAAATAAACGAACGCTGTTCTATAATGTAGAACAAAATGATTCAGCAAGGAGATCTCATGCCGCAGTCCGCGTTGTTCACGGGAATCATTCCCCCTGTCTCCACCATTTTTACCGCCGACGGCCAGCTCGATAAGCCGGGCACCGCCGCGCTGATCGACGATCTGATCAAAGCAGGCGTTGACGGCCTGTTCTTCCTGGGCAGCGGTGGC +>gi|49175990|ref|NC_000913.2|:877254-877623 +TCATAGCTTAAAAAATATTCAGTTACAGGAAAGGTCAGGGCAGGGATTCTACAGAGTTCTGGATAAAATTTGTATCGCAATCTCATTCGCTGGCGGAGGCGAAGGAAATGTAAATTTTGTTAATTCGGCGTGAAGAATTGATCCTGGACAGCATTTTGCTCAAAAAATAGCCATACTATTTAATTGCAACAAGGCTGGGAAGAGGAGGATCGAAGTATGTTCGTTGACAGACAGCGAATCGATCTGCTGAACCGGTTGATCGACGCACGCGTTGACCTCGCCGCATACGTGCAACTGAGGAAGGCAAAAGGATACATGTCCGTCAGCGAAAGCAATCATCTACGAGATAACTTTTTTAAACTGAATCGC +>gi|49175990|ref|NC_000913.2|:33-551 +CTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACTGGTTACCTGCCGTGAGTAAATTAAAATTTTATTGACTTAGGTCACTAAATACTTTAACCAATATAGGCATAGCGCACAGACAGATAAAAATTACAGAGTACACAACATCCATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCACAGGTAACGGTGCGGGCTGACGCGTACAGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGGCTTTTTTTTTCGACCAAAGGTAACGAGGTAACAACCATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAACGTTTTCTGCGTGTTGCCGATATTCTGGAAAGCAATGCCAGGCAGGGGCAGGTGGCCACCGTCCTCTCTGCCCCCGCCAAAATCACCAACCACCTGGTGGCGATGATTGAAAAAACCATTAGCGGCCAGGATGCTTTACCCAATATCAGCGATGCCGAACGTATTTTTGC +>gi|49175990|ref|NC_000913.2|:2890451-2890847 +TATCTCAATGATATTCCAGGTCTGGAAAACCCAACCAGCGAGGTTTTAGCAAAATGGATTTGGGATCAGGTTAAACCCGTTGTGCCGCTGTTAAGTGCGGTGATGGTAAAAGAAACCTGCACCGCAGGTTGTATCTATCGCGGCGAATGATAAGAGTGTGTCGGCGGTCAATTTCCCTTAAGTAACGCTATGTTAGGGTGTTGTGTTCTGGATATCTGGGGCATGACATGGAAGACGACTGCGACATTATTATTATTGGTGCCGGTATTGCAGGCACCGCTTGCGCGTTACGCTGCGCGCGAGCGGGTTTATCCGTTTTGTTACTGGAACGCGCTGAAATCCCCGGCAGCAAAAATCTTTCCGGCGGGCGGTTATATACCCATGCACTCGCGGAAC 
+>gi|49175990|ref|NC_000913.2|:815849-816563 +AGCCAGCGTACGATTGCGCATATAGTTTCCTGGAGTCAGATTATCCGCGCTACAGTAGCGCAAATTGCGGGGAAACAGCAATTAACCTGCCAATATTGAGGATACAAACTATTTTCTTCTACCTCTAAAGGACGATGCACGCTATGCCTCCCTGATGATGTATATCAAAGTAAAACCGCCATTTTCCCTTATTCTGTAGCGAAATAGCACGATCATGACGCTATATACATGATTACATAGCGAAAGTGTGGATGGTAAAAATCTCATTTACACGCTAGTATCGGCATAACCACTAAACACTCTAGCCTCTGCACCTGGGTCAACTGATACGGTGCTTTGGCCGTGACAATGCTCGTAAAGATTGCCACCAGGGCGAAGGAAGAAATGACTTCGCCTCCCGTATCTGGAAAGGTGTACATGGCTTCACAACTGACTGATGCATTTGCGCGTAAGTTTTACTACTTGCGCCTGTCGATTACCGATGTGTGTAACTTTCGTTGCACCTACTGCCTGCCGGATGGCTACAAACCGAGCGGCGTCACCAATAAAGGCTTTCTTACCGTCGATGAAATTCGCCGGGTTACGCGCGCCTTCGCCAGACTGGGCACCGAAAAAGTGCGCCTGACAGGAGGAGAGCCGTCTTTACGCCGCGACTTTACCGATATCATCGCCGCTGTGCGGGAAAACGACGCTATCCGCCAGATTGCGGTCACA +>gi|49175990|ref|NC_000913.2|:33835-34215 +ATGGCGAATATACCTACATCATCAACACCACCTCAGGCCGTCGTGCGATTGAAGACTCCCGCGTGATTCGTCGCAGTGCGCTGCAATATAAAGTGCATTACGACACCACCCTGAACGGCGGCTTTGCCACCGCGATGGCGCTGAATGCCGATGCGACTGAAAAAGTAATTTCGGTGCAGGAAATGCACGCACAGATCAAATAATAGCGTGTCATGGCAGATATTTTTCATCCGCTAATTTGATCGAATAACTAATACGGTTCTCTGATGAGGACCGTTTTTTTTTGCCCATTAAGTAAATCTTTTGGGGAATCGATATTTTTGATGACATAAGCAGGATTTAGCTCACACTTATCGACGGTGAAGTTGCATACTATCGAT +>gi|49175990|ref|NC_000913.2|:2945141-2945504 +GCGGCAAGCATTGCCACAACCGTGCCCATAAGAAGGTACTTTACCCAACGTCCTTTCATTGTTCTCTTCTTCAGGTTAAAAAATAAGGCGCAACGAAGATAACAAACCGCCGCGTTTAATGAAATGAGCAGCCTCTCCCTGACGCAAATTTTGCACAAAAAATAGGCTTTAGTGATTTGTTTTTGTTCAAAATCATGCCAAATCCGTGATCGGGGTAAAAAAAAGGTTGCATGAAAACGCGAGCGGAGTATAGTGCGCATCCACGGACGCGGGGTGGAGCAGCCTGGTAGCTCGTCGGGCTCATAACCCGAAGGTCGTCGGTTCAAATCCGGCCCCCGCAACCAATTAAAATTTGATGAAGTA +>gi|49175990|ref|NC_000913.2|:3299180-3299654 
+GCCAGCTTAAGACCTGCTCAAGAACCTGGCGTTTTAGCGCAAATGGCGTCAGTTTTACCGGTACACTCAACAGAGATGGCCCCAAATGCACAATACGGGAACGCAGTTTATCCAACACGAGTTTTACTCCCTGTTTCAACAATCATCCTATTTTGCCATATCAGAAAAATAACATAGCGGTATAAATCAACAATTCCATATGAAATTGCTGCTACCACCAATACAACTTTAACTGCCTTAAATCAAAAATTGTCGCAGCAAGGTTAACTAAAATCCCAGTTCGTTAACATTTTTGCGTTTTGATAGCGCAACCTTCAGGAAAAATTATGGAGCTGCTCTGCCCTGCCGGAAATCTCCCGGCGCTTAAGGCGGCCATCGAAAACGGCGCAGATGCTGTTTATATCGGGCTAAAAGATGATACCAATGCCCGTCACTTCGCCGGCCTTAACTTTACCGAGAAAAAATTGCAGGAAG +>gi|49175990|ref|NC_000913.2|:2342437-2342800 +ACAATAACCGAATAGAAAACAACCATTTCGCCATCAACAATCTCTTACATTCGCTTATATATTGACCACAACTGATACATCAGATTATGTGATGACTCGTGCTTAGATCAATTTTTGCAATCATTAGCAAAAAGATTAATAAGCCATCTATATCAATTTATCTAACCTATTATGCCGTTCAAGAAATCGCCGAACAGTTATTTTTAACAAATTTTTCTCTTCCCATTGACTTTCCCGGACACCTTGTCTGACCTAAGGTGCGCGAAAGCCACTTTTTCCTTCCTGAGTTATCCACAAAGTTATGCACTTGCAAGAGGGTCATTTTCACACTATCTTGCAGTGAATCCCAAACATACCCCCTAT +>gi|49175990|ref|NC_000913.2|:1397353-1397981 +TTAAGTTCATCACCAGCCTTAAACAGCGTCTGGCCTTTCTGAATAGGCTTCTTCCGCTCAATGATATTATCAAGCTGATCAAGCTCATGTTCGTTGAGTGTGAACGGGATGCAAAGCTGGCTGATGCTGCAATCCTGGCAATGGATAGCACAACCGCCAGACTGAATGCGCCGTATAATTCGCTTTTCCGGGATCATAGGTCTGCTCAAGCCGTAATTGATATTTGTCAATTTTAACATCTTTTTAGGGAGCAAGTAAGTCTAAGCAAACCTTAACAGCAGAGAATTCCGATATTAGATGTAAATATATGTCTATCTATTTGAAAACCCTTAAGTTGTTAAGGGTAACTTTACATAAAAGTGTGAACAAGCTGGCACAAATTGTTTAATGTTTACAGCAAAAGATAACCTTCATGGCGCAATAACCACTCTTTTCGCTGAACTCCGCCTGCATATCCGGTCATGGTGCCGTTTCGGCCAATAACCCGATGGCAAGGTACGACGATGCTGATGGGATTCGATCCGTTTGCCGCACCAACGGCACGCGCCGCGCCAGGACGGCCCAATTGCTCAGCCAGTTGGCCGTAATGCATTACCTGCCCGCAGGGGATAGTGCGTAGTGTTTTCCA +>gi|49175990|ref|NC_000913.2|:2414804-2415235 
+GCTGCAGGGTATGCGCAAGCCGGTTAACGACCTGTCCCGTGGCGCACTGGTTGACGATATCGTCTACACCATCGCGCTGACTGCGATTCAGTCTGCACAGCAGCAGTAATCTCGTCATCATCCGCAGCTTTGCGCTGCGGATATCTGAACCGGAAATAATCACTATTTCCGGTTTTTTATTCTCTTAATTTGCATTAATCCTTTCTGATTATCTTGCTTAACTGCGCTGCATCAATGAATTGCGCCATCCCACTTTGCATACTTACCACTTTGTTTTGTGCAAGGGAATATTTGCGCTATGTCCGCAATCACTGAATCCAAACCAACAAGAAGATGGGCAATGCCCGATACGTTGGTGATTATCTTTTTTGTTGCTATTTTAACCAGCCTTGCCACCTGGGTAGTTCCGGTGGGGATGTTTGACAGTCAGG +>gi|49175990|ref|NC_000913.2|:2458863-2459252 +GCTGTAAGACGCGGTGCAGTCGGAGTTGTCCATAATGGTGCCAACATCCATACAGCAGCAAACCGGGGTTTCATCAGCACTACATTTACTCATCGTTGATTTCCTCTGTATGTGCACCCAAGGTGCCAGATAAACGTTGTGGATATTTTACGCTTCCGGAAAGTGCTGCTCCAGTTGTTAATTCTGCAAAATCGGATAAGTGACCGAAATCACACTTAAAAATGATCTAAAACAAAATTCACCCGAATCCATGAGTGCGCCACCTCCAAATTTTGCCAGCTGGATCGCGTTTCTTAGATCATATTTGAAAAAAGATAGAAACATACTTGCAACATTCCAGCTGGTCCGACCTATACTCTCGCCACTGGTCTGATTTCTAAGATGTACCT +>gi|49175990|ref|NC_000913.2|:3491714-3492204 +TTCCTGTTAGGTTTCGTCAGCCGTCACCGTCAGCATAACACCCTGACCTCTCATTAATTGCTCATGCCGGACGGCACTATCGTCGTCCGGCCTTTTCCTCTCTTCCCCCGCTACGTGCATCTATTTCTATAAACCCGCTCATTTTGTCTATTTTTTGCACAAACATGAAATATCAGACAATTCCGTGACTTAAGAAAATTTATACAAATCAGCAATATACCCATTAAGGAGTATATAAAGGTGAATTTGATTTACATCAATAAGCGGGGTTGCTGAATCGTTAAGGTAGGCGGTAATAGAAAAGAAATCGAGGCAAAAATGAGCAAAGTCAGACTCGCAATTATCGGTAACGGTATGGTCGGCCATCGCTTTATCGAAGATCTTCTTGATAAATCTGATGCGGCCAACTTTGATATTACCGTTTTCTGTGAAGAACCGCGCATCGCTTATGACCGCGTACACCTCTCGTCTTACTTCTCTCACCACACCG +>gi|49175990|ref|NC_000913.2|:3351914-3353191 
+AACGAGGCAACATTACAGAATGTGCAGCCACCACGCCCGATGGTACCGTCACGGTTAGGGCAGCTAAAACCGCCGTGCAGCGTCAGCTTATGCACCTTTTGCCCATAACGACGGGTGAGATCACCACCAAACATATTGACTAATTTCTGTAACTGCATAATCTGATAGACCGCGCCTTGAAAAGAGGCCAAAGCCTGCCATTTTTAGCTCAATTCGGCGATGACCTGGATCAATCGTCCCGGCCTGCTTTTATCAACTGCATAATCAATCAAAATTACCGAAATTTCATGCATAATCACATAAATCACTTTTGCTTATCTTGTGTCAGATTTTTTTATCTCCTGATGGATTTTAGGCAAAAACAGTAGCATGAAACGTCATTACCAATTAAGGCAGTATAAAATGCTGGTTTTGTCGTCAGTTCAAGGCAGGATAAGGGTTAACACACCTTTATGACAGTCAGGAATTGACTGTTTCTCTAACGACTTCCCTTTTAGCCTTAAAGATAAAATCCATTTTAATTTCAGTCATTTAATAAAGAATTTTGCGCTAAAGCACATTTCTGTACCAATAAGCTTGCCATTTGACCTGTATCAGCTTTCCCGATAAGTTGGAAATCCGCTGGAAGCTTTCTGGATGAGCAGCCTGCTCATCATATTTATGCAGTAATTGAGATCCCCTCTTCACCGTATTAACCGATGCGAAAAGGACAACAAGGGGGCGAATGCGAGGCGCGCGTATGACACGCAAACCCCGTCGCCACGCTCTTTCTGTGCCCGTGCGCAGCGGTTCGGAAGTGGGGTTCCCGCAGAGCCTGGGGGAGGTTCACGATATGTTGTACGATAAATCCCTTGAGAGGGATAACTGTGGTTTCGGCCTGATCGCCCACATAGAAGGCGAACCTAGCCACAAGGTAGTGCGTACTGCAATACACGCACTGGCCCGCATGCAGCACCGTGGCGCGATTCTCGCCGATGGTAAAACCGGCGACGGTTGCGGCTTGCTGTTACAAAAACCGGATCGCTTTTTTCGCATCGTTGCGCAGGAGCGCGGCTGGCGTTTAGCAAAAAACTACGCTGTCGGGATGCTCTTCCTGAATAAAGATCCTGAACTCGCCGCTGCCGCACGCCGCATCGTTGAAGAAGAACTGCAACGCGAAACCTTGTCGATTGTGGGCTGGCGTGATGTCCCCACTAACGAAGGCGTGCTGGGTGAAATCGCCCTCTCCTCTCTGCCACGCATTGAGCAAATTTTTGTGAACGCCCCGGCAGGCTGGC +>gi|49175990|ref|NC_000913.2|:1868318-1868683 +CATTTATTACATACATTAGCTTACAATCGCTTTAAATATGACAGCATAACCTTTACATAATTTAGTTCCAGAAAACAATCATTCGGAAAAATGATTCAGTCAACACGTATTTCCATGGGGTTATTCTTTAAATATTTTTTATCGTTAACGAAAATTGATCCTGGTCAAAACTATATATCTCTGCCATCAATAAAATCCAGCACTCACATTGCTCTCCTTTTTATGGTTTCTATGGGTACACAAAAATTAAAAGCTCAAAGCTTTTTTATTTTCAGTTTATTGCTGACGTTAATTTTATTTTGCATTACTACCTTATATAACGAAAACACAAATGTAAAACTCATCCCACAGATGAATTACCTGAT +>gi|49175990|ref|NC_000913.2|:3153009-3153427 
+ATGCCCGGAAAAGAGAATTATGATGCCAGGCTCGTACATCACCGGTGTACGTGCGAAAGGCGTCTCGCCATACAACAAACGCACATCGGGCAACAGTCCTGACAAACTATTTTCTTTATTTTTCAGTTTATTAACTTTATCCGCCAGCAAGCGGCAAATCTCTTCACGTTTCATATCGCGTAATTTCTTAGGAATAATGCGGCAATTTGATTGTGCGCAATTTTGTAGCATTTCTCCAGCACTCTGGAGAAATAGGCAAGACATTGGCAGAAATGAGCATTGAGAGCCAGGGCGCTGGCGATCACAATGAAAAACATCAGGCAGATCGTTCTCTGCCCTCATATTGGCCCAGCAAAGGGAGCAAGTAATGAACAACTTTAATCTGCACACCCCAACCCGCATTCTGTTTGGTAAAGGC +>gi|49175990|ref|NC_000913.2|:1655834-1656177 +GGTAACGTCTCTGCTCCCGCGCAGTCTAGCCAGGATGGCGCGCCTGCCGAACCGCAGTAACACATTATCGACTGAACGCCGGATATGACAAATCCGGCGATTTGAACATACAACATAATCCCACCTTATTACTCATACCCTTCTATTGATATGGATTAATAATTCTTAACCCAAAATGGGTAGACTCCCTCTATTGTTAGCGCGCTAAATATTCAATATATAAACTTTTATATAACGATAAAGAACAGGGAGTGAGTTATGTCCAAAAATGAACGAATGGTGGGCATCAGCCGCAGAACACTCGTTAAATCTACCGCGATAGGTTCTCTGGCGCTGGCTGCAG +>gi|49175990|ref|NC_000913.2|:1717567-1718028 +ACATCAACACCATCAAGGCTGGTGCCTGACATAACGCCAATAAAGCGGCCCGATTTCATAGTTCATCCTTTTTCAATCTGACGTTTGCGCACCACTCAAACATAAACTTTTCGTGAATACCATGCGGAATGACCGATTTTTACCGTTGGTAGTAAAACATTATCTTCAAATCAATAATCATCATGAATGTTTTGTTTATAATTGGTTGATCCTACTTTCATTATGATTTGCTCATATTTGGTAGAACATGTAACCATGGATTCACATATGCCATATACTTTGACCATGAGGGATGCTTGCGTGGCGTTTCATGGTGAACAGGAGATTTTTCAATGATTAAACGCGTATTGGTTGTTTCAATGGTAGGTCTGTCTCTTGTCGGTTGTGTTAATAACGACACCCTGTCAGGGGATGTTTATACCGCTTCTGAAGCGAAACAAGTACAGAATGTCAGCTATGGC +>gi|49175990|ref|NC_000913.2|:1752520-1752841 +CAAACACAGTGCTGACAATTCAGCAGCCTGATATTAAGCATTCAACTAATACGCTGCTGACTCTGTCCCCAACACCAGATATTTTCCCTAACCCGATAGTGTATGAGAAAAGATTAACGCCGATAAACTAATCCTTTATGGTTAGGAGAAAAAATAATGTGATTATTCTGAGCCCTTAACATTGATCGTTATCAATTAAAATTACAAAGCAAGAAATTATAAGTGAACTGATATTTATTATCATTTGAAATAAATTTAACTTAAGCGACAATTATTTGTGACTTTCATTGCAGAAAATAAAAATAACATTATCGCTATAAA +>gi|49175990|ref|NC_000913.2|:2377126-2377533 
+TGGGTGTGGTGAATGAATGCGGAATTCTCCGCCGCCGCTAACGTTAACGGTGTAGAACGCGAGCCTGGGGCGATACAGATGTGTCTGACGCCGTGACGCGTTAATGCTTCCAGAATGACCGCCGCCCAGCGTCGGTTAAATGCGCTTACTGACATGAGTTTGTCCGGTATCAATATTGCGGCTAAGTATAAGGAGCTACAAAAATAGATTATTGATATGAATCGGTAATGATGCGACTCATTACTATTCCATTTGTAATAAAGTACGCAGCCCTGCCGCTTTGTTGTCGATTTCCTGCCACTCTTGCTCGGGGTCGGAACCACGGACAATGCCCGCGCCAGCATATAATCGCACGACATTGCCGCTAATTTTTGCTGAGCGCAGGGAAACGCAGAATTCGCTTTGTT +>gi|49175990|ref|NC_000913.2|:4638327-4638790 +ATGTTTGCTACCTAAATTGCCAACTAAATCGAAACAGGAAGTACAAAAGTCCCTGACCTGCCTGATGCATGCTGCAAATTAACATGATCGGCGTAACATGACTAAAGTACGTAATTGCGTTCTTGATGCACTTTCCATCAACGTCAACAACATCATTAGCTTGGTCGTGGGTACTTTCCCTCAGGACCCGACAGTGTCAAAAACGGCTGTCATCCTAACCATTTTAACAGCAACATAACAGGCTAAGAGGGGCCGGACACCCAATAAAACTACGCTTCGTTGACATATATCAAGTTCAATTGTAGCACGTTAACAGTTTGATGAAATCATCGTATCTAAATGCTAGCTTTCGTCACATTATTTTAATAATCCAACTAGTTGCATCATACAACTAATAAACGTGGTGAATCCAATTGTCGAGATTTATTTTTTATAAAATTATCCTAAGTAAACAGAAGGATAT +>gi|49175990|ref|NC_000913.2|:854964-855325 +CATGTTAAGCGAGATTTTGTGCCTGGCAGGCCATTGGGTTGAGAATATTAGAGTATTGGAAGCGCATTATAAGTTCATTCCAGCTCACAGTGAAATCAGATGTGTACGAAATCACATTTTTTGCCTTTGGCTTGAGTGTAGACCTTAAGCGAGGAGCAGGATCTTCTTTCAGACTTATGGCATAATGCGCGGTTTGTCATATCTCTTTTCAGGATACGCCTGTGTTAGTTTCCAGTAACGTCACCATGCAGTTCGGCAGTAAGCCGTTGTTTGAAAACATTTCCGTCAAATTTGGCGGCGGCAACCGTTACGGCCTGATTGGCGCGAACGGTAGTGGTAAATCCACCTTTATGAAGATCCT +>gi|49175990|ref|NC_000913.2|:1608621-1608960 +CGCAGGAAGTTACCTTCACCAAACTGAATGATACGTTCTGGATACTGTGCACCGGGAAAATCGCGACGATTTAGTGTTTTCACAATGGGTTCCCTTCTGATTAGTCATACAACCTGTTTGAATTGGTACGACAGGTTAGCAAACTTTAATACGCCGAACCCCTGTTTTGATCAACTCCTGATGATTAATGAGCAGTTTTATGAGAAAAGTGTGGCGCGGATCATGGTTTAATCGAGGAAAAACGCCTTTTCCTGGATCATAAAGTGGTAGAACACATTGCATTCAAATCGCGCGTAATGAATAAAGATGTCAGACAACTTCCTCACCGTAACGCATAGT +>gi|49175990|ref|NC_000913.2|:1763017-1763331 
+AGTAAGCAGGTCAGCAAATTCAAGTTCTAGTTGTTTCAGACGTTTGAGATATTGGGCAGGTGAAAGATTGCTCTGGTCACGGCGTAAAAATTCAATGGCCAGCTGGGTGGGATCAAGTTGAGTAGACATAGCATCCTCGCTTTTAGACAAGACCTGCACAGTATACCACCGTTTACTGTGCAGATAATGACCAAAAGCAATATGCGTCACACTTTTCTGGTGACAACGTCACAAAATGGCGGTCGTCAATCGTGACGAACAGCACAAACGCCCTTTCTCATCGAAGATTTCAATCTGCCAGACCTGGTGACGCG +>gi|49175990|ref|NC_000913.2|:1989697-1990222 +TATCAGAGATACTTTTTGAGTGGCTTTGCTGGTGATTAAAAATTAAGGAGGGTGTAACGACAAGTTGCAGGCACAAAAAAACCACCCGAAGGTGGTTTCACGACACTGCTTATTGCTTTGATTTTATTCTTATCTTTCCCATGGTACCCGGAGCGGGACTTGAACCCGCACAGCGCGAACGCCGAGGGATTTTAAATCCCTTGTGTCTACCGATTCCACCATCCGGGCTCGGGAAGAAAGTGGAGGCGCGTTCCGGAGTCGAACCGGACTAGACGGATTTGCAATCCGCTACATAACCGCTTTGTTAACGCGCCAAATTCTTCAGGCCTTTCAGCCAGACATCCGCTTGACGCCGATGTCTTTTAAACTGGAGCGGGAAACGAGACTCGAACTCGCGACCCCGACCTTGGCAAGGTCGTGCTCTACCAACTGAGCTATTCCCGCATTCATCAAGCAATCAGTTAATCACTTGATTTTATTATCGTCTGGCAATCAGTGCCGCCGTTCGATGCGTTGCATTCTACT +>gi|49175990|ref|NC_000913.2|:4402562-4403145 +CGGGCGTTGTGGTCTACTACATGTTGAGGAAAACGATTGGCTGAACAAAAAACAGACTGATCGAGGTCATTTTTGAGTGCAAAAAGTGCTGTAACTCTGAAAAAGCGATGGTAGAATCCATTTTTAAGCAAACGGTGATTTTGAAAAATGGGTAACAACGTCGTCGTACTGGGCACCCAATGGGGTGACGAAGGTAAAGGTAAGATCGTCGATCTTCTGACTGAACGGGCTAAATATGTTGTACGCTACCAGGGCGGTCACAACGCAGGCCATACTCTCGTAATCAACGGTGAAAAAACCGTTCTCCATCTTATTCCATCAGGTATTCTCCGCGAGAATGTAACCAGCATCATCGGTAACGGTGTTGTGCTGTCTCCGGCCGCGCTGATGAAAGAGATGAAAGAACTGGAAGACCGTGGCATCCCCGTTCGTGAGCGTCTGCTGCTGTCTGAAGCATGTCCGCTGATCCTTGATTATCACGTTGCGCTGGATAACGCGCGTGAGAAAGCGCGTGGCGCGAAAGCGATCGGCACCACCGGTCGTGGTATCGGGCCTGCTTATGAAGATAAAGTAGCACGTCGCG +>gi|49175990|ref|NC_000913.2|:1278723-1279216 
+CATGATGTCGCGCTTTTTTTATGCGTCATTTAGTTACAACATACTAATGTTATATGGTTTATTTCGCCGGATTTCATTAAGAGCCATTAATATGTTACCCATGGGGAATACTCCTTAATACCCATCTGCATAAAAATCTTAATAGTTTAAATAACTACAGGTATAAAACGTCTTAATTTACAGTCTGTTATGTGGTGGCTGTTAATTATCCTAAAGGGGTATCTTAGGAATTTACTTTATTTTTCATCCCCATCACTCTTGATCGTTATCAATTCCCACGCTGTTTCAGAGCGTTACCTTGCCCTTAAACATTAGCAATGTCGATTTATCAGAGGGCCGACAGGCTCCCACAGGAGAAAACCGATGAGTAAATTCCTGGACCGGTTTCGCTACTTCAAACAGAAGGGTGAAACCTTTGCCGATGGGCATGGCCAGCTTCTCAATACCAACCGTGACTGGGAGGATGGATATCGCCAGCGTTGGCAGCATGACA +>gi|49175990|ref|NC_000913.2|:2066335-2066664 +TTACATTATCCACGCCAAAGTATTTGTCATCACAATGATGGTACCTTCTTTCAGACACCATTTTTTCAACTCCGTTTTCCACGGACCGCACTCTTATGTCAAGAGTGCGGTCCGTGGATACAACCAGAGACCGACTGACACGAGTCAGAGGAAACGACGGATATGTTCAGTCGTAAAATATCTATCAAAAAACATGATTAAGGTCAAAAATGTTTGATATTTACAATTTATGAAGATGACAATAATTATAGATATATGAGAACATAAATGAAAATAATTATCATTACAGTAATCATTTGTACTTTGTATTAATGAGGGATGAAATGTTA +>gi|49175990|ref|NC_000913.2|:3980277-3980948 +CATAAACTGGAGGAATAAGCAGCAAAACGCACAAACCGTAACCAAACGCGCAATTTATTTAAAAAGGGACTAGACAGAGGGGTGGGAAGTCCGTATTATCCACCCCCGCAACGGCGCTAAGCGCCCGTAGCTCAGCTGGATAGAGCGCTGCCCTCCGGAGGCAGAGGTCTCAGGTTCGAATCCTGTCGGGCGCGCCATTTAGTCCCGGCGCTTGAGCTGCGGTGGTAGTAATACCGCGTAACAAGATTTGTAGTGGTGGCTATAGCTCAGTTGGTAGAGCCCTGGATTGTGATTCCAGTTGTCGTGGGTTCGAATCCCATTAGCCACCCCATTATTAGAAGTTGTGACAATGCGAAGGTGGCGGAATTGGTAGACGCGCTAGCTTCAGGTGTTAGTGTCCTTACGGACGTGGGGGTTCAAGTCCCCCCCCTCGCACCACGACTTTAAAGAATTGAACTAAAAATTCAAAAAGCAGTATTTCGGCGAGTAGCGCAGCTTGGTAGCGCAACTGGTTTGGGACCAGTGGGTCGGAGGTTCGAATCCTCTCTCGCCGACCAATTTTGAACCCCGCTTCGGCGGGGTTTTTTGTTTTCTGTGCATTTCGTCACCCTCCCTTCGCAATAAACGCCCGTAATAACTCATTGCCCCACGGTATGATTTCGCCCTTAACG +>gi|49175990|ref|NC_000913.2|:2716408-2716734 +GGCCACCGTCAGCACGAGATTTTCCTGGACGCCCGCCCCCTTTCCCGGTACGCGGGTTGTGGGTACGTTTATCAGAATCATCATCACTGCGGACATACATCACTTTGACCTTGCCGCTTTTACCTTTCATTTCATCGTTCATGCTTTTCTCCACCAGCGCTGCGCGAAGCGCGCAGATTACCCGAAGTCCGCGCGGTTCGCCATGATTTCGTACCAAAGCCTGCGACTATCATACCTATTGAATAAAACAGATTGTTGTCTGGAACAATGTCCCCGATAATATGTAACATATTAGAAACATACCGGCGTCGTTGCCGATAAGTCTC 
+>gi|49175990|ref|NC_000913.2|:3783037-3783439 +TGGGGGGTAACAACTCCCGACGTAGTGATTAAAAAAACCAGGAAAGGAGTATACCTGCGCGGTGCGGCAAATACAGCCAGCGCGTTAACTGGAATGCAATTTTGCGGGGCGCGACGAAAAAAAAGCGTAAATCTGGCATCGGCTTGCCGCGAGCGGTCGTATATTTTGATCTTCAGAGGCTATTTTATCGATTCAGCTGTAGTAAAATTACGCAAATTTTGACTCTTGAGTATGAGGTTGTCGCAATGTTGGTTTCTAAAAAACCTATGGTACTGGTGATTCTGGATGGCTATGGCTATCGCGAAGAACAGCAGGATAACGCCATTTTTAGTGCTAAAACCCCGGTAATGGATGCACTGTGGGCCAATCGTCCGCATACCCTAATCGACGCTTCCGGTCTGG +>gi|49175990|ref|NC_000913.2|:4460674-4461165 +CGGTGTCATGATCGCTCCATATTTTTAAGAACAGGTTATCCACAGAAATTGGGAAAGGCGTTTTCCGGTTTTTTCGTTTCGTTTTCCGGTGCTTTCCGCAATCGCCATCCGCTTATCCACAACGCCGCACCTTTTTCGTGGGGAGCTGTAGTAGCAATTATAGTCGATTAATACAACATATTGGGTTGGGACGCATTTTAAAGTCTATATATAGTGCTTTGCATCAAGGATGTTTGAGCTTTTTTTGATGTAGCTCAAAGTAAAAAGCAGAGCGTACGGATGACGGGCGCTACAGCGATATGTAAATTTTTTAATGAATTTGCTGGTTGAAAAATCAACAAAAACAACATACTGACAGACAAAACCCCGGAATGACCGGCATTACCGGGGCTTAGGGAAGATTTACTTCTGTAACCACCAGACAGCCTCAAAAGGCCGTAAATTCATGGCACAGGGTTGTGGTGAGGCTTCTTCGTAGTTATGCATCACAA +>gi|49175990|ref|NC_000913.2|:2531315-2531828 +GTGGTTATTCTACCATCATCGGGTGAGCGTTATTTAAGCACCGCATTGTTTGCCGATCTCTTCACTGAGAAAGAATTGCAACAGTAATGCCAGCTTGTTAAAAATGCGTAAAAAAGCACCTTTTTAGGTGCTTTTTTGTGGCCTGCTTCAAACTTTCGCCCCTCCTGGCATTGATTCAGCCTGTCGGAACTGGTATTTAACCAGACTAATTATTTTGATGCGCGAAATTAATCGTTACAGGAAAAGCCAAAGCTGAATCGATTTTATGATTTGGTTCAATTCTTCCTTTAGCGGCATAATGTTTAATGACGTACGAAACGTCAGCGGTCAACACCCGCCAGCAATGGACTGTATTGCGCTCTTCGTGCGTCGCGTCTGTTAAAAACTGGCGCTAACAATACAGGCTAAAGTCGAACCGCCAGGCTAGACTTTAGTTCCACAACACTAAACCTATAAGTTGGGGAAATACAATGTTCCAGCAAGAAGTTACCATTACCGCTCCGAACGGTCTGC +>gi|49175990|ref|NC_000913.2|:1156803-1157201 +TTTTTTAAAGCTCGTAATTAATGGCTAAAACGAGTAAAGTTCACCGCCGAAAATTGGGCGGTGAATAACCACGTTTGAAATATTGTGACATATGTTTTGTCAAAATGTGCAACTTCTCCAATGATCTGAAGTTGAAACGTGATAGCCGTCAAACAAATTGGCACTGAATTATTTTACTCTGTGTAATAAATAAAGGGCGCTTAGATGCCCTGTACACGGCGAGGCTCTCCCCCCTTGCCACGCGTGAGAACGTAAAAAAAGCACCCATACTCAGGAGCACTCTCAATTATGTTTAAGAATGCATTTGCTAACCTGCAAAAGGTCGGTAAATCGCTGATGCTGCCGGTATCCGTACTGCCTATCGCAGGTATTCTGCTGGGCGTCGGTTCCGCGAATTT 
+>gi|49175990|ref|NC_000913.2|:1831150-1831625 +AGCCAACCGCTGGCAGAATGTTGCGTAGAAACCGGCATCGACTATGAAATCCGCAGCATGGAAAAACCGTCCGATCACGCCCCCGTCTGGGCGACCTTCCGCCGCTAATTTAGCAGCTCTCCTGGCTCAAACTGGGTCAGGAGAATTAACCTTGAGAAAAATCAACAAACTGTCAGTAATGATTTGTTGCCTGCCGTCCTTTGTTATACCGTCTCTGCGTTTTTAGTTGTCTGACCACTTCTCTATTATCAAGTTTGATATAGGAAACTCCACGATGAACGCTGAGCGTAAATTTCTTTTTGCCTGTCTTATTTTTGCGCTGGTCATTTACGCTATCCACGCTTTCGGTTTATTCGATCTGCTCACCGATTTACCCCACTTACAGACACTCATCCGCCAGAGCGGATTTTTCGGCTATAGCCTCTATATTCTGTTATTCATCATTGCCACCCTCTTGCTGTTACCAGGAAGCATA +>gi|49175990|ref|NC_000913.2|:1837270-1837624 +TCTGCGCGATAATTTTCGCCAGTGGGTGTTCAGCGAAGTTCGCGCCCTGGCGATCCCCGTCGTTCAGGTAACGCACGATCTCCAGGATGTTCCTGCTGATAGTTCTGTTCTGGATATGGCGCAGTGGTCAGAAAATTACAACAAACTGCGATAACGCAAAGTTTTTCTCAATGCGTCAGTTCAGAATGGCGCACTCAAAACTACAATGTCGGGATTTTCGATGAAACGTGTTTCTCAAATGACCGCGCTGGCAATGGCTTTAGGGCTGGCTTGCGCTTCTTCGTGGGCCGCTGAACTGGCGAAGCCTCTTACACTTGACCAGCTTCAACAACAAAATGGCAAAGCGATAGATAC +>gi|49175990|ref|NC_000913.2|:939792-940214 +AACCCGTCTGGTTCATACCCTGAACGGTTCTGGTCTGGCTGTTGGTCGTACGCTGGTTGCAGTAATGGAAAACTATCAGCAGGCTGATGGTCGTATTGAAGTACCAGAAGTTCTGCGTCCGTATATGAACGGACTGGAATATATTGGCTAATACCCAATTTTTCTGAATCTAAAAAGCGCCTGCGGGCGCTTTTTTTGTCTCCCTTTGATACCGAACAATAATTACTCCTCACTTACACGTAATACTACTTTCGAGTGAAAATCTACCTATCTCTTTGATTTTCAAATTATTCGATGTATACAAGCCTATATAGCGAACTGCTATAGAAATAATTACACAATACGGTTTGTTACTGGAATCAATCGTGAGCAAGCTTGAGTGAGCCATTATGAAAACGAAAATCCCTGATGCGGTATTGGCT +>gi|49175990|ref|NC_000913.2|:3572851-3573278 +AGAACGGAGCCGACCATACCGCGCCAGCCGATAAAACCAACATTTTTCATAAGCGTTTTTTTCCTGCAAAGATGTGTGCTGTATAAATGTGCCGGTCTCCTCTTGGCACATCTTTCACCATACAAAAAGCAGCCAAAGTCGCAAGTGAAATTAATCAATGATAGCGAAGCCATCAGTAATGCGACTTATCCTGCTTTGTTAGCACGCAGAAAGTCCGCGGCAATTATCAGGGAATTTGAGTTATGAATGAAATCATTTCTGCAGCAGTTTTATTGATCCTGATTATGGATCCGCTCGGAAACCTACCTATTTTCATGTCCGTACTGAAACATACTGAACCGAAAAGACGGCGGGCAATCATGGTGCGAGAGTTGCTTATTGCTCTCCTGGTGATGCTGGTGTTCCTGTTTGCGGGTGAGAAAATTCT +>gi|49175990|ref|NC_000913.2|:4347133-4347390 
+GCGGGTGACATAATAGTTAATTAACTTTTGTTAGCGTTTTGAAATTAAAAACACCGTTCACCTGAAGAGATATTAATTTTTAGCGATGATGGAGGGATAATTATATTTGATCTGGCACAAGTTTTACTGATGAAGGATGTAACTTGTGCCAGGGGTATTTTGCATTACGGTAATAATTATTACTTACAGATAACCAGCGATCCGTTATTGGCAATATTGTTTCAGTAGTGAGTAGTGTTCTGCCTGAATACGGTAAC +>gi|49175990|ref|NC_000913.2|:400169-400458 +CACAAAGTGCATACATTACCACGACAAAACGGGGGATTCGCGGCCTTCTGAAAGATTGTTGCAATCTTCTGCTGACAAAGCGTGCAACGTACTGGTGAAGAAAGTGCGTTATCTCAAAGATGTGCGCAAGATCACAAAAATGATGAACGGGAAGCTAATTTATTCCTGGCTTAAATGGCCATGCGGTGAGTTTTTTTCTCTTAATTATAAGTTAACGAAGAGAATATATTTCATAACTTTTATTTATAATAAAGGTTGATAATTAAAAGCCTATATTTTGTGTGGGTAA +>gi|49175990|ref|NC_000913.2|:912994-913345 +CTGCCGGAGTACGGATAGTTTGTTCACATTGCACACAAAACATGATCACACCTTTTAAAGTTATATTTAATATACATGTTTAAGGTTAAGACGCTTAACGCGGGGATAAAAGGGATTTTTCATGCAACTTTAAGGGAGATTGATTTAGCGCAATTTTGGCGGCAGGGATCTACCGCCAGAGAGGTATTACGCAGAGAAAAAGGCGATGAGGATCGGCACTAACAGGCTAAGAATAAAACCGTGAACAATTGCCGCCGGGACCATATCCAGCCCGCCAGTACGTTGAAGAACGGGCAGGGTGAAATCCATTGATGTGGCACCGCATAAGCCCAGTGCAGTAGAGCGGCTGCG +>gi|49175990|ref|NC_000913.2|:3315951-3316485 +ATGCCAACCAGTTCAAAACCCAGGGCCTCAACTGGCGCAGTAATCATCTCTGTTAATTTTTGCTCTAATGTGGACAAGCCCACCCCCAAGACATAAAAAAAGGGCCTAAAGCCCAGTTATTCTGTAGTCAGATAACAAAAAACCCCGATAAATCGGGGCTTTATATAACTGAACCCTATAACCGCAACTGCGGTCTGGAGCACTTTCCAGAAGGATTTTTTCAAATCCCACTACGAAGGCCGAAGTCTTCACAGTATATTTGAAAAAGGACTCTAAGGGAAAGTGGTTGCGGGGGCCGGATTTGAACCGACGATCTTCGGGTTATGAGCCCGACGAGCTACCAGGCTGCTCCACCCCGCGCCTGAAACGTGGCAAATTCTACTCGTTTTGGGTAAAAAATGCAAATACTGCTGGGATTTGGTGTACCGAGACGGGACGTAAAATCTGCAGGCATTATAGTGATCCACGCCACATTTTGTCAACGTTTATTGCTAATCATGTGAATGAATATCCAGTTCACTTTCATTTGTTGAA +>gi|49175990|ref|NC_000913.2|:3635440-3635848 
+TCCTCTTAGAAAAAACGGGCGTGAATTGTAAAGATTCCTCAGTGGTCGCACCAGCATCAATATTACTAAAAGGAAGTATTTGCCTGAATTATATAAGATAATTATTTTTTGAGTGAAATCCATACAGGGGGCAAATCAAAAAAAGTCTATATTTCACTTTGCCCGCGCCGCGAAAGTCACTGATAATGCGCCGCGTTCATGTCCTCAAAATGGCGTAACGTCCTATGCTACATTTGTTTGCTGGCCTGGATTTGCATACCGGGCTGTTATTATTGCTTGCACTGGCTTTTGTGCTGTTCTACGAAGCCATCAATGGTTTCCATGACACAGCCAACGCCGTGGCAACCGTTATCTATACCCGCGCGATGCGTTCTCAGCTCGCCGTGGTTATGGCGGCGGTATTCAACT +>gi|49175990|ref|NC_000913.2|:4131538-4132012 +CTTTTGTGAAAATCACACAGTGATCACAAATTTTAAACAGAGCACAAAATGCTGCCTCGAAATGAGGGCGGGAAAATAAGGTTATCAGCCTTGTTTTCTCCCTCATTACTTGAAGGATATGAAGCTAAAACCCTTTTTTATAAAGCATTTGTCCGAATTCGGACATAATCAAAAAAGCTTAATTAAGATCAATTTGATCTACATCTCTTTAACCAACAATATGTAAGATCTCAACTATCGCATCCGTGGATTAATTCAATTATAACTTCTCTCTAACGCTGTGTATCGTAACGGTAACACTGTAGAGGGGAGCACATTGATGAGCACGTCAGACGATATCCATAACACCACAGCCACTGGCAAATGCCCGTTCCATCAGGGCGGTCACGACCAGAGTGCGGGGGCGGGCACAACCACTCGCGACTGGTGGCCAAATCAACTTCGTGTTGACCTGTTAAACCAACATTCTAATCG +>gi|49175990|ref|NC_000913.2|:3948260-3949925 +CGGCCAAAAAATATCTTGTACTATTTACAAAACCTATGGTAACTCTTTAGGCATTCCTTCGAACAAGATGCAAGAAAAGACAAAATGACAGCCCTTCTACGAGTGATTAGCCTGGTCGTGATTAGCGTGGTGGTGATTATTATCCCACCGTGCGGGGCTGCACTTGGACGAGGAAAGGCTTAGAGATCAAGCCTTAACGAACTAAGACCCCCGCACCGAAAGGTCCGGGGGTTTTTTTTGACCTTAAAAACATAACCGAGGAGCAGACAATGAATAACAGCACAAAATTCTGTTTCTCAAGATTCAGGACGGGGAACTAACTATGAATGGCGCACAGTGGGTGGTACATGCGTTGCGGGCACAGGGTGTGAACACCGTTTTCGGTTATCCGGGTGGCGCAATTATGCCGGTTTACGATGCATTGTATGACGGCGGCGTGGAGCACTTGCTATGCCGACATGAGCAGGGTGCGGCAATGGCGGCTATCGGTTATGCTCGTGCTACCGGCAAAACTGGCGTATGTATCGCCACGTCTGGTCCGGGCGCAACCAACCTGATAACCGGGCTTGCGGACGCACTGTTAGATTCCATCCCTGTTGTTGCCATCACCGGTCAAGTGTCCGCACCGTTTATCGGCACTGACGCATTTCAGGAAGTGGATGTCCTGGGATTGTCGTTAGCCTGTACCAAGCACAGCTTTCTGGTGCAGTCGCTGGAAGAGTTGCCGCGCATCATGGCTGAAGCATTCGACGTTGCCTGCTCAGGTCGTCCTGGTCCGGTTCTGGTCGATATCCCAAAAGATATCCAGTTAGCCAGCGGTGACCTGGAACCGTGGTTCACCACCGTTGAAAACGAAGTGACTTTCCCACATGCCGAAGTTGAGCAAGCGCGCCAGATGCTGGCAAAAGCGCAAAAACCGATGCTGTACGTTGGCGGTGGCGTGGGTATGGCGCAGGCAGTTCCGGCTTTGCGTGAATTTCTCGCTGCCACAAAAATGCCTGCCACCTGTACGCTGAAAG
GGCTGGGCGCAGTAGAAGCAGATTATCCGTACTATCTGGGCATGCTGGGGATGCACGGCACCAAAGCGGCAAACTTCGCGGTGCAGGAGTGTGACCTGCTGATCGCCGTGGGCGCACGTTTTGATGACCGGGTGACCGGCAAACTGAACACCTTCGCGCCACACGCCAGTGTTATCCATATGGATATCGACCCGGCAGAAATGAACAAGCTGCGTCAGGCACATGTGGCATTACAAGGTGATTTAAATGCTCTGTTACCAGCATTACAGCAGCCGTTAAATCAATGACTGGCAGCAACACTGCGCGCAGCTGCGTGATGAACATTCCTGGCGTTACGACCATCCCGGTGACGCTATCTACGCGCCGTTGTTGTTAAAACAACTGTCGGATCGTAAACCTGCGGATTGCGTCGTGACCACAGATGTGGGGCAGCACCAGATGTGGGCTGCGCAGCACATCGCCCACACTCGCCCGGAAAATTTCATCACCTCCAGCGGTTTAGGTACCATGGGTTTTGGTTTACCGGCGGCGGTTGGCGCACAAGTCGCGCGACCGAACGATACCGTTGTCTGTATCTCCGGTGACGGCTCTTTCATGATGAATGTGCAAGAGCTGGGCACCGTAAAACGCAAGCAGTTACCGTTGA +>gi|49175990|ref|NC_000913.2|:3929037-3929416 +GGCTACGTTCATAAAGCCCCTTTTCCAACGAACTGCTCAGGCGGGAAATTCTTTCCGCTAATAAATGAGGGTGAGCCATAATGAAGTGGCGTCCTTTCGTCAAAAGTTCTGCGTAAATTGCGAGTATAGACGTTTCTTGCTGGTGGCTAAAATAGTCTCAAAGGGGGGGTATTTTTCTTTGAGCCAGGTTAATGTGGCCGCATTTAGGAGTACGATTTTGCCGTTAATCGTGCATACTGTGCGCTTTTTTGTGGGCCAAGGGACTAAGCACACATTTCATATTTCAACGAAAGACTAGTCTATGAGCACTGATAATAAGCAATCATTGCCCGCGATTACCCTCGCGGCGATTGGAGTTGTCTACGGCGATATTGGTACC +>gi|49175990|ref|NC_000913.2|:2296101-2296470 +GCAGCTAATGCAGAACTCTTCGGTATTGGCTTTTTCCATCCCGGTGTTAAAGCCACCCCAGAAGACGATGCCGCCAACAAAACCGATCAACAGCAGCGTCCCCAGCGCCAGACGGCTGGGGGTACGCCACCATTTCCACAGGCGCTTAATCAGACCAGGCTTACGGTCAGAATTTCCCATAATAACCTCTTATTTCCCGTAACCTTTTGATGGGGTAAAGGTATTCCCCACGATTGGCGCGGTATCGGCCTGCGGTACGTGACATTGCAGACAGAAATAACGACGCGGAGCCACTTCCGCGCCCACTTTGCCGTCGCTGTCCATAAAGTGAGTAGGACTGATACGCGGCGCGCCAGTGGTGCGATAGCT +>gi|49175990|ref|NC_000913.2|:3279597-3280042 +GGCTACATGTGCAACAAATCTGGATTCAGGGTCAATTAGCTTCGTTTTGATAGTTTGCTCCTTTATTGGGCCTTCACTTCCCCCGTAAGGCCTTTCTTTTTCTTTCGTTTTGATCTGTGCAGCGGTGTCGGATGCGACGCTAACGCGTCTTATCCGACCTACAGTTGGTGACCGCAAGGCCGGATAAAGCGTTTGCGCCGCATCCAGCAATCCCTTTTGCTTCCTTTATCTTTTCTTTCAACGATCACAAATTTCGTTTTATTTCTTTTTTCTCCATTGAACTTTCAGTTTCTTTTCTATAGATTTTAATCAACGAAAGACATCACCAAGTGAAATGAAACGAAAGGCAAGTGAAAGCGACAACGCCCGACGTCAAGTTCATCAGACTAAGGATTGAGTTATGCCAGAAAATTACACCCCTGCTGCCGCCGCAACCGGTACATGG 
+>gi|49175990|ref|NC_000913.2|:4285540-4285890 +CAACAAAAGTTGATTAATTCTTTGAGGAACATGCAGTTATGCATGCTGTTGAAAAAGAGGAAGATACTGACTAACTCTAAAGTGGTATTTTACATGCACTTACAATTGATTAAAGACAACATTCACAGTGTGGTTATTTGTTACACATAGGGGCGAGCAATGTCATGACAGTGTAGGTGCGGTTACTGTCGTGAAAAACAATAAAAACCGCCATTGCAACAATGGCGCAATTCGGATGAAGCCCCTATGACAAGGATAAAAATAAACGCACGCCGTATCTTCAGCTTATTGATTCCTTTTTTCTTTTTCACTTCTGTTCACGCTGAACAAACGGCTGCTCCCGCAAAACC +>gi|49175990|ref|NC_000913.2|:4047874-4048147 +ACAAAAAGTGCTTTCTGAACTGAACAAAAAAGAGTAAAGTTAGTCGCGTAGGGTACAGAGGTAAGATGTTCTATCTTTCAGACCTTTTACTTCACGTAATCGGATTTGGCTGAATATTTTAGCCGCCCCAGTCAGTAATGACTGGGGCGTTTTTTATTGGGCGAAAGAAAAGATCCGTAATGCCTGATGCGCTATGTTTATCAGGCCAACGGTAGAATTGTAATCTATTGAATTTACGGGCCGGATACGCCACATCCGGCACAAGCATTAAGG +>gi|49175990|ref|NC_000913.2|:862637-862959 +GACACCAGCGGCAGCCCAGCGAACAGCCTTTAAGAAATACGACCGTGCGGATACCGGGGCCATCATGGGTCGAGTAGCGCTGAATATTGAAAATCATAGTTGCCTCTCTATTTCGTTCAAGCATTAAAATACTTTCGAATGAAAGTTAGATTGATGTGCGTCAACTGTTCAGAGAGTTTTCCCGTGATAGTCTACATTCAGACAAAAAGTACATTTTGAGGATGGTTATGGAACTGTATCTGGATACTTCAGACGTTGTTGCGGTGAAGGCGCTGTCACGTATTTTTCCGCTGGCGGGTGTGACCACTAACCCAAGCATTAT +>gi|49175990|ref|NC_000913.2|:4566554-4566922 +CTTGTCCGGAGAACCCCAACGGGGAAAACGAAATTCAATACCGTCGGTAATGACGAAAGGCAGGTGAGAAGGGCAACGAGACTGCAAATGTACTGAATTTCATGTAATCAACCACACTGCCTGTGAACACACAGACAGAGAAATACTTTCTGACTTCGGGTCAGAATGTATAACGATTACAAATAGTCATCAGCACATTACACCTCTTGAATTGATGGGTTGCATAGAAAATATACACCTTAAGTGTAATTAAAATTTGCGGTTAATCAAAAGAGCGCGGTGGAAAGGGGAATATCTGCCGGGTACAGGACATGTAAAACGCGAGCTTGTTTCCGGAAAAATGTATCGTTTGCAGAAGAATAAAAATA +>gi|49175990|ref|NC_000913.2|:1276979-1277339 +TAGTCGCTTTACATCGGTAAGGGTAGGGATTTTACAGCACCGTGAAAAATCTCATAATTTTTATGAAGTCACTGTACTCACTATGGGTAATGATAAATATCAATGATAGATAAAGTTATCTTATCGTTTGATTTACATCAAATTGCCTTTAGCTACAGACACTAAGGTGGCAGACATCGAAACGAGTATCAGAGGTGTCTATGAGTCACTCATCCGCCCCCGAAAGGGCTACTGGAGCTGTCATTACAGATTGGCGACCGGAAGATCCTGCGTTCTGGCAACAACGCGGTCAACGTATTGCCAGCCGCAACCTGTGGATTTCCGTTCCCTGTCTGCTGCTGGCGTTTTGCGTATGGATGT +>gi|49175990|ref|NC_000913.2|:29479-29773 
+GAGCCACAAAATAATATAAAAAATCCCGCCATTAAGTTGACTTTTAGCGCCCATATCTCCAGAATGCCGCCGTTTGCCAGAAATTCGTCGGTAAGCAGATTTGCATTGATTTACGTCATCATTGTGAATTAATATGCAAATAAAGTGAGTGAATATTCTCTGGAGGGTGTTTTGATTAAGTCAGCGCTATTGGTTCTGGAAGACGGAACCCAGTTTCACGGTCGGGCCATAGGGGCAACAGGTTCGGCGGTTGGGGAAGTCGTTTTCAATACTTCAATGACCGGTTATCAAGAA +>gi|49175990|ref|NC_000913.2|:3637673-3638149 +AGAAATAGCGCGCCATGTTAACAATGCAAACGACACATAAAGCCCAAAATAATGCGACGGTGCTTATCATACCTCCTCCCCGGCGACCTGCCCGCGGAGTTCCACCCCGGGGCTACCGCTCCCGATACGCTGCCAATCAGTTAACACCAGGTCCTGGAGAAACCGCTTTTGTGGTGACCAACATACGAGCGGCTCTATAGATAGTGTAGGAGATCAGGTTGTTTTTTTTCCAGAAGGTTAACCACTATCAATATATTCATGTCGAAAATTTGTTTATCTAACGAGTAAGCAAGGCGGATTGACGGATCATCCGGGTCGCTATAAGGTAAGGATGGTCTTAACACTGAATCTTTACGGCTGGGTTAGCCCCGCGCACGTAGTTCGCAGGACGCGGGTGACGTAACGGCACAAGAAACGCTAGCTGGCCAGTCATCGACAACTTTATGGAAGGAGTAACACTATGGCTTATAAACACA +>gi|49175990|ref|NC_000913.2|:3655531-3655825 +TGATATTTAGCGCCGCCAGCCTGTTCAGCTTCGCCAGTTTGTTCGTTAAGCAGCAATAATTACCCCGGTTGTCACCCGGATCATAGTCACTTGATGTGACTATGATCCGATTAATACTCTCTCCGCTACGCAGTGTTGTAGATCAATTGCGCACTATCATTGAAATAATTACCTGCTAGTGATTATTTCAACCTACTGAATTTCATCTAATTTTTTTCACTCTATGGCAAATTAGCCATTTCAAACATTATCATGGCTGATATTTTCCGTAGTCAGGTTTAATGTTTTAAAAGT +>gi|49175990|ref|NC_000913.2|:3655882-3656623 +AGTAATCCGGGTTCATTTTTTTGCAACTGGCGTTGATTACATTGCATAAATATCCGTGTCTCCAGACGCTATATAAAAACCTGAAGACATGAATGCGTTATTTACTCAGGTAATTTCAATGCGTTAAAAGAAAGCTGGCAATCCAATTGCCAGCTTAAGTCGAAACAAGGAGACTCGATATTTAAATCGGATTACATTTTAACTTTAGTAATATTCTTCAGAGATCACAAACTGGTTATTGATAACTTATTCTTGGGCAGTAATCCGCAAACGTTAACTTTTTGTTTGCTATTTACAAGCTGATAACAACCAGGAATCTTACTTAGGATCAATATATGGAGTGCGTGATGGATAAATCTGAAGTATTGATTAGTGTTAATAGACGTATTAGTTCACGAAGGGTAAAGTTCTTATAGGCGTTTACTATATTGAACAACGATTCGGACAAGGATGTAAATAATGAAAAGGATGACATATTCGAAACGATAACGGCTAAGGAGCAAGTTATGATTTTTCTCATGACGAAAGATTCTTTTCTTTTACAGGGCTTTTGGCAGTTGAAAGATAATCACGAAATGATAAAAATCAATTCCCTGTCAGAGATCAAAAAAGTAGGCAATAAACCCTTCAAGGTTATCATTGATACCTATCACAATCATATCCTTGATGAAGAAGCGATTAAATTTCTGGAGAAATTAGATGCCGAGAGAATTATTGTTTTGGCACCTTATCACATCAGTA 
+>gi|49175990|ref|NC_000913.2|:4380231-4380626 +GAGATTAGTGCGATTTTTGCATTCGGATTTGCCTGCGCGGCAGCAATTGCAGCACGTAATCCCGCGCCACCGGCGCCTACAATGGCAAGATCGGCTTGAAAGGTTTGCACGACATTCCTCCAGATTGTTTTTATCCCACAGCCACGTACTTCAGGGTAAGTACCTGAAAGTTACGGTCTGCGAACGCTATTCCACTGCTCCTTTATAGGTACAACAGTATAGTCTGATGGATAAGTCTGAAATTTGACGAGATCGATTTTTTTAGTGCGCAAGAGGGCTAAATTATCACTGAAGATGATTAATTTAATTACTAAACCATCAGATCGTGCTTTTTTTAGTCACTGCCCACCGCTGTTTGATTCCTGCGTAAAATTTGTCTCGCCGTCGCGTTGCGA +>gi|49175990|ref|NC_000913.2|:83211-83804 +GTGCATTGGCGAAGTGGCAATAAAGGTATGAATACGGAAGGCTTCGGCGACTTTCAGGGATTCGGCCGCCACGTCGATATCTTTTTCCACGCAGCGAGCTAACGCACATACGCGGCTGTTTTTAACCTGGCGGGCGATGGTTTGCACCGATTCAAAATCGCCCGGCGAAGAGACGGGGAAACCGACTTCCATCACGTCAACACCCATACGCTCAAGGGCCAGCGCAATTTGCAGTTTTTCTTTCACACTCAAGCTTGCCTGTAACGCCTGTTCACCGTCGCGCAATGTGGTATCGAAAATAATGACTTGCTGGCTCATGGTTTGGGTCCTTGTCTCTTTTAGAGCGCCTCGCTTCGGGCATAAAAAAACCCGCGCAATGGCGCGGGTTTTTTGTTTGACTGCGTGCTGGCTTAATGCTGGATGCCGCTCACTCGTCTACCGCGCAAAGAAGATGCGTTTAGTAGTAGTAGACCGATAAAGCGAACGATGTGAGTCATTAAATCAGCTCCAGATGAATGCGATATGCTTTTAGAGTTACTGGATACAAAAACGGATGTCAACCCTGACGCAATAAAAACGTCCCGCCAGCGTGA +>gi|49175990|ref|NC_000913.2|:3217042-3217446 +GTGGTATCGTCCGCCAGCGGGGTATTTTGCTGGGTGACATACGGATGAGAAGACATGTTCGCGTCCTGTGCAGGTTATATGGTTGTTAACTTCTTGTCAGAGTTTATGTCGGCCCCGCTGCGGTTATCTTTAACCGATTAATTTGATTTAGATCGCAATTTGCGATTTAAACACAAATCTAATTCCTTGATTTAAAATACTTTCACTCTGTTACTATACGAAAACGTTAATTATCTTGCCCAAAAATCAGGCAATTATTGCCCTGAAAACGTGCATTTGCGCAGCAATCATCAAATCCATACCCGACAAAAACCGTGCAAAATAACAACAAATGTTAACAGATAGCATTAAATATTGCACAAATGATAACCGAATTTGTGTTTATCCCGATTTTCGCGATCGCA +>gi|49175990|ref|NC_000913.2|:528089-528372 +TAGAAGATGATTATTTTTTTCTTATTTTTCCCTAACAAATGATGATGGGGGTAGGTTTAAGAAAACATTGACAGTGAGGGCCAGGGAACAAGTGGCGAAAATCGTATCAAAGAATGATCCAGATACAAAAAAAGTGTGGTGTAAATATGGTAAGATACCAGGGCAAGGGGATGGTGTAAACCTTTTTTTTGTTGGTGAAATTAATGTTACGCATTATTTTATAACAAATATTGGAGCTGGATTGCCTGATGCTTGTGCAGAGTAATTGCTTGAATTAAGAGTC +>gi|49175990|ref|NC_000913.2|:4563747-4564069 
+AAAGGTGCAGTAAGTGCCAACCAGCGGAATGCCGCGCTCCTTGAGATCCATGACGGTGAGAAAGCCTGTCTGGCGAGCTTCAGAGAACTGATCGAAAATGGCGGGTAGATCGGTGACAAGTGACATGATTTTCCTTCCCCGTACCACGGGAGATAATGAAAAGAGGCCACATTATTACCATTCTTTTAGTATGGTTTATTTGATCTCTCTCGCGGTTAGCCACTTAGTTTTTCATGGATTTCTAAAAATTGCCGAGGGGGAACAAGGCCGGGAGCTGCGACAGCAAATATAAAATTAGCCCAATACAACCGCCAACCAGCGT +>gi|49175990|ref|NC_000913.2|:3273040-3273399 +CGAGAATAATAATCATTGTGCAAATGCTAATTTAATTAATACTATTTAAATATTATTTTGAGCATATGCACATAAGGTTGCGCGCTAAAGCACAGATTTGCGCTTTACCTTACCGGGCGGCACTGCAATCCCTGAAATGATTGACATTGATCACATTTCTGCGTTTAAACTCCTGACATTCTTATTTCACCCAATGAAGTCATTTATTTTTAAATGAGACCAGGTCCTCATTTTAATAACCCCTGGCTGGAGAATATTGCACAATGGCCAACATCGAAATCAGACAAGAAACGCCAACTGCGTTTTATATAAAAGTTCACGACACAGATAATGTGGCAATTATTGTTAATGATAATGGC +>gi|49175990|ref|NC_000913.2|:1715138-1715453 +GAGCGCGATCGGGCCTTGCGCCAGTCGCTCTGCTAACGCTTCCTCGTCCGTCACCTGGGCTACCAGCCCCCGCTCTTGCAATTGTTTAATCAAGTTACTGCTTGCCATCAAAATCTCCATGTATATAACGACTGCACCTTTGCCGGTACACGACTTTTCGCCAGATGCGAAAGAGACATAGAATAAAGTGCCAGAATCAGGAGTACCAGCGATTAAAGCAAGATTTTTGCATCTTTTCAGGGTGCAAGACGATCAATCTTCCACGCATCATTTTCACGCTGGTACAAAAAGCGGTCATGCAGGCGATGCTCACCA +>gi|49175990|ref|NC_000913.2|:4173431-4173958 +GAGCAACTGACTTGTAATCAGTAGGTCACCAGTTCGATTCCGGTAGTCGGCACCATCAAGTCCGGTGGGGTTCCCGAGCGGCCAAAGGGAGCAGACTGTAAATCTGCCGTCACAGACTTCGAAGGTTCGAATCCTTCCCCCACCACCAATTTCGGCCACGCGATGGCGTAGCCCGAGACGATAAGTTCGCTTACCGGCTCGAATAAAGAGAGCTTCTCTCGATATTCAGTGCAGAATGAAAATCAGGTAGCCGAGTTCCAGGATGCGGGCATCGTATAATGGCTATTACCTCAGCCTTCCAAGCTGATGATGCGGGTTCGATTCCCGCTGCCCGCTCCAAGATGTGCTGATATAGCTCAGTTGGTAGAGCGCACCCTTGGTAAGGGTGAGGTCGGCAGTTCGAATCTGCCTATCAGCACCACTTCTTTTCTCCTCCCTGTTTTTTCCTTCTGTTTATTGCATTCAACAAGTCGGGCATGTTGCCTGGTTGATGTGGTGATATCACCGATTTATCCGTGTCTTAGAGG +>gi|49175990|ref|NC_000913.2|:3653831-3654374 
+ACCCTGCTCCTTTGCCGCGCATTTGCCTTTCCATGCCAATAGCGCCACAGGCAATTGCCGCGAGTATAAGACGTATAATAAATTCTGCTGTCATATAAAAATGCAAAAAGGAGCAGCAAGATGGCTCAACTTGCTACTCCTTTTTACTTGCACCTCATTAATTCGGCAAGTCATTAGATGCTTGATTTTTGAAGGTATACAAATTTTTCTGCGGGTTTTTCTTACAGTATTCGATCACTTTAGGAATTTGAGTGAGATCGGTTTCATTTAAAGTAACGGTATCGCCACCTTTATATACTGTTTCTTCATGCAGCATCCACCATGCAACCGGGGTCATTGCTTTTGGATTCAGATCAATAAATTCCTGGCAGGTCATATCTTTAGCGGATTCATTGGCTGCCAACGCAGATTGTGCGTTCACCAGTGACAAAGCCGCTACAGCGCCCATAAAAATAAACGCTTTACGGAGAGATGAAATATTCATTTTGTAACCCATTCAATATAGAGATTATATATGAACAGGAGGAATAGCTGCTTTTAAAG +>gi|49175990|ref|NC_000913.2|:3451073-3451516 +CAGACGCAAGTGAGTACGGATTTCGTACTGATCGCGCGCGTCTTTGTTGACGTGCGGGGAGATCAGAACAGTGAAGCGCTCTTTGCGTGTCGGCAGCGGGATCGGACCACGGACCTGCGCACCAGTGCGCTTGGCAGTCTCGACGATTTCCGCGGTTGCTTGATCGATCAGACGATGATCAAACGCTTTCAGGCGGATACGGATTCTTTGGTTCTGCATGAGACCAGAGCTCCAATTATTTTATAAACGAAAATGATTACTCCTCAGACCCATTACGATTGATGGGAGAGTGTAACCGTTCTTACGTAGCTCCCCGATTGGGAGCATTGTTAGGTAGCCAAATTCGGCTAACTGAGGTTCAGATTGAACCTGCTGTCAACTACGACAAGCCCGCGCATTATACATACTTAACCACCGAACGCAAGCGTATTGCTAGTAAAGTA +>gi|49175990|ref|NC_000913.2|:3265481-3265750 +TTTGTTGGAGAACAAACATTTATTTTATCAATATTTTAAAATTTCGAATACATGTATTGATCATCTCGAACAATTGATTAACGTCAACTTTTTCTCTTCTGACAGGACGTCATTTTGTGAATGCAATCGTTTTCCATAAATTCTTCTCCCCTCATAGGCGACGAATAGCATTTTGTGTTGAGGATCACAAAACGAATAATTGCTGATCGCCGCGATAAGGTCAGACAAAGACAACAAGGGAAATTTTCACAGAGCTTTTGATCGGCGTA +>gi|49175990|ref|NC_000913.2|:2816440-2816679 +GCGGTTATTACCGCCACCTCTTCCATACCTTCATCGCTGAAAGTACGTAGAATATGGTGCATCCGGGAGGATTCGAACCTCCGACCGCTCGGTTCGTAGCCGAGTACTCTATCCAGCTGAGCTACGGATGCAAATGGCGGTGAGGCGGGGATTCGAACCCCGGATGCAGCTTTTGACCGCATACTCCCTTAGCAGGGGAGCGCCTTCAGCCTCTCGGCCACCTCACCACACGCCTCTTA +>gi|49175990|ref|NC_000913.2|:3181381-3181799 
+TCTGCCAGCTTCACGAAAGCCTTATTCTACCTTTTCGGCTGTCTCTTCTCTCGTACTGTTTAAGTATTTGTTCCGCGCTTCAAGCAGTGCGTCTCTGTCCTGGCGAAACGCCCGACAATAACGTTTCAAATGCAGGAAATAACCCACCACAACCAGAGCAATCAATACAATCCAGTACCAGGCGATAAACATCCTTTTACCCTTAATATTTAACAATACTAAACATTTAGCGTATAAATTTCACATATCCTTTTTCGGATATATTCATATGGTCGCATAATAACGGAACAATTATATGATTTTGATCATAATGAAAAAGAATGCCTGATATTAATGAAATTTTCTCATAGAAAGAGAAGCGGAATTAACATCTTGCATAGCAGGGAAAAATAAAAACCGAAGCCTGCTACAACTTCGG +>gi|49175990|ref|NC_000913.2|:2087866-2088270 +GCTAAGAGGACAGTCCTCTTAGCCCCCTCCTTTCCCCGCTCATTCATTAAACAAATCCATTGCCATAAAATATATAAAAAAGCCCTTGCTTTCTAACGTGAAAGTGGTTTAGGTTAAAAGACATCAGTTGAATAAACATTCACAGAGACTTTTATGACACGCGTTCAATTTAAACACCACCATCATCACCATCATCCTGACTAGTCTTTCAGGCGATGTGTGCTGGAAGACATTCAGATCTTCCAGTGGTGCATGAACGCATGAGAAAGCCCCCGGAAGATCACCTTCCGGGGGCTTTTTTATTGCGCGGTTGATAACGGTTCAGACAGGTTTAAAGAGGAATAACAAAATGACAGACAACACTCGTTTACGCATAGCTATGCAGAAATCCGGCCGTTTAAGTG +>gi|49175990|ref|NC_000913.2|:1433561-1433885 +CCTCTTCCTCAACGTGGCTAATCACGCGTTGAGTTAATTCTGAATCGGAAATATCGATAGGGACAAGAATCGTTCTGTTCATAAAACCTCCTGTTTTAGTATCCGCATAAAGTGTAACGCCAGATGACACTTTTTGTGTAATGACGGAGTTCACATTTTTAATTTAGATCAAAGGAGGAAGAATAAGCAGAAAAAGCCCGCCATAACAGCGGGCAGGAGGATTTAGAACTGATAAACCAGACCTAAAGCGACAATATCATCGGTAGAGATGCCATTGGCAGCGTAGAAGCTGTCATCTTCATCCAACAGGTTGATTTTATAGTC +>gi|49175990|ref|NC_000913.2|:1030680-1030959 +ATAATGATAATGCGTAAGGGCACCCAGAAGTTTTACCCATCTTTACGCATTTGATCTGGAACAGGTTTAACAGCGGATTATCAGGTCATTAAGCAAATATAACGCCCTGAGAATTTCGACAGGCAAAAGAAAAAGGGGTTAGCATTTAGCTAACCCCTTATCTTATTTGGCGGAAGCGCAGAGATTCGAACTCTGGAACCCTTTCGGGTCGCCGGTTTTCAAGACCGGTGCCTTCAACCGCTCGGCCACACTTCCGGAATGACGCGCACTATAAACATC +>gi|49175990|ref|NC_000913.2|:3654504-3655215 
+AAGCCTGAACGATAGCTGGGGTTACGGTTGCAATACCCTGAACATCTAAAACCGCATCTTCTGGTTTATCTTTGTTGTTCAGCGCTTCAGCAAAACCAACTGCAGTTGGCTGGAAGGATTCGTCCACAGCCAGGAAATCTTCACAGGTCCAGGAGTTGACCGGTTTTTTGTTATCAGCTGCTTTTTGCGCATCCGCTGCATTGCTCACAACTGGCAGAAGAAGCAGACCACCAAGAATAACGCCTAATACTTTTTTCATCGTAATATCCTCAACTATAAAGTGAAAGAGCCGTCACGAATCAATTTCGACACTGAGGTTATAACCTGGTTTTCTGTATATGTCATGTTGATGGAAAATATCAAAATCAGATATTTTTATTTCAATACAATGAGTTACAGATGCATCAGATACTGCAATTAGGAAATTTTTATTAAATCGACTGCATTCTTAGACGCGTTTTTGGCATAGATTGATAGCAGGGGATTTTCTTCTTAATTTTATAGGGTGGTTCTATGTTATATATAGATAAGGCAACAATTTTGAAGTTTGATCTGGAGATGCTTAAAAAACATCGCAGAGCAATCCAGTTTATTGCCGTGCTGCTGTTTATCGTCGGGTTGCTGTGTATCAGTTTCCCGTTCGTCTCTGGCGATATTTTAAGCACAGTAGTGGGTGCATTATTAATCTGCTCGGGTATTGCGCTTATTGTC +>gi|49175990|ref|NC_000913.2|:3717791-3718143 +ACTTTTATCCACTTTATTGCTGTTTACGGTCCTGATGACAGGACCGTTTTCCAACCGATTAATCATAAATATGAAAAATAATTGTTGCATCACCCGCCAATGCGTGGCTTAATGCACATCAACGGTTTGACGTACAGACCATTAAAGCAGTGTAGTAAGGCAAGTCCCTTCAAGAGTTATCGTTGATACCCCTCGTAGTGCACATTCCTTTAACGCTTCAAAATCTGTAAAGCACGCCATATCGCCGAAAGGCACACTTAATTATTAAAGGTAATACACTATGTCCGGTAAAATGACTGGTATCGTAAAATGGTTCAACGCTGACAAAGGCTTCGGCTTCATCACTCCTGAC +>gi|49175990|ref|NC_000913.2|:696051-696422 +CAATAACCGGGCGGTGAACCGCTTACTCGAAGAAGATGGCTGGGGTACCTGGATTCGAACCAGGGAATGCCGGTATCAAAAACCGGTGCCTTACCGCTTGGCGATACCCCATCCGTACAACGCTTTCTGGTGAATGGTGCGGGAGGCGAGACTTGAACTCGCACACCTTGCGGCGCCAGAACCTAAATCTGGTGCGTCTACCAATTTCGCCACTCCCGCAAAAAAAGATGGTGGCTACGACGGGATTCGAACCTGTGACCCCATCATTATGAGTGATGTGCTCTAACCAACTGAGCTACGTAGCCATCTTTTTTTTCGCGATACCTTATCGGCGTTGCGGGGCGCATTATGCGTATAGAGCCTTGCAGCGT +>gi|49175990|ref|NC_000913.2|:4584709-4584981 +TTCCGCCGCACAGGCGATGGCATAAGTGAAGTCGTTGACGCCCTTCAGGAATTCAAAATTGGATTTATTCATCATTGTTATTAATCCATTGCTGTGCGGGCCTGTCCAAATATTTAAGGCCCATAACATCTCATCTTAGCTTTCTGTACCTTTCCGGGCAATGACCACGGTCACAGCAACTGACTCATTTCTAACGTGTTCGTCTATTTTTGTAGTGCTATAGTAGCCGAAAAACATCTACCTGATTCTGCAAGGATGTACTATGACGGTTC +>gi|49175990|ref|NC_000913.2|:3208663-3209135 
+CACCCTTATAAAAGTCCCTTTCAAAAAAGGCCGCGGTGCTTTACAAAGCAGCAGCAATTGCAGTAAAATTCCGCACCATTTTGAAATAAGCTGGCGTTGATGCCAGCGGCAAACCGAATTAATCAAAGGTGAGAGGCACATGCCGGTAATTAAAGTACGTGAAAACGAGCCGTTCGACGTAGCTCTGCGTCGCTTCAAGCGTTCCTGCGAAAAAGCAGGTGTTCTGGCGGAAGTTCGTCGTCGTGAGTTCTATGAAAAACCGACTACCGAACGTAAGCGCGCTAAAGCTTCTGCAGTGAAACGTCACGCGAAGAAACTGGCTCGCGAAAACGCACGCCGCACTCGTCTGTACTAATTCCCCGAGAGCGTTGCTCTCCGATCAGACCGAGTTGTAGTTGTAAGGCCGTGCTTCCGAAAGGAATGCGCGGCTTATTTTCGTTTATGAATTGCTAAAAATCGGGGCCTATGGCTG +>gi|49175990|ref|NC_000913.2|:3319994-3320693 +GAGTAAAGCAACTGGACGAGATACAGATACCTGACAACCATTCCTCAGACCAGGACCAAAACGAAAAAAGACGCTTTTCAGCGTCTCTTTTCTGGAATTTGGTACCGAGGACGGGACTTGAACCCGTAAGCCCTATTGGGCACTACCACCTCAAGGTAGCGTGTCTACCAATTCCACCACCTCGGCACGGATACTACTTTTTAGTTCGGGATATCGCTGGTCGGCTTAGCCGGAGCAGCTGGCTGAGTTTGTTCGGTTTTCGCCGGTGCACTCAGATTTTCCCATTCGCTACCTTTATTGGTTTTGTTGCTATTGATGTTACCCAGCACCAGACTGATGATGAAGAATAACGTTGCCAGCAGCGCCGTCATGCGGGTCATGAAGTTACCAGAACCACTTGAACCAAACAGCGTAGCGGAAGCGCCTGCTCCGAAGGAGGCTCCCATATCAGCGCCTTTACCTTGCTGCAGCATGATCAGACCAACAAGGCCAATTGCCACAATAAGGAAAACTACTAAAAGAGCTTCATACATAATCAACCTGTTCCTTGCGGAGTTGCCGCGTACCAATGCTTCAACCAATAAAGCGGGAGTTTTTAATCTTTCCCACTGAAGCGGGTGTGAATACTAACCAAAGCGAATGACCTTCGCAAGGGCAATTTTATCGCATTGTATCAACTGCGGAAAAAAACAGCAAAAC +>gi|49175990|ref|NC_000913.2|:2922070-2922516 +TGATGTTCACGCAGTAACGCTTCAAGCGCCTGCAACTGGAGACGAACGCGGTCATGAGTGGTCATAAAGCAACCTCAATAAGAAAAACTGCCGCGAAGGATAGCAGGAATAAAAAAAGGGAGCACTGTATTCACAGCGCTCCCGGTTCGTTTCGCAGCATTCCAGCTACTTTTGTTGCTCCCTGCTCATCCTTGACAACTTTTCCTCTGGCCTTGCGGCCAATCGTTCATCCTGAACTATTGCTTCCTGCTCACACCACCCCGATGTGATACTTCATCCTGAAGTGTCCCTGGCCTTCCTGACCCACCGAATCATCCTGACCGGTTCTCATTCTCCATCCTGGAGGTGTCCTTTAACGCGTCCTGCGTCATCCTCTTCGCTTCATCCAGAAGCCTTTCCCTGAAACACCATCCTGGTGTGTCCTGCAGAAGTGTCATCATCCTGAT +>gi|49175990|ref|NC_000913.2|:2263970-2264309 
+CAGAATACTTAAGCCCGGGCGAAAAAATTCGTATCCATATCGAAGAACGCCGTTATATGGGGCGTGCTGACTAACTTCAGCCGCATGCAGAAAAGGGATAGCTCAGGCTGTCCCTTTTTTAATTTATTATACCAATCTTCTATTTTGCGCTTCCTGAACACCTTACGCCCTCAATTTTCACTCGTTGATTGATCGCCCTCACACTTCATCGCATTAACAATCCAGACCAATTTCAATTGCTGTCATATAACTTTACACTGTCGTTGTTAATTAATCGTTACTAAGACGTGACTCCTATGAATACAATCGCCTCCGTTACGCTCCCGCATCATGTACACG +>gi|49175990|ref|NC_000913.2|:262004-262259 +GCCATTTGATTCACAAGGCCATTGACGCATCGCCCGGTTAGTTTTAACCTTGTCCACCGTGATTCACGTTCGTGAACATGTCCTTTCAGGGCCGATATAGCTCAGTTGGTAGAGCAGCGCATTCGTAATGCGAAGGTCGTAGGTTCGACTCCTATTATCGGCACCATTAAAATCAAATTGTTACGTAAGATCTTATCATTCTCCCACCAAAAAATTATCTTAATGTAACAGCTGGTGTAAGTAAATTCTATCAAC +>gi|49175990|ref|NC_000913.2|:746918-747181 +AGATGTACCGCGAGGAAACCAATAGAAAGGAGAAGTGATAGCACACATAACATTCCCCACTGCAAAACTGGCATCCTTCGCTCCCTCATTATTTTCTCTTTGGTGTATCGATACAGCATAAAGAAAGCTGGGCCAGGAAAAAATAACAGCAATCAAGGAAAAGGGGAAAATCAGCAATTTTCTGAAAGAGATGCCCTTTCCGGCGGCAAAGGGCATCATGGTAAATCAGTTATAGTTAATCTTAAAAAGCACCACTGTATCGA +>gi|49175990|ref|NC_000913.2|:1184881-1185222 +AGGGCGGTAATTCTACGGCAAACCGCTTGAATCGCCAATCTTTGTTGTGAATTACTGGCTTAGCTTTATATTCATTAAGGTAATGCTGATAAATATTCCCGCTTGCAGGGGTAAAAGTGACCTGACGCAATATTTGTCTTTTCTTGCTTCTTAATAATGTTGTCACAAAAAGTGAGGGTGACTACATGGATAAACTACTTGAGCGATTTTTGAACTACGTGTCTCTGGATACCCAATCAAAAGCAGGGGTGAGACAGGTTCCCAGCACGGAAGGCCAATGGAAGTTATTGCATCTGCTGAAAGAGCAGCTCGAAGAGATGGGGCTTATCAATGTGACCTTA +>gi|49175990|ref|NC_000913.2|:2561309-2561602 +AATCCCCGCTGACATTATATCACCCATCTTTCCGTTTTCAGGTGGGAGACTGATGGCCGCCGACGTTGCACAATTGATTAATTTCTGAACATTCAAATCAACTCCCACCAAAGCCAACAAGATGGGAGGTGAATTTTGTGATGCAGATCGCTTTTTCCCTTCAGGCATTTTTGTTTAATTCAATCAACGAAAGGCAGTTCCCAACAACAAAATAAAGATGGGAGGTAAAAATGAAACGGTGGGAAGTGGCTTTACTGGTGATGGTAATGTTGGTTTGCTGTATTGAACTGTGA +>gi|49175990|ref|NC_000913.2|:3739510-3739802 
+CAATCCTGATTCTCATGATGCGACACTGCGCAAGCCACTTCACAGGTACGGCAACCGATACATTTCGTCGCATCCGCAATAATAAACCGGTTCATGATCACTCCTTTGCACTAAAACAGCAGAGTGCCAGAGAGCGCGTCCCTACTGCTTTGATCCAACGAAGGAAATGTTAAAGTTTTGGCACTTGTCCAAAGCGTTATGTGATCGCGCCCAGGTCATCCCTGACAGTAAATCCCAGTTCATTAGAAATAGCCTGCGCGGTTTCACGCAGTGGTTTCAGGAGATTTTTCTC +>gi|49175990|ref|NC_000913.2|:2341371-2341736 +TGTTCTCATCGACTAACACGCCTCCGTTCTGGCTGTAGCTGTCATCAACAGAAATATCAATAAGATAGGGAGCCGTCACGCTATTGGTATATATCGCCCCGCCATAACCTTCGGCAGTGTTATTTGTAAAGGCAGTGTTATTTACTATCGTATAACCTGAAGGATGCTTGCTGTCGCTATTATTATCGGTAACATCGATTGCCCCGCCATCGCCATCACTGTAACTTGTTGATGTATATGCCTGGTTGTTATCAAAAATAACATCACTTAAATAAACGTCATTATTAATGGTATAAATTGCGCCACCTTTGCCATCATTAGCGATGTTATTGCGAAACATGGCGTTAGTGACACGTAAATCGACG +>gi|49175990|ref|NC_000913.2|:3305870-3306784 +GAATTCAGCCATGTAGTACGTGTGCCTCAAAATTAATGGCGGCCAGTCTACATAACTCATCATGAAATTGATCAGCAATTTTCATTGAAAAGTGTGAACCGGCTCAAAGTAGGTGTATTAACGAACAACAACGCCCTCACCCGTTAAGGTGATGGCAATCAAAAAAGATTACGGGCTGATGTGTACGTCAGCTATTGCTGGTCCGATTCTGCCAGGTCATCTTGGTCCTGGCCCAGGAGCGATAATTCCAACAATGCGTATCGGTGCTCAACAAAGTTATGAACGTTGTTGGCAACCGCCAGTTTGAACAGTGCCGTGGCGCTGTCCAAATCCCCCAGACTTAGGTAGTACTTACCTAAATAGAAGTTGGTTTCACTGAGATGCTCAGCGAGCGAGGTGTTATCCGTTGCGTCCGCCTTGAGCCTTTCCATTAACGTTTGTTCGCTAATGTTGCCCAGGTAGAACTCGACAATGTTCCATCCCCACTGTTCCTTATCCGATTTTTCGAAGTGCTGTTTCAACACTTCTTTAGCCTGCTTCTCATCGAGCTTCTGCTCGGCGAGATAAAGCCACAGACTACGGAAAGGATCATTGGGATCGTCTTGATAAAACGCCAGCAGATCATCTTGCGCTAACTTGTCACGACCGCCGTAATATAATGCGATCCCGCGATTCAAGTGCGCGTAGTTGTAAGTTGGATCAAGCTCAAGTACAGAATCAAACGCTTCATAGGCAGCATCAAAATTGCCTGCCTGCGTTAAATATATGCCTAAGTAATTGAATACTTCAGGCATATCCGGTCGGATTGCCAGCGCTTGCGAAAAATCGTTACGCGCTAATGCCCTCAGACCGAGACTATCATACAACACTCCGCGCTCATATAAAAGCTGTGCGCGTTCGTCATCGGTTAAAGC +>gi|49175990|ref|NC_000913.2|:2734847-2735094 +GAATGCGCAAGCTGAAAAAGTAGCGAAAATCATCGCCGCAAACAGCAGCAATACATAACAGAAACCTGAAACACAAAACGGCAGCCCTTGAGCTGCCGTTTTTTTATTCTGTCAGTTGTGAAACTGAAGCGATTTAGTCGCTATCGATCTCATCAAATATGGCTCGCTTTGAGATATTCCTCAAGTAAAAAAACATCTCTTCCTGCGATTTCTCACAAAAAAGATTCGTTGACAAAAAGTGACAAAA 
+>gi|49175990|ref|NC_000913.2|:1031117-1031819 +AAAAGAAAAATAGGTCCATTTTTATCGCTAAAAGATAAATCCACACAGTTTGTATTGTTTTGTGCAAAAGTTTCACTACGCTTTATTAACAATACTTTCTGGCGACGTGCGCCAGTGCAGAAGGATGAGCTTTCGTTTTCAGCATCTCACGTGAAGCGATGGTTTGCCTTGCTACAGGGACGTCGCTTGCCGACCATAAGCGCCCGGTGTCCTGCCGGTGTCGCAAGGAGGAGAGACGTGCGATATGAATAACGAGGAAACATTTTACCAGGCCATGCGGCGTCAGGGCGTTACCCGGCGCAGCTTTCTCAAATATTGTAGTCTGGCTGCCACGTCGCTGGGATTAGGCGCGGGAATGGCACCAAAGATTGCCTGGGCGCTGGAGAACAAACCGCGCATTCCGGTGGTATGGATCCACGGTCTGGAATGCACCTGCTGTACCGAATCTTTTATCCGCTCCGCTCACCCACTGGCGAAGGACGTCATCCTTTCCCTGATTTCCCTCGATTACGACGATACTTTGATGGCTGCCGCCGGAACCCAGGCGGAAGAAGTCTTTGAAGACATCATCACGCAATACAATGGCAAATATATCCTCGCAGTAGAAGGTAATCCGCCGCTGGGCGAGCAGGGGATGTTCTGTATCAGCAGCGGTCGACCGTTTATTGAGAAACTCAAACGTGCCGCTGCCGGAGCCAGCGC +>gi|49175990|ref|NC_000913.2|:236875-237105 +GACGAAGGTCGAGGCAATCCGTAATATTCGCCTCGTTCCCAACGGAACACAACGCGGAGCGGTAGTTCAGTCGGTTAGAATACCTGCCTGTCACGCAGGGGGTCGCGGGTTCGAGTCCCGTCCGTTCCGCCACTATTCACTCATGAAAATGAGTTCAGAGAGCCGCAAGATTTTTAATTTTGCGGTTTTTTTGTATTTGAATTCCACCATTTCTCTGTTCAATGATTTTA +>gi|49175990|ref|NC_000913.2|:2057821-2058054 +AAATCGGGTTTGACAAAAGATTTTTCGCCGTTAAGATGTGCCTCAACAACGATTCCTCTGTAGTTCAGTCGGTAGAACGGCGGACTGTTAATCCGTATGTCACTGGTTCGAGTCCAGTCAGAGGAGCCAAATTCCTGAAAAGCCCGCTTTTATAGCGGGATTTTTGCTATATCTGATAATCAATTTCCTCTTCACTGCTTTCCATCACCTGCCGCTTGATATCCTCAACTGAC +>gi|49175990|ref|NC_000913.2|:960806-961371 +CGATGGCCGCGACATGGGAACGGTGGTATTCCCTGATGCACCAGTGAAAATTTTCCTTGACGCCTCCTCGGAAGAACGTGCGCATCGCCGCATGCTACAGTTGCAGGAGAAGGGCTTTAGTGTTAACTTTGAGCGCCTTTTGGCCGAGATCAAAGAACGCGACGACCGCGATCGTAACCGAGCGGTAGCGCCACTGGTTCCGGCAGCCGATGCTTTAGTGTTGGATTCCACCACCTTAAGCATTGAGCAAGTGATTGAAAAAGCGCTACAATACGCGCGCCAGAAATTGGCTCTCGCATAAGCGACCGAATTTGCAGTACCCCCGTTGCAATGGAATGACAGCGGGTATGTTAAACAACCCCATCCGGCATGGAGCCAGGTGGACGTTAAATATAAACCTGAAGATTAAACATGACTGAATCTTTTGCTCAACTCTTTGAAGAGTCCTTAAAAGAAATCGAAACCCGCCCGGGTTCTATCGTTCGTGGCGTTGTTGTTGCTATCGACAAAGACGTAGTACTGGTTGACGCTGGTCTGAAATCTGAGTCCGCCATCCCGGCTGAGC +>gi|49175990|ref|NC_000913.2|:1777032-1777311 
+CCGTACAGCGCACAGGTACACGTCGTTGCCGGATCGTTAACGTCAGAGGCCACCGCATGCAGCCTGCCGCGTTCTATCACCACAATGTCATCCGCATGTGCGACATATAACGAAGAATCGATGGTAAACCTTGCAACCCCTTTCTTGACGTAAATCAACTCGGTTTCATTGTCATGAACGTGATGTCCGGACTCCCATTTTGGATCATCGCTAAATGCAAAACGTGAAAGCCGTGGCGTTTTACCGGCGACAAACAGCGTTTCACTGGCGTTATCAAAA +>gi|49175990|ref|NC_000913.2|:2904797-2905068 +GCAGCAGTACCAACAGCCAGGTCAGCGATGGTAGCGTCTTCAGTTTCGCCAGAACGGTGAGAGATAACTGCAGTGTAGCCAGCATCTTTCGCCATCTTGATTGCAGCCAGAGTTTCGGTCAGAGAACCGATCTGGTTGAATTTGATCAGGATGGAGTTAGCGATACCTTTTTCGATACCTTCTTTCAGGATCTTGGTGTTGGTTACGAACAGGTCGTCACCAACCAGCTGGATTTTGTCGCCCAGAACTTTGGTCTGGTATGCGAAACCGT +>gi|49175990|ref|NC_000913.2|:4176236-4176652 +CTGAAAGTGTCTGTTTCTATCTTCGGTCGTGCGACCCCGGTAGAGCTGGACTTCAGCCAGGTTGAAAAAGCCTAACCCAGCGATCAAAAAAGCGGCGATTTAATCGTTGCACAAGGCGTGAGATTGGAATACAATTTCGCGCCTTTTGTTTTTATGGGCCTTGCCCGTAAAACGATTTTTTATATCACGGGGAGCCTCTCAGAGGCGTTATTACCCAACTTGAGGAATTTATAATGGCTAAGAAAGTACAAGCCTATGTCAAGCTGCAGGTTGCAGCTGGTATGGCTAACCCGAGTCCGCCAGTAGGTCCGGCTCTGGGTCAGCAGGGCGTAAACATCATGGAATTCTGCAAAGCGTTCAACGCAAAAACTGATTCCATCGAAAAAGGTCTGCCGATTCCGGTAGTAATCACCGTT +>gi|49175990|ref|NC_000913.2|:2753452-2754110 +CGCGAATGAACATCTTATTGGCTATCACATCCGACACAAATGTTGCCATCCCATTGCTTAATCGAATAAAAATCAGGCTACATGGGTGCTAAATCTTTAACGATAACGCCATTGAGGCTGGTCATGGCGCTCATAAATCTGGTATACTTACCTTTACACATTGGGGCTGATTCTGGATTCGACGGGATTTGCGAAACCCAAGGTGCATGCCGAGGGGCGGTTGGCCTCGTAAAAAGCCGCAAAAAATAGTCGCAAACGACGAAAACTACGCTTTAGCAGCTTAATAACCTGCTTAGAGCCCTCTCTCCCTAGCCTCCGCTCTTAGGACGGGGATCAAGAGAGGTCAAACCCAAAAGAGATCGCGTGGAAGCCCTGCCTGGGGTTGAAGCGTTAAAACTTAATCAGGCTAGTTTGTTAGTGGCGTGTCCGTCCGCAGCTGGCAAGCGAATGTAAAGACTGACTAAGCATGTAGTACCGAGGATGTAGGAATTTCGGACGCGGGTTCAACTCCCGCCAGCTCCACCAAAATTCTCCATCGGTGATTACCAGAGTCATCCGATGAAGTCCTAAGAGCCCGCACGGCGCAAGCCCTGCGGGCTTTTTTGTGCCCTCAATTTGTCCCGCGAAGTCCGAAGAGAACTAATTAAATCCGAACCTT +>gi|49175990|ref|NC_000913.2|:3054001-3054292 
+AAAATTTCTCTGAGATGTTCGCAAGCGGGCCAGTCCCCTGAGCCGATATTTCATACCACAAGAATGTGGCGCTCCGCGGTTGGTGAGCATGCTCGGTCCGTCCGAGAAGCCTTAAAACTGCGACGACACATTCACCTTGAACCAAGGGTTCAAGGGTTACAGCCTGCGGCGGCATCTCGGAGATTCCCTTCTTATCTGGCACCAGCCATGACGCAACTACCAGAACTCCCACTGACATTATCCCGACAAGAAATCCGCAAAATGATTCGGCAACGTCGTCGTGCGTTAACG +>gi|49175990|ref|NC_000913.2|:4177543-4178432 +TCCACCACCATGGGTGCAGGTGTTGCAGTTGACCAGGCTGGCCTGAGCGCTTCTGTAAACTAATGCCTTTACGTGGGCGGTGATTTTGTCTACAATCTTACCCCCACGTATAATGCTTAATGCAGACGTATATCCGAGATATTCGGGTTGTGGCAAGGCGGCAACTGAGTGAGTCGCCAGGAGCATAGCTAACTATGTGACTGGTGCGAATGAAGGAAGCCAACGCCGTCACAAGCTGAATAGCGACGGATAGAAAAGATTTGTTCGTTGGAGCCTGGCCTATCCAGGCCTCCGTCGAAGACCGCAGGAGTTTCGCAAGAAACTTAATCCCCTGCGTAGACGGTGACAGAACGCTAAGATTATTCTTTTATATTCTGGCTTGTTTCTGCTCACCGTAATTAAGACGCTCTCTCCGTTTGGAGGAGTGAAGTGAGTTCCAGAGATTTTCTCTGGCAAACATCCAGGAGCAAAGCTAATGGCTTTAAATCTTCAAGACAAACAAGCGATTGTTGCTGAAGTCAGCGAAGTAGCCAAAGGCGCGCTGTCTGCAGTAGTTGCGGATTCCCGTGGCGTAACTGTAGATAAAATGACTGAACTGCGTAAAGCAGGTCGCGAAGCTGGCGTATACATGCGTGTTGTTCGTAACACCCTGCTGCGCCGTGCTGTTGAAGGTACTCCGTTCGAGTGCCTGAAAGACGCGTTTGTTGGTCCGACCCTGATTGCATACTCTATGGAACACCCGGGCGCTGCTGCTCGTCTGTTCAAAGAGTTCGCGAAAGCGAATGCAAAATTTGAGGTCAAAGCCGCTGCCTTTGAAGGTGAGCTGATCCCGGCGTCTCAGATCGACCGCCTGGCAACTCTGCCGACCTACGAAGAAGCAATTGCACGC +>gi|49175990|ref|NC_000913.2|:2689043-2689307 +AGGGGCAGCAGAATCAGCAAAAATGCCAGCATTACCAGTTGTCGTAATGAGCGGGGAAAAACGGGCCAGCGTTTCAAGGTGTTACTCTCGTCAGACGCGAATAGCCTGATGCTAACCGAGGGGAAGTTCAGATACAACAAAGCCGGGAATTACCCGGCTTTGTTATGGAATAAGGCGGTGCCTAACTCGACGTTTCGCCCGATGGTTGATATAGCTACGCTGATATCAGAAGTTGGACGGCAGGCACCTTGTTGTGCGTCATTC +>gi|49175990|ref|NC_000913.2|:20750-21323 
+AAGCAGCAGGCATAAAAAAACCCGCTTGCGCGGGCTTTTTCACAAAGCTTCAGCAAATTGGCGATTAAGCCAGTTTGTTGATCTGTGCAGTCAGGTTAGCCTTATGACGTGCAGCTTTGTTTTTGTGGATCAGACCTTTAGCAGCCTGACGGTCCACGATCGGTTGCATTTCGTTAAATGCTTTCTGTGCAGCAGCTTTGTCGCCAGCTTCGATAGCTGCGTATACTTTCTTGATGAAAGTACGCATCATAGAGCGACGGCTTGCGTTGTGCTTACGAGCCTTTTCAGACTGAATGGCGCGCTTCTTAGCTGATTTGATATTAGCCAAGGTCCAACTCCCAAATGTGTTCTATATGGACAATTCAAAGGCCGAGGAATATGCCCTTTTAGCCTTCTTTTGTCAATGGATTTGTGCAAATAAGCGCCGTTAATGTGCCGGCACTCGTTACGTAGTGATGGCGCAGGATTCTACCAGCTTGCGGGGTGTGAATACAGCTTTTCCGCGATAAAAATTGCAGCAGGCGGTCAGTTTCTTCCCGTGATTTGCGCCATGGCAATGAAAAGCCACTTCTT +>gi|49175990|ref|NC_000913.2|:3408085-3408483 +GAAAAATTGAGAACTTACTCAAATTTCTTTGAGTGTAAATTTTAGTCACTATTTTCTAATATGATGATTTTTATGAGTAATTATCGCACCACGCTCATTTTAAATGCAATTCTTTGATCCATCTCAGAGGATTGGTCAAAGTTTGGCCTTTCATCTCGTGCAAAAAATGCGTAATATACGCCGCCTTGCAGTCACAGTATGGTCATTTCTTAACTCATGCGCATCGGACAATATCAGCTCAGAAATCGCCTGATCGCAGCGCCCATGGCTGGCATTACAGACAGACCTTTTCGGACGTTGTGCTACGAGATGGGAGCCGGATTGACAGTATCCGAGATGATGTCTTCTAACCCACAGGTTTGGGAAAGCGACAAATCTCGTTTACGGATGGTGCACAT +>gi|49175990|ref|NC_000913.2|:1755463-1755736 +TACTGGGCGCGGTAATCCTGGGTTCTACTCTGCTGGCAGGTTGCTCCAGCAACGCTAAAATCGATCAGCTGTCTTCTGACGTTCAGACTCTGAACGCTAAAGTTGACCAGCTGAGCAACGACGTGAACGCAATGCGTTCCGACGTTCAGGCTGCTAAAGATGACGCAGCTCGTGCTAACCAGCGTCTGGACAACATGGCTACTAAATACCGCAAGTAATAGTACCTGTGAAGTGAAAAATGGCGCACATTGTGCGCCATTTTTTTTGTCTGCC +>gi|49175990|ref|NC_000913.2|:3376330-3376928 +CCAGCGGGCCTTTTGGCAACATGCCTTTAACCGCGATTTCAATCACACGCTCAGGACGGCGAGCAATCATCTCTTCAAAGGTCGCTTGTTTGATACCACCGATGTGGCCGGTGTGGTGATAGTACACTTTGTCAGTACGCTTGTTGCCGGTTACAGCAACTTTGTCAGCGTTCAGAACGATGATGTAATCACCGGTATCTACGTGCGGAGTGTATTCCGCTTTGTGCTTACCGCGCAGGCGACGAGCCAGTTCAGTAGCCAGACGGCCCAGAGTTTTACCGGTCGCGTCAACAACATACCAGTCGCGTTTTACGGTTTCTGGTTTAGCTGTAAAAGTTTTCATTAAAAGCTTACCCAATAAATAGTTACACGTTGGTGAACACCCAAACGTCTTCAATTGTTGAGGTTCACACGACAAAGTCCGGCAAACCTACCCCTTCGAATAGCCTATGCCAGCACACAAAAAGTTTTGGGAAAAAAACTTTCTTGTAACGTGGGGTCGCAGGATTATAGAGAAGTCGGGGTCAAAGATCGACCCCTTTTTGTGATTTGTGACAGGTTTTAACCCGCCAAATGCTCGCGCTTCAGATACTCTTCG 
+>gi|49175990|ref|NC_000913.2|:4614980-4615298 +TTCTGCTTCCAGTGCCAGAAAATGGCGCTTCTGCTCCGGGCTAAGCACTGGGCTGGTGACAATTTGCTGGCAACGTTGTTGCAGTGCATTTTCATGAGAAGTGGGCATCTTCTTTTCCTTTTATGCCGAAGGTGATGCGCCATTGTAAGAAGTTTCGTGATGTTCACTTTGATCCTGATGCGTTTGCCACCACTGACGCATTCATTTGAAAGTGAATTATTTGAACCAGATCGCATTACAGTGATGCAAACTTGTAAGTAGATTTCCTTAATTGTGATGTGTATCGAAGTGTGTTGCGGAGTAGATGTTAGAATACTA +>gi|49175990|ref|NC_000913.2|:1298536-1299319 +CAGTTATCAAAAGCAAAAGGAATAGGTAAAAATATTCTTCTCAAATTACAGTTAGTTATAAGGATTTCCTTAACTGCTTCTCCTCACCATCATGTTATTTTCGCCACATCATAATCCTGGGCTTGCTGAAGAATAATTGAAATGATATTATTAATTCCACTGCCTTTGGTAGAGGAAAGTGCTAAATAATAATCAATTGTTAAATTATTGTGCATTTCACTACTGGAACTGTAATCAGAAAAGATAGACATGCTTAGCCAATCTCTATTTGATTGAATTGAAAGATGTTTGTTAAGGCATGGATGCAAGCTATAGATTCTGATACGGTCAATAAAAGAGAATTGCTTAACAATTTTGCAAAATGTATTGGCGAGTAAGAACCGCATTTGGTACTTTCCGGGCAACCGCCAGACGATTCTTTATTGGTAATGAGAATAATTAACAATTAAAGAGCGTCGCGAAAGAATAATGTGTCTCGACAGGGGAGACACAGTACGAATCGACATAAGGTGATCGTCTGAATCACCAGAATAAATAAAGTCGGTGATAGTAATACGTAACGATAAAGTAACCTGACAGCAGAAAGTCTCCGAGCCTGTGCAGGGTCCCAATCCGGGATTACACATGCTGGTTAATACCAGTAATTATAATGAGGGAGTCCAAAAAACAATGACCAACATCACCAAGAGAAGTTTAGTAGCAGCTGGCGTTCTGGCTGCGCTAATGGCAGGGAATGTCGCGCTGGCAGCTGATGTACCCGCAGGCGTCACACTGGCGGAAAAA +>gi|49175990|ref|NC_000913.2|:3445839-3446394 +TTCTCACTACGAAGCTCACGAGTTACCGGCCCAAAAATACGCGTACCGATAGGCTGCTCGCTGTTGTTGTTCAGAAGAACACAAGCATTACCATCGAAGCGAATGACAGAACCGTCCGGGCGACGAACACCCTTCTTGGTGCGCACCACTACCGCCTTCAGCACATCACCTTTTTTGACCTTACCACGCGGAATTGCTTCTTTGATGGTGATCTTGATGATGTCGCCTACGCCTGCGTAGCGACGGTGCGAGCCACCCAGAACCTTGATACACATTACGCGACGTGCACCGGAGTTGTCGGCGACGTTCAGCATAGTCTGTTCTTGGATCATTTTAGTGCTCCGCTAATGTCAACTACTACTGAGACCCGAAAATCAGGTCGTTAAAAATCCCCATATCGAGGGCGCGGCATTATAACACCGCTTCAAGGATATGGGTAGAAAAAATAAACGGCTCATTTCTGAGCCGTTTATTCGTATTGAGAGAGTGTACTGTATTACAGAACCGCTTTCTCTACAACGCGAACCAGCGTCCAGGATTTAGTCTTGGACAGCG +>gi|49175990|ref|NC_000913.2|:4470356-4470629 
+AGGTCATCGCGACTAAGGTCGTTTATGGAAATGATATGTTTCTGATATAGCGGATTAGCCATCTTTTATCTCCTGACGCCTGGGCAAAAAAAAGCCCCTCGATTGAGGGGCTGGGAATGGGTGATCAACGGGAAGAAAAACGGCAGGCCAGCGTCTTTTTTCAGACGCGGTAAGACAAAATGTCGAACACACTGAACCATACATCCTCCCGGCAAATTGTCCGGCATTATACTCATCGTCAGAAGCGGCGCAAGCATTTGATGCAATATTTTG +>gi|49175990|ref|NC_000913.2|:915311-915530 +GTTTTATAGGGTCGTCGTAAAAAAGTGACGACGGAAATAATGCGCGGCTATTTTAAAAACGAAGGCGAGTCATTCACCAGATAAATAAATCCAGTAAATTTGATTTAGGGCAACAGCGGGTTGCCCCATATAGTCATTTGTCTGATTGACAGTGTAGTGCACGCAAAAGATTTAATCCTTTAGGCGTAATAAAAAATAATTTATCATGCTAATTATTTG +>gi|49175990|ref|NC_000913.2|:3809551-3809914 +AGGGTGACAAAACGCTTCTCGCTCTCAACCCAGAAACGGTGAGAGTGCAGGTTCGGCAGGAAACGGCGTTTAGTCGCGTTCAGTGCGTGGGAACGGTTGTTACCGGTCACCGGACGCTTGCCAGTAACTTGGCAGACTCGGGACATGTCTATTCTCCAAAAATCAAATTAGCTCGAGCTTCGTATGGGGTATTGGCGCCTCGTCAGGCTTTACAGCCCGGTCATCGCAGTTCTATGTGAACTCTCGATTGCCAGGCCCAAATGCCAAACCCGAGATTCTCAAAGGTGGCGTAGTATACGCTGACTCAGCGATGTGCTCAAGTCCCGAACAGACAAAGATCCCGAAGGATCGCGCATAGCGGGT +>gi|49175990|ref|NC_000913.2|:2744080-2744389 +CGATTGGGTTGAAGAAACCAACGCGCTCGATGAAGCGACCGTTGCGTGCATTACGGCTGTCAGCGACAACAACCTGGTAGAACGGACGCTTTTTAGCGCCGTGACGTGCTAAACGAATAGTTACCATAACATCCTCTTGTGTGAATAAAACAACCGGACCCCATCGAGGAACGGAGTCCGGTGTCATATTAAAAGCCCGAAAATTTTACTCATTTTTGCGGGAATTGCAATCAACAGTTGCTAACTCTGCTGTAAAAGGCCGTCGGCGGTGCAGCCAGTTTGGTGCCGGAGTGCGCGCAGTCACCGGAG +>gi|49175990|ref|NC_000913.2|:1739264-1739554 +ACAGGATCGTTCCCGACTCACTATGGATAGTCATTTCGGCAAGGGTTCCTCCTTTCCCTCTGTTCTACGTCGGATTATAGACTCGCGGTTTTTTCTGCGAGATTTCTCACAAAGCCCAAAAAGCGTCTACGCTGTTTTAAGGTTCTGATCACCGACCAGTGATGGAGAAACTATGAGTTCATCGTGTATAGAAGAAGTCAGTGTACCGGATGACAACTGGTACCGTATCGCCAACGAATTACTTAGCCGTGCCGGTATAGCCATTAACGGTTCTGCCCCGGCGGATATTC +>gi|49175990|ref|NC_000913.2|:656409-656762 
+TTTTATCACTTTTTAGTAAAGTTACACTGGACAAAGCGTACCACAATTGGTGTACTGGTAACCGACACAGCATTTGTGTCTATTTTTCATGTAAAGGTAATTTTGATGTCTAAGATTAAAGGTAACGTTAAGTGGTTTAATGAGTCCAAAGGATTCGGTTTCATTACTCCGGAAGACGGCAGCAAAGACGTGTTCGTACACTTCTCTGCAATCCAGACTAATGGTTTTAAAACTCTTGCTGAAGGTCAGCGCGTAGAGTTCGAAATCACTAACGGTGCCAAAGGCCCTTCTGCTGCAAACGTAATCGCTCTGTAAGATACGTCAGCAAGAATTCAAAACCCGCTTAATCAGCG +>gi|49175990|ref|NC_000913.2|:655085-655334 +GGCCACCGACAAATAACACACCAGTGGCGGAATACCCTTTAATGATGTAGCGAGCTACACCCACAATAACCACAACCCCAATAAGGAGCTCAATGAATGTCAGCATAATTTTTCCTGTCTCCAGGCCCCAAAGTAAATAATAAAAAATTCCTAAAGCTTAAGGAAAAAATATGCCCAATAAATTGGCGATGAATGCTGATTAAAATCAAGAAAAACTGCCATTAAGACATTGAAGTTGCTGTTTTTATA +>gi|49175990|ref|NC_000913.2|:1018801-1019385 +GGGTAGCGATTTCAGGAGTGATCGCGTACTCAACACCGCCAGCGAAGACCGGAGAAACGCCGGTGTCGTGGTTTTTACCATAAACGTTGGATTTAGTGTCTGCACGCCATACCATGCCACCCAGACGAGTGTAGATGTCCAGGTCGTCAGTGATTGGGTAACCCAGTTTAGCGGTCAGTTGAACGCCCTGAGCTTTGTATGCACCGTTTTCAACGCTGCCTTTGTACGGCATACGACCTAACCAGTCGTAACCCATTTCAAAGCCAACATACGGGTTAACCTGGTAACCACCAAAAGCACCAGCGCCCAGTTGGTTTTCATGGGTCGGGCCATTGTTGTTGATGAAACCAGTGTCATGGTACTGGGACCAGCCCAGTTTAGCACCAGTGTACCAGGTGTTATCTTTCGGAGCGGCCTGCGCTACGGTAGCGAAACCAGCCAGTGCCACTGCAATCGCGATAGCTGTCTTTTTCATTTTTTGCGCCTCGTTATCATCCAAAATACGCCATGAATATCTCCAACGAGATAACACGGTTAAATCCTTCACCGGGGGATCTGCTCAATATTAACTCTACCGATATCTT +>gi|49175990|ref|NC_000913.2|:3439903-3440762 
+CGTCAGCGCAACGCTCTGCTGCAACCTGAGCTGCAAACGGAGTGGATTTGCGAGAACCACGGAAACCGGAACCACCGGCTGTTGCCCAACCCAACGCGTTACCCTGACGATCAGTGATAGTCACGATGGTGTTGTTGAAAGAAGCATGGATATGAGCCACGCCGTCAGAGACTTGTTTTCTTACACGTTTACGTGCACGAATTGGTGCCTTTGCCATTATTCAATCACCCCGATTATTTCTTGATCGGTTTGCGCGGACCCTTACGGGTACGTGCGTTGGTCTTGGTACGCTGACCGCGAACCGGGAGACCACGACGATGACGCAAACCGCGATAGCAACCAAGATCCATCAGGCGCTTGATGCTCATGCTGATTTCACGGCGCAGATCACCTTCAACGACAAATTTGGCAACTTCGTCACGCAGCGTGTCGATTTGTCCTTCAGACAGCTCACTGATCTTAACATCTTCAGCGATACCCGCTGCAGCCAGGATGGCTTTAGAACGGGTCTTGCCGACGCCATAAATCGAAGTTAATGCGATTACGGCATGCTTATGATCAGGAATGTTAATGCCTGCTATACGGGCCACTATGCACTCCTACTATTTAATATGTACGTTCCATGCTGAAAAGCCCGTTTTCAGGATACTCAAATGGAAACGCACAGACATACAAAAGATTGGCTGGCTAATCTAGCCAGCTCAACCCAACTTTGCAAGAAAAATATGCGAAAAAATCAGCCTTGGCGCTGTTTATGCTTCGGCTCGGCACTGCAAATCACACGGATGACACCATCACGCTTAACGATTTTGCAGTTACGGCATAATTTCTTGACGGAAGCACGAACTTTCATTTTTAC +>gi|49175990|ref|NC_000913.2|:3964083-3964507 +AAAGAGTTCCTCGACGCTAACCTGGCGTAAGGGAATTTCATGTTCGGGTGCCCCGTCGCTAAAAACTGGACGCCCGGCGTGAGTCATGCTAACTTAGTGTTGACTTCGTATTAAACATACCTTATTAAGTTTGAATCTTGTAATTTCCAACGCTTCCCGTTTTATCTTAAATGCGAAGTGAACAGATTTCTGGCTCGTCACTCAATCCGTCTTGTCGTTTCAGTTCTGCGTACTCTCCTGTGACCAGGCAGCGAAAAGACATGAGTCGATGACCGTAAACAGGCATGGATGATCCTGCCATACCATTCACAACATTAAGTTCGAGATTTACCCCAAGTTTAAGAACTCACACCACTATGAATCTTACCGAATTAAAGAATACGCCGGTTTCTGAGCTGATCACTCTCGGCGAAAATATGGGGCT +>gi|49175990|ref|NC_000913.2|:3472497-3472789 +GCTTCCAGCGCAGGCACGTTGCTTTTCGCAACTTTGCGAGCACGTGGTTTGCGTACCAGCTGGTTAACTGTTGCCATTAAATAGCTCCTGGTTTTAGCTTTTGCTTCGTAAACACGTAATAAAACGTCCTCACACAATATGAGGACGCCGAATTTTAGGGCGATGCCGAAAAGGTGTCAAGAAATATACAACGATCCCGCCATCACCAGGCCATCTGGCTGGGGTGCTTAACCGTAAGTCTGACGAAATCAGTATAGTCAATGAGAATGATGTCGTTCGAAATTTGACCAGT +>gi|49175990|ref|NC_000913.2|:4494324-4494582 +AATGCTGAAAATTTCAGCACTTAGCGAGGTGCGAGCAAGCTGGCGCTTGCATGGTGGCGTGCGACAGGTATAATCCACAACGTTTTCCGCATACCTCTTCAGTGCCGAAGTGGCGAAATCGGTAGACGCAGTTGATTCAAAATCAACCGTAGAAATACGTGCCGGTTCGAGTCCGGCCTTCGGCACCAAAAGTATGTAAATAGACCTCAACTGAGGTCTTTTTTTATGCCTGAAATCCAGTGTTTATCTATCTTTCCC 
+>gi|49175990|ref|NC_000913.2|:3944974-3945201 +TAATTAGGGGCGTAGTTCAATTGGTAGAGCACCGGTCTCCAAAACCGGGTGTTGGGAGTTCGAGTCTCTCCGCCCCTGCCAGAAATCATCCTTAGCGAAAGCTAAGGATTTTTTTTATCTGAAATAACCCTCTCCGAAGTAAATCCTTCTACCGGCATCCTTGCCAGCCATTCATATTAATACACTTCATCCAGCACGTTAATTTTCAAAAGATCGCGAATCAACGC +>gi|49175990|ref|NC_000913.2|:2685336-2685557 +CGGTCACCGTCATGCCGGTAATACCGACTTCGGCCAGTGCTTCGCGGACATCGTCCAGCTTGAAGGGTTTTATAATCGCATCAATCTTTTTCATGCTATTCCTTGAAAAGGTCGCCTGTCTTTTGATCTGCTAAACGTAACACATAACGCCAATTCATTCCTTGAAATCGTTTGCATCCAGCTCGTGTCGGGAAAGCAGTTTATAAAATTCTGTCCGGTTG +>gi|49175990|ref|NC_000913.2|:1797676-1798911 +GGTATACGCGAGAACGCGCACCGTAGTAGCCTTTAGCTTGTTTCAAAATTTTCTTGTGACGTGCACGTGCAATAACACCACGTTTTACGCGAGCCATATGTGCTCTCCTGTATCTATATTCTAATTAAAAAGTTAAAAACGTTAACGGCTTATGCGTACGGCAGGCACGCGATTACCAGGCCCAGATCGCCTTTGGAAACCATGGCTTTCGGACGCAGGTGACGTTTACGTTTGGTCGCTTTTTTGGTCAGAATGTGACGCAGGTTAGCGTGCTTGTGCTTAAAACCACCTTTACCGGTTTTTTTGAAGCGCTTAGCAGCACCGCGTACGGTCTTAATTTTTGGCATTTTAATAACTTCCACTTCGCATTGTTAATAAACGAAACAAAGGCGAACAAAGCCTGTGAAGCCCGAAGGCTCCACAGACAGTGCTACTTGAAGGCCTTACTGTTTCTTCTTAGGAGCGAGCACCATGATCATCTGGCGGCCTTCGATCTTCGTTGGGAAGGATTCGACCACTGCCAGTTCTTGCAAATCGTCTTTCACGCGATTAAGCACTTCCATACCGATTTGCTGGTGCGCCATCTCACGACCGCGGAAACGCAGCGTGATTTTGGCTTTATCACCCTCTTCGAGAAAGCGAATCAGGCTGCGGAGTTTTACCTGATAGTCGCCTTCATCTGTACCAGGACGGAATTTAATTTCCTTAACCTGGATAACTTTTTGCTTTTTCTTCTGTTCCTTAGAAGACTTGCTCTTTTCATAGAGGAATTTGCCGTAATCCATTATACGACAAACCGGCGGCTCGGCGTTAGGGCTGATCTCGACTAAGTCTACTCCGGCTTCTTCTGCTTTCTCCAGAGCTTCTCTCAGACTCACAATACCAAGCTGCTCGCCTTCCAGACCTGTTAAGCGAACTTCCTGGGCGCGAATTTCGCCATTGATACGGTTAGGGCGCGCCGTTTGAACTCGTTTTCCGCCTTTAATACCTTATTCCTCCAATTGTTTAAGACTGCGGCTGCGAATCTCTTGTTGCAGCTTCTCGATCACTTCATTTACGTCCATGCTTCCCAGGTCTTTACCACGGCGGGTGCGAACGGCAACTTTGCCTGATTCCACCTCTTTATCACCACAGACCAGCATATATGGGACGCGACGCAAAGTGTGCTCGCGGATTTTAAAGCCAATCTTCTCATTTCTCAAGTCTGCTTTAACACGAATGCCCGCATTTGATAG +>gi|49175990|ref|NC_000913.2|:3084545-3084867 
+AAAAAAGTGGAAATAGGGTGAAGAATTGACCTAAAATAGCCATCCAGATGTTAATCCATCCATACCGATTAACACTCAGACTGCCAGTGTTTTTAACCTGCAGAGTCGTGGTAGGATCCGCTACCACAGAAAATCCACACAACAGTTTGAGCTAACCAAATTCTCTTTAGGTGATATTAAATATGGCAAAACACCTTTTTACGTCCGAGTCCGTCTCTGAAGGGCATCCTGACAAAATTGCTGACCAAATTTCTGATGCCGTTTTAGACGCGATCCTCGAACAGGATCCGAAAGCACGCGTTGCTTGCGAAACCTACGTAAA +>gi|49175990|ref|NC_000913.2|:2626953-2627159 +GTTTCATAGATGCTCAGCAGAATCCCCCACATCCTGAAGGAGGTGTATTCAGACAGGCATCCCACCTGACTTCGAATGATGATTATTCATCACTATAGAGAGCATTGATTCTAAGTGTCATATGAAAGTACCAATTGATATATATCAAACAAAATAACCCTGATTAATGAATTATTACGTTTATCATGTTAATTCATCATTATTAC +>gi|49175990|ref|NC_000913.2|:3881690-3882661 +TCTGTGGCTGGTAACTCATCCTGCAATCGGGCAAGACACTGCTGCCAAAGCGAAAGTGACACGGCGGACTCCACTCGAACAAAAGTCGATAATGACTAAGGCTGAAACATTCATGATTGTTGACGTACGTCGAAAAGACCCTGCTTGAGGGTGACGCACGAACCGCTGTCTGCGGTTATATGCCCGATCAAGATCCTGCAAAACGATCGGGACCGCGGATCATAGCCTAAACTGCGCAAGAGATCTTCTGTTTCTCACAGATTTTCCCGATTTATCCACAGGACTTTCCAGAACTCGCTAAGTGTAAACGATCCTGCCGCGAGGCGGGCACGATTTACGCCGCATATTGGAAAATTTAATGACCATAGACAAAAATTGGCTTAATCGATCTAATAAAGATCCAGGACGATCCTTGCGCTTTACCCATCAGCCCGTATAATCCTCCACCCGGCGCGCCATGCTGGTTTCCACTGGTGTGAGGTCGTACATTTTCCCTGCGAAAAGGTGCGGAAAAGCGCGGTAAATAAGGAAAGAGAATTGACTCCGGAGTGTACAATTATTACAATCCGGCCTCTTTAATCACCCATGGCTTCGGTGTCCATCGTTTCATTTTTCGGCGGATATCCAATAAAGCCATTGAATTTATTCAAGTTTAGGTAGAAATCGCCATGAAACGCACTTTTCAACCGTCTGTACTGAAGCGCAACCGTTCTCACGGCTTCCGTGCTCGTATGGCTACTAAAAATGGTCGTCAGGTTCTGGCACGTCGTCGTGCTAAAGGCCGCGCTCGTCTGACCGTTTCTAAGTAATAAAGCTAACCCCTGAGTGGTTAAGCTCGCATTTCCCAGGGAGTTACGCTTGTTAACTCCCAGTCAATTCACATTCGTCTTCCAGCAGCCACAACGGGCTGGCACGCCGCAAATTACCATTCTCGGCCGCCTGAATTCGCTGGGGCATCCCCGTATCGGTCT +>gi|49175990|ref|NC_000913.2|:2996905-2997189 +TGAAAATAAGGTGTTTTACCTGGGTTGTTACAAAAGGATTGCATTGCGTAAACGCTTTTTATTTACAACAAAATGGGGAAGTATTACGGCGAAGATAAATTGGAGCGGGCGAAGGGAATCGAACCCTCGTATAGAGCTTGGGAAGCTCTCGTTCTACCATTGAACTACGCCCGCTTCGAGATGCGTAAGGCATTATAAACCTTACGCTCTCCTTAGCAAGTGCCACGCTGCTGACTGCTGATTAATTCGCCATCAGCATTTTGGCTTGCTGCCCTGAGGCGGCA +>gi|49175990|ref|NC_000913.2|:4109440-4109715 
+CAGCTCTTTACGCAGGTTAGAAACCAGCTCGTGAACCATGTGGCGGCTGCCGTTCAGTTTCCAGTTACCCATCACTAAAGGATGTCGCATTTTAATTCTCCACGCTTATAAGCGAATAAAGGAAGATGGCCGCCCCGCAGGGCAGCAGGTCTGTGAAACAGTATAGAGATTCATCGGCACAAAGGCTTTGCTTTTTGTCATTTATTCAAACCTTCAAGCGATTCAGATAGCGCCAGCTTAATCGGTTCAACAGCGAAGGTCAGCCCCTTTTCGCC +>gi|49175990|ref|NC_000913.2|:3331011-3331272 +TGGTACCACGTTGACGAACGATGATGCTACCCGCCAGAACGGATTCGCCACCGAAACGCTTAACGCCCAGGCGTTTAGCTTCTGAATCGCGACCGTTACGTGTGGAGCCGCCAGCCTTTTTATGTGCCATTTGAAATCTCTCCTCAGGTCTTAGGCGCTGATGCCAGTAATTTTCACATCAGTGAACCACTGACGATGGCCCTGCTGCTTACGATAGTGTTTACGACGACGAAACTTAACGATTTTAACTTTCTCGCCACG +>gi|49175990|ref|NC_000913.2|:393584-393785 +TGGTGGTGACGTAGATCAAGACTGGTCCGCGAACGTGGGTGTTAAATATACCTGGTAATATTCTTCACTCCGAAGAAATACTGGTAATTTAATCTAAATAATGCCCGTCAAGGATTTGACGGGCATTACTGCAAAGGACGCGCAAATGTTATCTGTAGTTAAACCTCTTCAGGAATTTGGTAAGCTCGATAAATGTTTGTC +>gi|49175990|ref|NC_000913.2|:3054751-3056464 +GAAGAGTGGGATATCCCTCTTCCTGCGGTGGTTACACCGTCGAAAGTCTGGGAGTGGTAAGGGCGATACACCCGCATCGCCCTGATTGACATCGTTGATTCTTTGACCTAATTTAGTGAGTAAGGGTAAGGGAGGATTGCTCCTCCCCTGAGACTGACTGTTAATAAGCGCTGAAACTTATGAGTAACAGTACAATCAGTATGATGACAAGTCGCATCATAACCCTTCTCCTTCAAGCCCTCGCTTCGGTGAGGGCTTTACCGTTACAGCCCCATGCTGCCCTGCCATCGTAAATCCCCATTAAATAAACACAACGCATTGATCTGACTTTGATTTATTTTCTGGAGCAGACTCGCAAAGTAGAATGCGCAACGCGGCAACGGTGTGGAGAAGGGATAAAAAAACGGGCAAGTCAGTGACCTGCCCGTTGATTTTCAGAGAAGGGGAATTAGTACAGCAGACGGGCGCGAATGGTACCCGGAATAGCTTTCATTGCCTGCAGCGCTTTTTCGGCAACGTCTTCGTCGGCTTCAATATCAATAACCACATAACCCATCTGGGCGGAAGTTTGCAGATATTGCGCGGCGATGTTGACGCCCTGCTCGGCGAAGATTTTGTTCAGCGCAGTTAGCACGCCCGGACGGTTTTCGTGGATGTGCATCAGACGACGCCCACCGTGCAGTGGCAGCGAGACTTCCGGGAAGTTCACCGCAGAGAGCGTTGAGCCATTGTCAGAATACTTGATCAATTTACCCGCAACTTCCAGGCCGATATTCTCCTGCGCTTCCTGAGTCGAACCGCCAATGTGTGGCGTCAGAAGGACGTTGTCGAATTCACACAGCGGAGAGGTAAATGGATCGCTATTGGTCGCCGGTTCCGTCGGGAATACGTCGATTGCCGCCCCCGCCAGATGTTTGCTCGCCAGCGCATCACACAGCGCCGGAATATCCACCACAGTACCGCGCGAAGCATTAATCAGCAGCGAGCCGGGCTTCATTAGTGAAATTTCTTTCGCGCCCATCATATTTTTGGTGGACGGATTCTCTGGTACATGCAGACTCACCACATCGCTCATATTCAGCAGGTCAGAAAGATGCTGTACCTGAGTGGCGTTGCC
CAGCGGCAGTTTATTTTCAATATCATAAAAGTAAACATACATTCCCAGCGATTCAGCCAGAATGCCCAATTGCGTACCAATATGACCGTAGCCGATGATACCCAGCTTTTTGCCGCGCGCTTCAAAAGAACCCGCCGCCAGTTTGTTCCACACGCCACGGTGCGCTTTAGCATTGGCTTCCGGCACGCCGCGCAATAGCAGCAGCAGTTCGCCAATCACCAGCTCCGCAACAGAGCGCGTATTTGAGAACGGTGCGTTAAATACCGGGATCCCGCGCTTTGCCGCCGCATCCAGATCAACCTGGTTTGTTCCGATACAGAAACAGCCAATAGCGACCAGTTTTTCTGCGGCGTTGATCACGTCTTCAGTCAGATGGGTACGGGATCGCAGGCCGATGAAGTGGGCATCGCGGATGGATTCTTTTAATTGTTCATCATCCAGCGCGCCTTTGTGAAATTCGATGTTGGTGTAACCAGCTGCACGAAGGCTTTCCAGCGCCTTTTGGTGCACGCCTTCTACCAGCAGAAACTTAATCTTGTCTTTCTCCAGCGATACCTTTGCCATTTACCCAATCCTGTCTTTTGAAATGTTGTGTG +>gi|49175990|ref|NC_000913.2|:1262763-1263159 +TTAATGCTAGCGTTACCGTCCGCTATCGTCTATGTTCAAGTTGTCTTAATTGCCAGAATCTAACGGCTTTCGGCAATTACTCCAAAAGGGGGCGCTCTCTTTTATTGATCTTACGCATCCTGTATGATGCAAGCAGACTAACCCTATCAACGTTGGTATTATTTCCCGCAGACATGACCCTTTTAGCACTCGGTATCAACCATAAAACGGCACCTGTATCGCTGCGAGAACGTGTATCGTTTTCGCCGGATAAGCTCGATCAGGCGCTTGACAGCCTGCTTGCGCAGCCGATGGTGCAGGGCGGCGTGGTGCTGTCGACGTGCAACCGCACGGAACTTTATCTTAGCGTTGAAGAGCAGGACAACCTGCAAGAGGCGTTAATCCGCTGGCTTTGCG +>gi|49175990|ref|NC_000913.2|:3276689-3276922 +TAAGTCCTTTCGTAAAACTTTCGTTTCATTTCGTTTTGCCTATTAACGCCTTTCTATTAAGCAAATGCAAGCCCACCTTGCCCATTGACGCAAGCTACTCTCGTTTCAGTGACTTTCATTATGTTTCTTTTGTGAATCAGATCAGAAAACCATTATCTTTCGTTTTATTTTTATCTCACCATGACGCAGTATCAACTGAAACAAAACGAAAGATTAATATCGCAGTAATCTGA +>gi|49175990|ref|NC_000913.2|:4010711-4011865 
+CGTTGTTCCAGATCGCTAAACTGGTGAGACAGGGCGGATTGCGTCTGATGCAACGTCGCCGCAGCGGCTGCGAGCGAGCCGCAGTTCCGCAACGCTTGTAGCGTTTTCAGGTGTTTTACTTCGATCATGAAAGTCCTTCACTTCGGCATGAATAATTTGCGCTTGAGGAATATACAGTAACCGCCAATTATGGATGTGTAAACATCTGGACGGCTAAAATCCTTCGTCTTTTAAATTTATGGTGCGTTGGCTGCGTTTCTCCACCCCGGTCACTTACTTCAGTAAGCTCCCGGGGATGAATAAACTTGCCGCCTTCCCTAAATTCAAAATCCATAGGATTTACATATAATTAGAGGAAGAAAAAATGACAATATTGAATCACACCCTCGGTTTCCCTCGCGTTGGCCTGCGTCGCGAGCTGAAAAAAGCGCAAGAAAGTTATTGGGCGGGGAACTCCACGCGTGAAGAACTGCTGGCGGTAGGGCGTGAATTGCGTGCTCGTCACTGGGATCAACAAAAGCAAGCGGGTATCGACCTGCTGCCGGTGGGCGATTTTGCCTGGTACGATCATGTACTGACCACCAGTCTGCTGCTGGGTAACGTTCCGGCGCGTCATCAGAACAAAGATGGTTCGGTAGATATCGACACCCTGTTCCGTATTGGTCGTGGACGTGCGCCGACTGGCGAACCTGCGGCGGCAGCGGAAATGACCAAATGGTTTAACACCAACTATCACTACATGGTGCCGGAGTTCGTTAAAGGCCAACAGTTCAAACTGACCTGGACGCAGCTGCTGGACGAAGTGGACGAGGCGCTGGCGCTGGGCCACAAGGTGAAACCTGTGCTGCTGGGGCCGGTTACCTGGCTGTGGCTGGGGAAAGTGAAAGGTGAACAATTTGACCGCCTGAGCCTGCTGAACGACATTCTGCCGGTTTATCAGCAAGTGCTGGCAGAACTGGCGAAACGCGGCATCGAGTGGGTACAGATTGATGAACCCGCGCTGGTACTGGAACTACCACAGGCGTGGCTGGACGCATACAAACCCGCTTACGACGCGCTCCAGGGACAGGTGAAACTGCTGCTGACCACCTATTTTGAAGGCGTAACGCCAAATCTCGACACGATTACTGCGCTGCCTGTTCAGGGTCTGCATG +>gi|49175990|ref|NC_000913.2|:3443378-3443696 +GTTCAGCGACAGCTTTACCCACAGCTGCAGCCGCGTCTTTGTTACCGGTGTACTTCAGTTGTTCAGCGATAGCTTTTTCTACAGTAGAAGCAGCTACCAGAACTTCAGAACCGTTCGGTGCAATTACCTGTGCGTAAATGTGACGCGGGGTACGATGTACCACCAGGCGAGTTGCGCCCAGCTCCTGGAGCTTGCGGCGTGCGCGGGTCGCACGACGGATACGAGCAGATTTCTTATCCATAGTGTTACCTTACTTCTTCTTAGCCTCTTTGGTACGCACGACTTCGTCGGCGTAACGAACACCCTTGCCTTTATAAG +>gi|49175990|ref|NC_000913.2|:4603625-4603954 +GCGGCTGGAGATGCGTTCGCCAGATGACATCTTCATAGAGCGGTGCGGAACGATAGGCCATAATCGGGATAGTAATCTAAATGATAATGATTGCTAATCATAGCGATAGGTTTACCCGATAGCAAGGGATTTATCTGGCTTGCAAATGATAAAAATTATCATATGATATTGGTTATCATTATCAATGAAAGAGATGAAATCATGTTGCAACGTACGCTGGGCAGTGGCTGGGGAGTGTTGCTGCCGGGATTGCTGATTGCAGGGCTGATGTATGCGGATTTATCGTCAGATCAGTGGCGGATTGTCATTCTGATGGGATTAGTATTGAC +>gi|49175990|ref|NC_000913.2|:1744443-1744688 
+GTCACCTACAACGTTGCGTTCATAGCTCAGTTGGTTAGAGCACCACCTTGACATGGTGGGGGTCGTTGGTTCGAGTCCAATTGAACGCACCATCCTGCGTCCGTAGCTCAGTTGGTTAGAGCACCACCTTGACATGGTGGGGGTCGGTGGTTCGAGTCCACTCGGACGCACCAGATTTTCTTAATCTGGTCTTCTCCTTTTTCCCTCTGTTTCTTCTCTGTATCCAATACGTTAAAAGATTTACA +>gi|49175990|ref|NC_000913.2|:3331327-3331707 +ATTTTGACTTCTTCACCGTTTGCGATCATCAGCACTTCAGCGAACTCAACAGTTTCGCCAGTTGCGATGTCCAGCTTTTCCAGGCGAACGGTCTGACCTTCGCTTACTCGGTGTTGTTTACCACCACTTTGGAAAACCGCGTACATATAAAACTCCGCTTCCGCGCACACCTTTTCAATGATTCAGAGTGCGCTATAAATATTCACAATAGGGCGCGAATATTACGCAAAACGCACGCCTTTGACAAGTGCTACAGTCAATACACGAAGAAAAAAAACACAACTTGTACGGTAACGTTTATCTGTGCCATTTTTTCAGTACAATCACCCTATATTCCTAACCATAAACCCTAAGTTGCCTTTGTTCACAGTAAGGTAATC +>gi|49175990|ref|NC_000913.2|:3441300-3442325 +CCCCGCCATATTCACTTTCAGCGGTAAATGTGTGCTCTGTGCAGCATAGACACGACGACCTTGCTGACGTTTCGCGTAGTTTACCACAATGCGGCGTTGACCACGCTCAACAAATACAACAAAGAACGTCACTGCAAATACTAATACTGCAACCAACAGCAACACGAGGAAGTGCAGGTCGCCTTGACGCGCTTGCTCGATAGTATGGGCAATGGCTGGCGGGAGTCCCGCGACAATACCGGCGAAGATAATGATTGAAATACCGTTGCCGATACCTCGTTCAGTAATCTGTTCGCCCAACCACATCAGGAACATGGTTCCTGTGACCAGACTTACAACAGCGGTGAAGTAGAATGCAAAGCCCGGGTTAATCACCAGGCCTTGCATACCAGGCATATTCGGCAGACCGGTAGCAATACCGATCGACTGGAATATTGCCAGCACCAGAGTACCGTAGCGGGTGTACTGGCTGATCTTACGACGACCAGACTCCCCTTCTTTCTTAATTTCTGCCAACGTTGGGTGAACCACCGTCAGCAGCTGGATAATGATCGACGCCGAAATATACGGCATGATCCCCAGAGCAAAGATAGAAGCACGGCTGAGAGCACCACCAGAGAACATGTTAAACATCTCAATGATGGTGCCTCGCTGTTGCTCAAGCAGTTTGGCAAGTACAGCGGCATCAATACCAGGGATCGGAATAAAAGAGCCAATACGGAACACAATCAGCGCACCGATAACAAACAGCAGTCTGCGTTTCAGCTCGCCTAAGCCACCTTTGGCACTTTGAAAATCTAATCCCGGTTGTTTAGCCATCTGCTACTTATTCCTCGATTTTACCGCCAGCAGCTTCGATAGCAGCACGAGCGCCTTTAGTAACACGCAGGCCACGAACAGTTACCGGAGTCGTTACTTCGCCAGCCAGGATCACTTTCGCGAACTCGATCTGGATACCGATAATGTTAGCCGCTTTCAGCGTGTTCAGGTCTACTACACCGCCTTCTACTTTAGCCAGGTCAGAC +>gi|49175990|ref|NC_000913.2|:2174177-2174462 
+AACCGCCAAATTGATTGACCTGGTTTGACGTTGCTTCAATCAGCACTTTGCGCGTGCTGTTGCGATCAAATGCCAGCGCCGCTTCGATAACCAACGGATGGGCAGAACAGACTGAACATATGCCGATATGTTCACCAGCTTTATGCCGGGCAATTAACGTTTTCATGTTTTTTCCTTGTTAAATGGCAGGTGCGTTATGCCCTGCCCTCGCAGCCACAATCGGCAATCACTTTGCTCACCACATCGCGCATTGCGGATTTAGCCGACTGCAAATAATCCCGGGGA +>gi|49175990|ref|NC_000913.2|:4105434-4105698 +ATACCGCCATTTGGCCTGACCTGAATCAATTCAGCAGGAAGTGATTGTTATACTATTTGCACATTCGTTGGATCACTTCGATGTGCAAGAAGACTTCCGGCAACAGATTTCATTTTGCATTCCAAAGTTCAGAGGTAGTCATGATTAAGAAAATCGGTGTGTTGACAAGCGGCGGTGATGCGCCAGGCATGAACGCCGCAATTCGCGGGGTTGTTCGTTCTGCGCTGACAGAAGGTCTGGAAGTAATGGGTATTTATGACGGCT +>gi|49175990|ref|NC_000913.2|:938441-938776 +AAATAGCACAAACACGCTATTATTTCCCGACAAACAGGGGCCTTGAAGGCAAGATTGGCGAAAAGCTCGCCTGGCTGGCTGAACAGGATCAAAATAGCCCCATAAAACGCTACCGTTAATGTTATCGTTGCGGTAATGTTGTTACTGTATCCCTGTGGTCGCAGGCTGTGGCCACATCTCCCATTTAATTCGATAAGCACAGGATAAGCATGCTCGATCCCAATCTGCTGCGTAATGAGCCAGACGCAGTCGCTGAAAAACTGGCACGCCGGGGCTTTAAGCTGGATGTAGATAAGCTGGGCGCTCTTGAAGAGCGTCGTAAAGTATTGCAGGTC +>gi|49175990|ref|NC_000913.2|:1702375-1702809 +CTTTACCCACGCTCAACAGTTTAATAACCTGCCAGCAATAAGGGATGTTGTTTAACTTAAGTCAAAAAAATAGCGAATTTTCCAACGACAAAAGCTAAATATCGCAAAAACCTCAGTAAAAATCTTGCTGGAGCTATTATTGCTAAGTAACATTTACCCCCTGAAGTTAATGGATCAATCAAGAGAGATGTGGGCTGTAATGAATCGTCTTATTGAATTAACAGGTTGGATCGTTCTTGTCGTTTCAGTCATTCTTCTTGGCGTGGCGAGTCACATTGACAACTATCAGCCACCTGAACAGAGTGCTTCGGTACAACACAAGTAAGCTCTGCACTTGTGGAGCGACATGCTGCCCGTCCGGGTGCATGTTTTCACTTGTCGGATATTAAACCAGGAATTTATTATCTTGTTCGATGTTGTTGGTGATTGTCAGG +>gi|49175990|ref|NC_000913.2|:3071479-3071928 +CAATACGCGGATGGCGTCATCACCAACAAAAAGTTGATCGCGTTCCTGTCGTACTTCCCATGCAAAACGGCCATGGCTGGTGTCATATTTCAACAAATGCGCCATGCCCGCAGCATCCGCCAGTTCGTTGATTGCCACCACGGTAATTTCCGCCCGGCGTCCGGATTCATACAAAGCACGAACCACATTACGCCCGATGCGACCGAAGCCATTTATCGCTACGCGTACGGTCATAGATCTCCTGCAAGGTTTTCCCTGAGCAAATTTGCCAGACAGAGTAATCCAGCAAATCGTCCGGGGAAACCTTACCTGTCGCAAACTGCGACTGATTGGTTAATTGTCGAACATTTAATCGACTGAAACGCTTCAGCTAGGATAAGCGAAACGTGGAATAAAAGGAATGTTTGTCCAGCCGAAGAAGACATTTATCTGACTCACATCACACTTTT 
+>gi|49175990|ref|NC_000913.2|:2555273-2555549 +ACGCCATCACGCTGCGTACAATTTCTTCAATCTGTTTTTGATCCATGATATGTTATCTCCGCGTCATCAGAAGAACAGTGACGGATCGCCCGCCCGTTTGGTCAGGCGACCGTTTGCCATAATGCCCATGCTTTCCAGCCAGCGTTCAAACTCCGGTGACGGGCGCAGGTTGAGTAACTGACGCACAGTGGCGGTATCGTGGAATGCGGTGGTCTGGTAGTTGAGCATGATGTCATCACCCAGCGGCATCCCCATGATGTAGTTGCAGCCTGCGGT +>gi|49175990|ref|NC_000913.2|:2555708-2555925 +CCCAATAAAGCCGACCACGGTGTTGACGATAAACGGATCGTAATGACGCGCCAGCCCGTAGTTACGTGCTTCCATCGTTACCTGGTCTGCGCCGAAGTTAGCGCCAGCGGATAGCGCAGAGCCTTGTCCGGTTTCGAAGTAGAGGCAGTTTTCCCCGGCGATACGGTTGAACTCCGCGCCCACTGCGCGCGCTTCGTCGAGCATCGCCAGCTCCACG +>gi|49175990|ref|NC_000913.2|:4271997-4272239 +TTGGAATGCATTACCCGGAGTGTTGTGTAACAATGTCTGGCCAGGTTTGTTTCCCGGAACCGAGGTCACAACATAGTAAAAGCGCTATTGGTAATGGTACAATCGCGCGTTTACACTTATTCAGAACGATTTTTTTCAGGAGACACGAACATGGCCAGCAGAGGCGTAAACAAGGTTATTCTCGTTGGTAATCTGGGTCAGGACCCGGAAGTACGCTACATGCCAAATGGTGGCGCAGTTGC +>gi|49175990|ref|NC_000913.2|:3012980-3013194 +GCTGGTTTTCCCCCCAGCACCAACAACAGAAATCACAGTGGGACGTTTCTGCGCACCTAAATCAATGACTAATGCCGATGGGTCAACTATACTTTTCACGAGTCTTTATGACCTCTCTGGGATAAATTATCCCCAACTTAATCCATCAGGAAGTAACGCAATTATCAGGCGTTATTAGCCCCTATAAATAATGGAACCACTATGTCAGCCATCG +>gi|49175990|ref|NC_000913.2|:2534208-2534454 +TACTGTCATTGAATTTGATCTGCCGCTGCTGGAAGAGAAAGCCAAGTCTACCCTGACTCCGGTTGTTATCTCCAACATGGACGAAATCAAAGAACTGATCAAACTGTCCGGTAGCGTAACCGTGGGTGAAACCCCGGTTATCCGCATCAAGAAGTAATTCTTGCCGCAGTGAAAAATGGCGCCCATCGGCGCCATTTTTTTATGCTTCCGCCAGCGGCGGCAAAATCAATTCATCGCTCTCATGCT +>gi|49175990|ref|NC_000913.2|:2515979-2516199 +ACTAACCCGGACGTAGACTGTATTACAAAAGCGGCAAAAAGCAGAGACAAAAAACCCCCGCTTTGCAGCGAGGGTTGGAAATTTGGTGGAGCTAAGCGGGATCGAACCGCTGACCTCTTGCATGCCATGCAAGCGCTCTCCCAGCTGAGCTATAGCCCCACGATGCGTTTACGTACCAAGTTTGCTGGGTGCAAAATTTGGTGGAGCTAAGCGGGATCGA +>gi|49175990|ref|NC_000913.2|:1150978-1151189 +GAAGAAGAGTTTGATACTGAGATTCCGGACGAAGAAGCTGAGAAAATCACCACCGTTCAGGCTGCCATTGATTACATCAACGGCCACCAGGCGTAAGTGAACATCTCCAGGCGGTCGTTCGACCGCCTGAGTTTTATCTTTTTGTCCCACTAGAATCATTTTTTCCCTCCCTGGAGGACAAACGTGTCTAAGCGTCGTGTAGTTGTGACCG 
+>gi|49175990|ref|NC_000913.2|:4117321-4117604 +ATGTCACAAAGCTCGGAAGTATCGTATTTCATAGGCTTAACATTCAGTTGCTGCGAGAATTTTCAGTATATCGCGCTATGTGGGCTGTTGGCAAAATCATCAATTGTTAATTGATATTTGTCAGTTATGCTGCCCACTGGCTTAGGAATATCCCTAAAACAAACAGCAGGTTAGTCAGTAACGCTCCCTTGACAGTACGTTCCAGCATTGGTCGCATCGCCACCGGGTCCATTTCCCGCATCACATAACGGGCTTGCTTCACCAGTAATGGTGCCGCCAGCAG +>gi|49175990|ref|NC_000913.2|:1300798-1301168 +AATACCTATACCCGGAATATGTACATTGTGAAGCACTAATGGCAATACGTGGGGCAGGAGTGTCCTGCTCCACGGTGTCTGATTTTTATCGCATTACAGAAGGCACAGGCCAGAAGGTAGGGCAATGTTAAAATTTATTCTACGTCGCTGTCTGGAAGCGATTCCGACGCTATTTATTCTTATTACTATTTCGTTCTTTATGATGCGCCTCGCGCCGGGAAGCCCTTTTACCGGCGAACGTACTTTACCGCCAGAAGTGATGGCCAATATCGAAGCGAAATATCATCTTAATGATCCGATCATGACACAGTATTTCAGCTACCTGAAACAACTGGCGCACGGTGATTTCGGTCCATCGTTTAAATATAAA +>gi|49175990|ref|NC_000913.2|:3443896-3444135 +AGCTGCAGCTTCTTAGTGAAGCCTTCGGTAACACCGATAACCATTGAGTTCAGCAGGGCACGCGCGGTACCAGCCTGTGCCCAACCGTCTGCGTAACCATCACGCGGACCGAAGGTCAGGGTATTATCTGCATGTTTAACTTCAACAGCATCGTTGAGAGTACGAGTCAGCTCGCCGTTTTTACCTTTGATCGTAATAACCTGACCGTTGATTTTTACGTCAACGCCGGCAGGAACAAC +>gi|49175990|ref|NC_000913.2|:1146804-1147070 +GGCAACTGGGGAAAGACCAAACCGGGCGGCGACGATACCTTGACACGTCTAACCCTGGCGTTAGATGTCATGGGAGGGGATTTTGGCCCTTCCGTGACAGTGCCTGCAGCATTGCAGGCACTGAATTCTAATTCGCAACTCACTCTTCTTTTAGTCGGCAATTCCGACGCCATCACGCCATTACTTGCTAAAGCTGACTTTGAACAACGTTCGCGTCTGCAGATTATTCCTGCGCAGTCAGTTATCGCCAGTGATGCCCGGCCTTC +>gi|49175990|ref|NC_000913.2|:2763490-2763711 +GCGATTGTAATCACACTTGATATTATAAAACACAGTTGCACGCATTATTTCCTGGTTGGTAGGGTCATATCTCGATGCTCTTTGAGCAATGTCAACATCGCGTGTTCATGGCTTTCTATATTGTTGATGCCTTGCCCATCCGGACCCCACTCCTTATAGCTCATCATGATGGCTTGTTTTGGGCTCTCTGCTCGTTCTGTTTCGACGTAGAATTTTTTCTT +>gi|49175990|ref|NC_000913.2|:3439416-3439667 +AGCAGAGCCAACAGGTTTTCACCGGTGTTGCCTTTCAGACGTGCTGCTTCTTTGTAGTAGTTACGGAACTGACGCTCCAGCACACCATAGATACGGCGAACTTTTTGCTTTTCACGCAACTGCACACCATAGTCAGACAGACGCGGTTTACGCGCACCGTGCTGGCCAGGAGCTTGTTCAATTTTACACTTGGTATCGATCGCGCGAACGCCAGACTTAAGGAATAAGTCGGTGCCCTCACGACGGCTCAG +>gi|49175990|ref|NC_000913.2|:1797134-1797397 
+ACACCTGATGAGACAGGCTTTTTATTTTTCAAAACGCGCATACAAAAAAAGCCTCCACTGGGAGGCTTTCAGGCGCTGTTTTCCGTTTCTCTTCTCACGCGCTAGCCTCCTGGATTCAGGTGCTAAAGTAAAAAAAGAAGCGGAAAATAGCAGCATTCATTGCTTGCGTTACCTTTTGGTACTCTTCAAAAGACCTTTATTGAAAAGGCTACGGCGATAAAAGTCAATGTTTTGATGGCGTTGAAACGAAAAGAGGGAGACTA +>gi|49175990|ref|NC_000913.2|:490493-490730 +TCGCAGTTGCAATTATTGCGTACAGCCAGTACATTCTGGCGTTTTCGAGCACAGGCGCAGGCGGTCAAAGGTTAAACAACTGTTACTTTTGATACGTTTAAAACGCGCCGTGAGTACCACCGTAACAAGCAGGCATACACTTATGACCGCGACTGCACAGCAGCTTGAGTATCTCAAAAATAGCATCAAAAGCATTCAGGACTACCCAAAACCCGGCATTCTTTTCCGCGATGTCAC +>gi|49175990|ref|NC_000913.2|:4374708-4375112 +AAGCGCAGCAATAAGCAATAACGGTACGACAGCTGTGTCGTGCCGTTTGTTTTTTCTGCGATAGTCACAAAGGTAATAGTTGAAATTCCCCTGCCACCTGGCAAAATATCCGTTCAACCATCAGCTTTGCAGGACGACCTGCAAACGCCTCTTTTCACCGGGGACGGCCCCAATTCTCCGGAGCCTGATATGTCCTGGATTATCTTAGTTATTGCTGGTCTGCTGGAAGTGGTATGGGCCGTTGGCCTGAAATATACCCACGGCTTTAGTCGTTTGACGCCGAGTGTTATTACTGTGACGGCGATGATTGTCAGTATGGCGCTACTTGCCTGGGCGATGAAATCGTTACCAGTAGGGACGGCTTATGCCGTGTGGACGGGTATTGGCGCAGTCGGCGCGGCCAT +>gi|49175990|ref|NC_000913.2|:2682753-2683214 +CCAGGGAAAACGGCAGAGTTCAGTTTTTTGTACAGCTCTTCGCTACCACCTTTCGCCAGGATCAGGCCGCCGCGCGGACCCGCCAGGGTTTTGTGAGTGGTGGTAGTAACAACGTGAGCATGAGGAACCGGGTTCGGGTAGACGCCAGCAGCAACCAGGCCCGCAACGTGCGCCATATCAACGAACAGGTAAGCACCGATGCTGTCAGCGATTTCACGCATTTTCGCCCAGTCCACCACGCCGGAATATGCAGAGAAACCACCGATAATCATTTTCGGCTTGTGTTCTTTGGCTTGTTTTTCCAGATCGGCGTAGTCGATATGACCGGTAGCATCGATACCGTAAGGAACGATGTTGTACAGTTTACCGGAGAAGTTAACCGGAGAACCGTGAGTCAGGTGACCGCCATGCGCCAGGTTCATACCCAGAACGGTATCACCTGGTTCCAGCAGCGCGGTGTA +>gi|49175990|ref|NC_000913.2|:3920123-3920527 +CCACCAGCACCAAAACCCACGTAACGATCAGCGGCAAGAATACCGCCTTTAAAACCGCCAACGCCACCACCAGTAACACCAACATCGCCAGAACTTTGAAAGCTTCGCCAAATGCGAATGTCCAGGCCACCCGGCCTTTCGCTGGTGTATGCGCCTGGTGACGCCAGGCAAATATCATAAACAAAACGTTAGGCAGAAAGACTGCCAGGCCCCCGCTTATTGCAGAGACGCCCCAGAAGGGGTCTTTGAGGCTGAACAGCAATCCACTTGCTATCACCACCAGTAACTGAACGAGCAGAAGCTTCCGAGCAACGTTTCGACTCACGAGCGACACAGACATCACGTTTTTCACTCCTGCTCCCTTCGAGGTATGCCGCGTGTCGTATAAAACTTTCTTTAAGGCT 
+>gi|49175990|ref|NC_000913.2|:3182372-3182658 +TTTTCACGGTCTTCATCATCAAGCACCATTACACCGCGTCCTTCACGCAGCGCAGCCAGTGCATTTTCAACACGTTCGAAAGGCGTACCAAAAGAGGAAAGTAGCGTCTGATTCATGGTAAAAAAACCTCACTAAAATTATGGTTACCAGAATCAGGGCAGTCTTAGGAGTGGCGGCATATAGCCAAAATAACGTGAGCGGGTCCATGCCCGACAGAATCGTTACTCTCTCCCATCCGGACTCTAACCGTCGGCCCCGGAATTACACCGGATCTGCTGTCCTTTGA +>gi|49175990|ref|NC_000913.2|:4011929-4012297 +GCTGTCTGCGGGTCTGATCAATGGTCGTAACGTCTGGCGCGCCGATCTTACCGAGAAATATGCGCAAATTAAGGACATTGTCGGCAAACGTGATTTGTGGGTGGCATCTTCCTGCTCGTTGCTGCACAGCCCCATCGACCTGAGCGTGGAAACGCGTCTTGATGCAGAAGTGAAAAGCTGGTTTGCCTTCGCCCTACAAAAATGCCATGAACTGGCACTGCTGCGCGATGCGCTGAACAGTGGTGACACGGCAGCTCTGGCAGAGTGGAGCGCCCCGATTCAGGCACGTCGTCACTCTACCCGCGTACATAATCCGGCGGTAGAAAAGCGTCTGGCGGCGATCACCGCCCAGGACAGCCAGCGTGCGA +>gi|49175990|ref|NC_000913.2|:4012748-4013029 +ACTCTGCTGGTCGTTCCCGCGTGAAGATGTCAGCCGTGAAACCATCGCCAAACAGATTGCGCTGGCGCTGCGTGATGAAGTGGCCGATCTGGAAGCCGCTGGAATTGGCATCATCCAGATTGACGAACCGGCGCTGCGCGAAGGTTTACCGCTGCGTCGTAGCGACTGGGATGCGTATCTCCAGTGGGGCGTAGAGGCCTTCCGTATCAACGCCGCCGTGGCGAAAGATGACACACAAATCCACACTCACATGTGTTATTGCGAGTTCAACGACATCATGG +>gi|49175990|ref|NC_000913.2|:1260828-1261367 +GGGGATAACAGCGGTGATACGACCTGCGGAAGCACGACGCAGGGCATCAACCATAACGACTAATTCCATCAGGTTGTCGTTAGTAGGGGCACAAGTGGACTGGATGATGAAAATATCACCACCGCGTACATTTTCATTAATTTGTACGCTGACTTCGCCATCGCTAAAGCGACCTACAGCGGCGTCGCCGAGTGAAGTGTACAGGCGGTTGGCAATACGTTGTGCTAGTTCCGGGGTGGCGTTACCAGCAAAAAGCTTCATATCAGGCACGAGAAGAACCTCAGGCATGCGTCCATTGGTGGAAAGAATCTGCCGAAAACTGTGCGGGCCAGGCATGATCCTTTCCAGGCGGTGTATTAAAGAGCGCGATGCAACGTCTGGAACAAGGTGACGTTGTCACCGAAACTCAGCTTGCCCGGCTTAAAGCATGGCTCTGTGCAATGGGGAAAGATTAGCGCCTTTCGCCACAAAGCCATTGAGCCATTCCGGGGCTTGCTCTAGCACCTGGCGGGCTTCAGACTCTGTATCAAATTCAGCAA +>gi|49175990|ref|NC_000913.2|:4297384-4297617 +TTCATCGGTCTCGCTCCAGTTAATCAAATCACGCATACGCGCTCTCGACTACAGTATGCATCTTTTATGCCACATTTTATGTGGGGTCATTCCCTGATATTACGGGCACTATTTATTCAAAACTCTGACGAAAAACAGGCTGTCGTCAGTTTTGACGTGACGAAACGAAATACCGCGTGACAGCCATCACGCGGCAGACATTTTATTTTTTCTCGACGACGGGACCTGCCTGA +>gi|49175990|ref|NC_000913.2|:3537802-3538063 
+ACGAACGCCTTGCCTGACATAAAAGTGCCGGAGAATATCTCCGGCATTTTTATTCCACAGCCAAACTCATAATATATTCCGGCAATATTTATCATTTCATTAACAACTGAAACCTTAATTAAACATTAGCCAGTCCGGGTAATTCACTATTCGAATTATATTTTCGCTGCGATATAACCTTGAGCCACATCAACATTGAGTCAGATTATTATTCAAACCAACATTCGCACACATTTTAAGTATTGCTGATAGAAACCATTC +>gi|49175990|ref|NC_000913.2|:2706555-2706903 +AACAATTGTGCAAGAGGACGGTTATCGAGGCGTGCATGTCGATAACGCAGAGACTCAACACCCTGTTTATTGATGCTGATGAATGACAGCTCGTAATTCAGTGACTGACTGGCCAGGTTCATCTGCTGTAATAACGCCCCGGACGCGGGAGTGGCCGAGGCGTTAGCAGAGAATAACAGGCTACCTGTCACTAATGACATGGCAAACCAAAGTTGCTTCATTACTGCGATTGCGTTCCTAAAGTTTGAATTCCTGGCACCTGTACAGCGGCTTGCTGGGTTTGCGCCTGCTCAAACTGAAGCTGTTCAGAGTGGAGTCGGCGTTGCAGTTCGTAATCCTGCAACATTG +>gi|49175990|ref|NC_000913.2|:3438442-3439207 +GATGACCAGCTTGTCCAGGTCGGTACGCTGTTCTACACGCGCTGCTTCAACATTGTAGGCAATACGCTCCACAGGGCTGTAGCATGCGTCGACCAGCAGACGGCCGATTGGGCGCTCATCTTCTTCCGAATGAATTCGGGTAGAAGCCGGCACATAACCACGACCGCGCTGAACTTTGATACGCATGCTAATAGACGCGTTCTCATCGGTCAGGTGGCAGATCACGTGCTGCGGCTTGACGATTTCGACATCACCGTCGTGGGTGATATCGGCTGCAGTCACAGGGCCAATGCCAGATTTATTCAAGGTAAGAATAACTTCATCTTTGCCCTGAACTCTCACCGCCAGCCCTTTCAGGTTGAGCAGGATTTCCAGGATATCTTCCTGAACGCCTTCTTTGGTGCTGTACTCATGTAGTACACCATCAATCTCAACCTCGGTCACCGCGCAACCCGGCATCGATGAGAGCAGAATACGGCGCAGTGCGTTACCCAGAGTATGGCCAAAGCCACGCTCTAAAGGCTCAAGGGTCACCTTGGCGTGCGTCGAACTCACTTGCTCGATATCAACCAGGCGCGGTTTTAGAAACTCTGTCACAGAACCCTGCATTGTGTCCTCTCTTTGGTACTAAGCTTTACTTGGAGTAAAGCTCGACGATCAGGTGTTCGTTAATGTCCGCAGACAGATCAGAACGCTCCGGCTTACGCTTAAACGTACCTTCCATCTTGCCAGCATCAACTTCCAGCCAGGTTGGCTTTTCACGCT +>gi|49175990|ref|NC_000913.2|:107370-107670 +TGCAGGCTGTCCTGGCGAAACAGGAAGCCTGGGAATATGTGACCTTCCAGGACGACGCAGAACTGCCGTTGGCCTTCAAAGCGCCTTCAGCTGTACTGGCATAACGACATTTATACTGTCGTATAAAATTCGACTGGCAAATCTGGCACTCTCTCCGGCCAGGTGAACCAGTCGTTTTTTTTTGAATTTTATAAGAGCTATAAAAAACGGTGCGAACGCTGTTTTCTTAAGCACTTTTCCGCACAACTTATCTTCATTCGTGCTGTGGACTGCAGGCTTTAATGATAAGATTTGTGCGCT +>gi|49175990|ref|NC_000913.2|:3989163-3989427 
+AGGCGATACGTCTTGTACCTCTATATTGAGACTCTGAAACAGAGACTGGATGCCATAAATCAATTGCGTGTGGATCGCGCGCTTGCTGCTATGGGGCCTGCATTCCAACAGGTCTACAGTCTACTGCCGACATTGTTGCACTATCACCATCCGCTAATGCCGGGTTACCTTGATGGTAACGTTCCCAAAGGCATTTGCCTTTACACGCCTGATGAAACTCAACGCCACTACCTGAACGAGCTTGAACTGTATCGTGGAATGTCA +>gi|49175990|ref|NC_000913.2|:2848510-2848892 +CTTTTAATTTATTGTTATTAAAGAGATTTTTAAGCTAAAGATGAATTTCGTCGCCGTGTCGACGTGTCATTTCGACATCATCGACATTATTCACCGCAGGGATAATCAACACTGGCACAATTATTGCTTGTAGCTGGCAATAGTTAATGGGAGGCGATATGCACGAAATAACCCTCTGCCAACGGGCACTGGAATTGATCGAACAGCAGGCCGCAAAACACGGCGCAAAACGCGTAACTGGGGTCTGGCTCAAAATTGGCGCATTTTCTTGTGTCGAAACCAGCTCTCTTGCCTTTTGTTTTGATCTGGTTTGCCGCGGCAGCGTGGCGGAAGGTTGTAAACTGCACCTCGAAGAACAAGAGGCCGAATGCTGGTGTGAAAC +>gi|49175990|ref|NC_000913.2|:3442416-3442806 +CGCGACGTACGCCACCGCCAGAACGAGACTTCTGACCTTTGTGACCACGACCACCGGTTTTACCGAGGCCAGAACCGATACCACGACCCAGGCGTTTACCCGCCTTTTTGGAGCCTTCGGCCGGAGACAGAGTATTTAAACGCATCTCTTACTCCTCAACTTTAACCATGAAGGAAACCGCGTTGATCATACCGCGAATAGCAGGAGTATCCTCGCGCTCTACGGTGTGACCAATACGACGCAGACCCAGGCCAAGCAGCGTTGCCTTGTGTTTCGGCAGACGACCGATTGCACTGCGGGTTTGAGTAATTTTAATAGTCTTTGCCATGGTTTATTTCCCCAGAATTTCTTCAACGGATTTACCACGCTTGGCAGCGACCATTTCTGGAG +>gi|49175990|ref|NC_000913.2|:3309298-3309542 +GTAAAGCCTCTCATTAGCCGCGCGAACCTCTGCAACGGAAGATCATTCATAGCAACAATACATTAGTTTCCAGTGAATTGCTGCCGTCAGCTTGAAAAAAGGGGCCACTCAGGCCCCCTTTTCTGAAACTCGCAAGAATTAGCGACGCAGACCCAGGCGCTCGATGAGCTGGGTGTAACGTGCTACGTCTTTACGTTTCAGGTAGTCGAGCAGTTTACGACGCTGAGAAACCATGCGCAGCAGA +>gi|49175990|ref|NC_000913.2|:4177028-4177278 +ACGTAGCTGTTAACCTCGGCATCGACGCTCGTAAATCTGACCAGAACGTACGTGGTGCAACTGTACTGCCGCACGGTACTGGCCGTTCCGTTCGCGTAGCCGTATTTACCCAAGGTGCAAACGCTGAAGCTGCTAAAGCTGCAGGCGCAGAACTGGTAGGTATGGAAGATCTGGCTGACCAGATCAAGAAAGGCGAAATGAACTTTGACGTTGTTATTGCTTCTCCGGATGCAATGCGCGTTGTTGGCCA +>gi|49175990|ref|NC_000913.2|:3444295-3444579 
+TCATCTTTACGTTTATAGATGCGCAGACCTGGGCGGCTGACACGCTGAATGCTTTCTACAACAGCTTTGCCCTGGAAATACTTCAGAGTAAGTTCCAGTTCAGGCTTGGTGTCGCCTTCAACTTTAAAATCTTCAATAAAACCTTCTTCCTTCAGCACGTTGGCGATTGCCACTTTCAGCTTGGAGGAAGGCATGGTGACCGCAGCTTTGTTCGCGGCCTGACCGTTACGGATACGGGTCAGCATATCCGCGATCGGATCTTGCATGCTCATCTGTCTTTACTC +>gi|49175990|ref|NC_000913.2|:3444631-3444941 +TCACCGCGCATAGCGGCTTCACGGACCTTAATACGGCTCAACCCGAACTTCCGCAGGAAACCATGCGGACGACCTGTTTGACGGCAGCGGTTACGCTGACGAGACGGGCTGGAATCACGCGGCAGAGTCTGCAGCTTGAGAACAGCGTTCCAACGATCTTCGTCGGAAGCGTTCACATCAGAGATGATCGCTTTCAGTTCAGCGCGTTTCGCGAAGTATTTATCAGCTAAAGCTACGCGTTTTACTTCGCGTGCTTTCATTGATTGCTTAGCCATTTAGTAACCCTACCTTACTTGCGGAACGGGAAGTC +>gi|49175990|ref|NC_000913.2|:698-1386 +GCTGATTTGCCGTGGCGAGAAAATGTCGATCGCCATTATGGCCGGCGTATTAGAAGCGCGCGGTCACAACGTTACTGTTATCGATCCGGTCGAAAAACTGCTGGCAGTGGGGCATTACCTCGAATCTACCGTCGATATTGCTGAGTCCACCCGCCGTATTGCGGCAAGCCGCATTCCGGCTGATCACATGGTGCTGATGGCAGGTTTCACCGCCGGTAATGAAAAAGGCGAACTGGTGGTGCTTGGACGCAACGGTTCCGACTACTCTGCTGCGGTGCTGGCTGCCTGTTTACGCGCCGATTGTTGCGAGATTTGGACGGACGTTGACGGGGTCTATACCTGCGACCCGCGTCAGGTGCCCGATGCGAGGTTGTTGAAGTCGATGTCCTACCAGGAAGCGATGGAGCTTTCCTACTTCGGCGCTAAAGTTCTTCACCCCCGCACCATTACCCCCATCGCCCAGTTCCAGATCCCTTGCCTGATTAAAAATACCGGAAATCCTCAAGCACCAGGTACGCTCATTGGTGCCAGCCGTGATGAAGACGAATTACCGGTCAAGGGCATTTCCAATCTGAATAACATGGCAATGTTCAGCGTTTCTGGTCCGGGGATGAAAGGGATGGTCGGCATGGCGGCGCGCGTCTTTGCAGCGATGTCACGCGCCCGTATTTCCGTGGTGCTGATTACG +>gi|49175990|ref|NC_000913.2|:2309968-2310357 +ACGCGAGTTGCGTTGTAGGTCTGGGTGTACTGAGCAGCCAGGTAGATGTTGTTAGCGTCGTATTTCAGACCACCAGTGTAGGTTTCAGCACGGTCGCCGTTACCGATGTAAGCAGCGGTGTTCTGAGCATCAGTACGTTTGGAGCTGGAGATCGCACCACCGATACCGAAACCTTCGTAATCATAAGTGATAGAACCGCCGACGCCGTCGCCGTTTTGACGCAGTGCGTCACGACCGTTGTTAGTTACGCCACTAGTAAAGCCTTCACCAGATGGGTTGCCGTTTTTACCCTGGTACTGAACAGCAAAGTTCAGGCCGTCAACCAGACCGAAGAAGTCAGTGTTACGGTAGGTCGCGAAGCCGTTACCACGCTGCTGCATGAAGTTGTC +>gi|49175990|ref|NC_000913.2|:189636-189958 
+AAATTCGACGTCTGATGCTGTACACAGCGCCAACAATTATTGGTGTCCACGACGTATTTGTGGTATAAAGCGCGCCGGACTTCCGATCCATTTCGTATACACAGACTGGACGGAAGCGACAATCTCACTTTGTGTAACAACACACACGTATCGGCACATATTCCGGGGTGCCCTTTGGGGTCGGTAATATGGGATACGTGGAGGCATAACCCCAACTTTTATATAGAGGTTTTAATCATGGCAACTGTTTCCATGCGCGACATGCTCAAGGCTGGTGTTCACTTCGGTCACCAGACCCGTTACTGGAACCCGAAAATGAAGC diff --git a/2024/ebaiin1/chip-seq/hands-on/hands-on.Rmd b/2024/ebaiin1/chip-seq/hands-on/hands-on.Rmd new file mode 100644 index 0000000..e0e875e --- /dev/null +++ b/2024/ebaiin1/chip-seq/hands-on/hands-on.Rmd @@ -0,0 +1,1133 @@ +--- +title: "ChIP-seq workshop - Hands-on Roscoff 2024" +author: "Elodie Darbo; Stéphanie Le Gras; Delphine Potier; Morgane Thomas-Chollier; Tao Ye;" +date: "2024 Novembre 19-21" +output: + html_document: + fig_caption: yes + toc: yes + toc_depth: 5 + toc_float: yes + number_sections: yes + word_document: + toc: yes + toc_depth: '2' + pdf_document: + fig_caption: yes + keep_tex: yes + toc: yes + toc_depth: 2 +bibliography: references.bib +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE, root.dir="~/Documents/Formations/EBAII/2024/ebaiin1/chip-seq/hands-on") +``` + +# - Introduction +## - Goal + +The aim is to : + + * understand the nature of ChIP-Seq data + * perform a complete analysis workflow including quality check (QC), read mapping, visualization in a genome browser and peak-calling. Use command line and open source software for each step of the workflow and feel the complexity of the task + * have an overview of some possible downstream analyses + * perform a motif analysis with online web programs + +## - Summary +This training gives an introduction to ChIP-seq data analysis, covering the processing steps starting from the reads to the peaks. Among all possible downstream analyses, the practical aspect will focus on motif analyses. A particular emphasis will be put on deciding which downstream analyses to perform depending on the biological question. 
This training does not cover all methods available today. It does not aim at bringing users to a professional NGS analyst level but provides enough information to allow biologists to understand what DNA sequencing is in practice and to communicate with NGS experts for more in-depth needs. + +## - Dataset description +For this training, we will use two datasets: + +* a dataset produced by Myers et al. [Pubmed](http://www.ncbi.nlm.nih.gov/pubmed/23818864) on transcription factors involved in the regulation of gene expression under anaerobic conditions in bacteria. We will focus on one factor: **FNR**. The advantage of this dataset is its small size, allowing real time execution of all steps of the workflow. +* a dataset of ChIP-seq peaks obtained in different mouse tissues for the p300 co-activator protein by Visel et al. [Pubmed](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2745234/); we will use this dataset to illustrate downstream annotation of peaks using R. + +# - Downloading ChIP-seq reads from NCBI +**Goal**: Identify the datasets corresponding to the studied article and retrieve the data (reads as FASTQ files) corresponding to 2 replicates of a condition and the corresponding control. + +## - Obtaining an identifier for a chosen dataset +NGS datasets are (usually) made freely accessible for other scientists, by depositing these datasets into specialized databanks. [Sequence Read Archive (SRA)](http://www.ncbi.nlm.nih.gov/sra), located in the USA and hosted by NCBI, and its European equivalent [European Nucleotide Archive (ENA)](http://www.ebi.ac.uk/ena), located in England and hosted by EBI, both contain **raw reads**. + +Functional genomic datasets (transcriptomics, genome-wide binding such as ChIP-seq,...) are deposited in the databases [Gene Expression Omnibus (GEO)](http://www.ncbi.nlm.nih.gov/geo/) or its European equivalent [ArrayExpress](https://www.ebi.ac.uk/arrayexpress/). + +Within an article of interest, search for a sentence mentioning the deposition of the data in a database. 
Here, the following sentence can be found at the end of the Materials and Methods section: +*"All genome-wide data from this publication have been deposited in NCBI’s Gene Expression Omnibus (**GSE41195**)."* +We will thus use the **GSE41195** identifier to retrieve the dataset from the **NCBI GEO** (Gene Expression Omnibus) database. + +## - Accessing GSE41195 from GEO +1. The GEO database hosts processed data files and many details related to the experiments. SRA (Sequence Read Archive) stores the actual raw sequence data. +2. Search in Google **GSE41195**. Click on the first link to directly access the correct page on the GEO database. +![alt text][geo] +3. This GEO entry is a mixture of expression analysis (Nimblegen Gene Expression Array), chip-chip and chip-seq. At the bottom of the page, click on the subseries related to the chip-seq datasets. (this subseries has its own identifier: **GSE41187**). +![alt text][geo2] +4. From this page, we will focus on the experiment **FNR IP ChIP-seq Anaerobic A**. At the bottom of the page, click on the link "**GSM1010219** - FNR IP ChIP-seq Anaerobic A". +5. In the new page, go to the bottom to find the SRA identifier. This is the identifier of the raw dataset stored in the SRA database. +![alt text][geo3] +6. Click on the identifier **SRX189773** + +## - Downloading FASTQ file from the SRA database +SRA stores sequences in a FASTQ format. + +1. Click on **SRR576933** in SRA ![alt text][sra1] +2. There are statistics on the run that generated the data. ![alt text][sra2] +2. Click on FASTA/FASTQ download. On the next page, there is a link to the FASTQ file. 
For efficiency, this file has already been downloaded and is available in the "data" folder (SRR576933.fastq.gz) +![alt text][sra3] + +**tip**: To download the replicate and control datasets, we should redo the same steps starting from the GEO web page specific to the chip-seq datasets (see step 2.4) and choose **FNR IP ChIP-seq Anaerobic B** and **anaerobic INPUT DNA**. Downloaded FASTQ files are available in the data folder (SRR576934.fastq.gz and SRR576938.fastq.gz respectively) + +**At this point, you have three FASTQ files, two IPs, one control (INPUT).** + +# - Connect to the server and set up your environment +During this training, we will work on the cluster provided by the Institut Français de Bioinformatique (IFB) using JupyterLab through the ondemand system. + +1. Go to [ondemand](https://ondemand.cluster.france-bioinformatique.fr/) +2. Select JupyterLab: Core +![alt text][selectjupyterlab] + +3. Fill the form as such: + - account: 2422_ebaii_n1, + - CPUS: 2 + - Amount of memory: 10G + - Number of hours: 7 +![alt text][jupyterlabform] + +4. Once the job is running, click on Connect to Jupyter +![alt text][launchjupyterlab] + + +## - Set up your working environment +1. Go to your project directory +```{bash eval=FALSE} +cd /shared/projects/ +``` +2. Create a directory that will contain all results of the upcoming analyses. +```{bash eval=FALSE} +mkdir EBAII2024_chipseq +``` +3. Go to the newly created directory +```{bash eval=FALSE} +cd EBAII2024_chipseq +``` +4. Copy the directory containing data + +```{bash eval=FALSE} +cp -r /shared/projects/2422_ebaii_n1/chipseq/EBAII2024_chipseq/data . +``` + +7. 
Your directory structure should be like this + ``` +/shared/projects//EBAII2024_chipseq +│ +└───data +``` + +If you wish, you can check your directory structure: +```{bash eval=FALSE} + tree +``` + +# - Quality control of the reads and statistics +**Goal**: Get some basic information on the data (read length, number of reads, global quality of datasets) + +## - Generating the FASTQC report +Before you analyze the data, it is crucial to check the quality of the data. We will use the standard tool for checking the quality of data generated on the Illumina platform: [FASTQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/). + +1. Create a directory named **01-QualityControl** in which to output results from fastqc +```{bash eval=FALSE} +mkdir 01-QualityControl +``` +2. Go to the directory you've just created +```{bash eval=FALSE} +cd 01-QualityControl +``` + +Your directory structure should be like this +``` +/shared/projects//EBAII2024_chipseq +│ +└───data +│ +└───01-QualityControl <- you should be in this folder +``` +3. Make FastQC available in your environment +```{bash eval=FALSE} +module load fastqc/0.12.1 +``` +4. Check the help page of the program to see its usage and parameters. + +```{bash eval=FALSE} +fastqc --help +``` +5. Launch the FASTQC program on the experiment file (FNR_IP_ChIP-seq_Anaerobic_A.fastq.gz) + * -o: creates all output files in the specified output directory. '.' means current directory. +```{bash eval=FALSE} +fastqc ../data/FNR_IP_ChIP-seq_Anaerobic_A.fastq.gz -o . +``` +6. Wait until the analysis is finished. Check the FastQC result files. +```{bash eval=FALSE} +ls +``` +> FNR_IP_ChIP-seq_Anaerobic_A_fastqc.html FNR_IP_ChIP-seq_Anaerobic_A_fastqc.zip + +7. Go to the directory /shared/projects//EBAII2024_chipseq/01-QualityControl in the directory tree on the left of the JupyterLab window and double click on FNR_IP_ChIP-seq_Anaerobic_A_fastqc.html to visualize the file. +![alt text][fastqc] + +8. 
Launch the FASTQC program on the replicate (FNR_IP_ChIP-seq_Anaerobic_B.fastq.gz) and on the control file (Anaerobic_INPUT_DNA.fastq.gz) + +**Analyze the result of the FASTQC program:** + + * **How many reads are present in each file?** + * **What is the read length?** + * **Is the overall quality good for the three samples?** + * **Are there any concerns raised by the report? If so, can you tell where the problem might come from?** + +9. Once you are done with FastQC, unload it +```{bash eval=FALSE} +module unload fastqc/0.12.1 +``` + +# - Mapping the reads with Bowtie +**Goal**: Obtain the coordinates of each read on the reference genome. + +## - Choosing a mapping program +There are multiple programs to perform the mapping step. For reads produced by an Illumina machine for ChIP-seq, the current "standard" program is Bowtie (versions 1 and 2)[@langmead_ultrafast_2009] [@langmead_fast_2012]. We will use **Bowtie version 2.5.1** for this exercise. + +## - Bowtie +1. Load Bowtie +```{bash eval=FALSE} +module load bowtie2/2.5.1 +``` +2. Try out bowtie +```{bash eval=FALSE} +bowtie2 +``` +This prints the help of the program. However, this is a bit difficult to read! If you need to know more about the program, it's easier to directly check out the manual on the [website](https://bowtie-bio.sourceforge.net/bowtie2/manual.shtml). + +3. Bowtie needs the reference genome to align each read on it. The genome needs to be in a specific format (=index) for bowtie to be able to use it. Several pre-built indexes are available for download on the bowtie webpage, but our genome is not there. You will need to make this index file. + +4. Create a directory named **02-Mapping** in which to output mapping results +```{bash eval=FALSE} +cd .. +mkdir 02-Mapping +``` +5. Go to the directory you've just created +```{bash eval=FALSE} +cd 02-Mapping +``` + +## - Prepare the index file +1. To make the index file, you will need the complete genome, in FASTA format. 
It has already been downloaded to gain time (Escherichia_coli_K12.fasta in the course folder) (The genome was downloaded from the NCBI). + +2. Create a directory named **index** in which to output bowtie indexes +```{bash eval=FALSE} +mkdir index +``` +3. Go to the newly created directory +```{bash eval=FALSE} +cd index +``` +4. Try out bowtie2-build +```{bash eval=FALSE} +bowtie2-build +``` +5. Build the index for bowtie +```{bash eval=FALSE} +## Creating genome index : provide the path to the genome file and the name to give to the index (Escherichia_coli_K12) +bowtie2-build ../../data/Escherichia_coli_K12.fasta Escherichia_coli_K12 +``` +6. Go back to upper directory i.e 02-Mapping +```{bash eval=FALSE} +cd .. +``` + +## - Mapping the samples +1. Create a directory named **bam** to put mapping results +```{bash eval=FALSE} +mkdir bam +``` +2. Go to the newly created directory bam +```{bash eval=FALSE} +cd bam +``` +Your directory structure should be like this: +``` +/shared/projects//EBAII2024_chipseq +│ +└───data +│ +└───01-QualityControl +│ +└───02-Mapping +| └───index +| └───bam <- you should be here +``` + + +3. Let's see the parameters of bowtie before launching the mapping: + * -x to specify genome index prefix + * -U to specify file with reads to be mapped + * -3 will trim x base from the end of the read. As our last position is of low quality, we'll trim 1 base. 
+ * -S will output the result in SAM format + * --mm allows many concurrent bowtie processes on the same computer to share the same memory image of the index + * -o FNR_IP_ChIP-seq_Anaerobic_A.mapping.out (sbatch option) will write the job output, including the mapping statistics printed by bowtie2, to the file FNR_IP_ChIP-seq_Anaerobic_A.mapping.out +```{bash eval=FALSE} +## Run alignment +## Tip: first type bowtie command line then add quotes around and prefix it with "sbatch --cpus-per-task 10 --wrap=" +module load formation/2421 + +sbatch -p fast -o FNR_IP_ChIP-seq_Anaerobic_A.mapping.out --cpus-per-task 10 --wrap="bowtie2 -p 10 --mm -3 1 -x ../index/Escherichia_coli_K12 -U ../../data/FNR_IP_ChIP-seq_Anaerobic_A.fastq.gz -S FNR_IP_ChIP-seq_Anaerobic_A.sam" +``` +This should take a few minutes as we work with a small genome. For the human genome, we would need both more time and more resources. + +**Analyze the result of the mapped reads: +Open the file FNR_IP_ChIP-seq_Anaerobic_A.mapping.out (for example using the ` less ` command), which contains some statistics about the mapping. How many reads were mapped? How many multi-mapped reads were originally present in the sample? To quit less press 'q'** + +Bowtie output is a [SAM](https://samtools.github.io/hts-specs/SAMv1.pdf) file. The SAM format corresponds to large text files, that can be compressed ("zipped") into a BAM format. BAM files take up to 4 times less disk space and are usually sorted and indexed for fast access to the data they contain. The index of a given .bam file is named .bam.bai or .bai file. Some tools require the index of the bam file to process it. + +4. Multimapped reads are given a very low mapping quality (below 10). Remove reads whose mapping quality is below 10, sort the sam file and create a bam file using samtools [@li_sequence_2009]. samtools view is used to filter data based on mapping quality and samtools sort is used to sort data based on genomic coordinates. 
+ * -@: number of processors to use + * -q: to set a threshold to the mapping quality + * -b: to output a BAM file (it is a SAM file by default) + * -o: to specify an output file name +```{bash eval=FALSE} +## First load samtools +module load samtools/1.18 +## Then run samtools +samtools view -@ 2 -q 10 -b FNR_IP_ChIP-seq_Anaerobic_A.sam | samtools sort -@ 2 - -o FNR_IP_ChIP-seq_Anaerobic_A.bam +``` + +5. Create an index for the bam file +```{bash eval=FALSE} +samtools index FNR_IP_ChIP-seq_Anaerobic_A.bam +``` + +6. Compress the .sam file (you could also delete the file) +```{bash eval=FALSE} +gzip FNR_IP_ChIP-seq_Anaerobic_A.sam +``` + +7. Once it's done, unload the tools you used +```{bash eval=FALSE} +module unload samtools/1.18 bowtie2/2.5.1 +``` + +## - Map the second replicate and the control +1. Repeat the steps above (3 -> 6 - Mapping) for the files FNR_IP_ChIP-seq_Anaerobic_B.fastq.gz and Anaerobic_INPUT_DNA.fastq.gz in the directory named "**bam**" within the directory 02-Mapping. + +**Analyze the result of the mapped reads: +How many reads were mapped for samples Anaerobic_INPUT_DNA and FNR_IP_ChIP-seq_Anaerobic_B?** + +# - Estimating the number of duplicated reads +**Goal**: Duplicated reads, i.e. reads mapped at the same positions in the genome, are present in ChIP-seq results. They can arise for several reasons including a biased amplification during the PCR step of the library prep, DNA fragments coming from repetitive elements of the genome, sequencing saturation or the same clusters read several times on the flowcell (i.e. optical duplicates). As analyzing ChIP-Seq data consists in detecting signal enrichment, we cannot keep duplicated reads for subsequent analysis. So let's detect them using [Picard](http://broadinstitute.github.io/picard/) [@broadinstitute_picard]. + +1. Go to the directory with alignment files +```{bash eval=FALSE} +cd /shared/projects//EBAII2024_chipseq/02-Mapping/bam +``` +2. 
Run Picard markDuplicates to mark duplicated reads (= reads mapping at the exact same location on the genome) + * CREATE_INDEX: Create .bai file for the result bam file with marked duplicate reads + * INPUT: input file name to mark for duplicate reads + * OUTPUT: output file name + * METRICS: file with duplicates marking statistics + * VALIDATION_STRINGENCY: Validation stringency for all SAM files read by picard. +```{bash eval=FALSE} +## Load picard +module load picard/2.23.5 + +## Run picard +picard MarkDuplicates \ +-CREATE_INDEX true \ +-INPUT FNR_IP_ChIP-seq_Anaerobic_A.bam \ +-OUTPUT Marked_FNR_IP_ChIP-seq_Anaerobic_A.bam \ +-METRICS_FILE metric + +``` + +To determine the number of duplicated reads marked by Picard, we can run the `samtools flagstat` command: + +```{bash eval=FALSE} +## Add samtools to your environment +module load samtools/1.18 +## run samtools +samtools flagstat Marked_FNR_IP_ChIP-seq_Anaerobic_A.bam +``` + +**Run picard MarkDuplicates on the 2 other samples. How many duplicates are found in each sample?** + +Go back to working home directory (i.e /shared/projects//EBAII2024_chipseq/) +```{bash eval=FALSE} +## Unload picard and samtools +module unload samtools/1.18 picard/2.23.5 +## If you are in 02-Mapping/bam +cd ../.. +``` + +# - ChIP quality controls +**Goal**: This exercise aims at plotting the **Lorenz curve** to assess the quality of the chIP. + +## - Plot the Lorenz curve with Deeptools +1. Create a directory named **03-ChIPQualityControls** in which to put mapping results for IP +```{bash eval=FALSE} +mkdir 03-ChIPQualityControls +``` +2. Go to the newly created directory +```{bash eval=FALSE} +cd 03-ChIPQualityControls +``` +3. 
Run Deeptools [plotFingerprint](http://deeptools.readthedocs.io/en/latest/content/tools/plotFingerprint.html) [@ramirez_deeptools2:_2016] to draw the Lorenz curve + * -b: List of indexed BAM files + * -plot: File name of the output figure (extension can be either “png”, “eps”, “pdf” or “svg”) + * --numberOfSamples: how many regions are used to plot the graph + * -p: Number of processors to use (2 processors) +```{bash eval=FALSE} +## Load deeptools in your environment +module load deeptools/3.5.4 +## Run deeptools fingerprint +plotFingerprint \ + -p 2 \ + --numberOfSamples 10000 \ + -b ../02-Mapping/bam/FNR_IP_ChIP-seq_Anaerobic_A.bam \ + ../02-Mapping/bam/FNR_IP_ChIP-seq_Anaerobic_B.bam \ + ../02-Mapping/bam/Anaerobic_INPUT_DNA.bam \ + -plot fingerprint_10000.png +``` +4. If plotFingerprint takes too much time to run, take the file that has already been prepared for the training. +```{bash eval=FALSE} +cp /shared/home/slegras/2421_m22_bims/slegras/03-ChIPQualityControls/fingerprint.png . +``` +5. Go find the file using the directory tree on the left of the Jupyterlab panel and click on the fingerprint.png file to display it in Jupyterlab. + +**Look at the result file fingerprint.png (add the plot to this report). Give an explanation of the curves.** + +Go back to the working home directory (i.e. /shared/projects//EBAII2024_chipseq) +```{bash eval=FALSE} +## Unload deepTools +module unload deeptools/3.5.4 +## If you are in 03-ChIPQualityControls +cd .. +``` + +# - Visualizing the data in a genome browser +**Goal**: Check whether the IP worked: visualize the data in their genomic context. + +## - Choosing a genome browser +There are several options for genome browsers, divided between the local browsers (which you need to install on your computer, e.g. IGV) and the online genome browsers (e.g. UCSC genome browser, Ensembl). We often use both types, depending on the aim and the location of the data. 
+If the data are on your computer, to prevent data transfer, it's easier to visualize the data locally (IGV). Note that if you're working on a non-model organism, the local viewer will be the only choice. If the aim is to share the results with your collaborators, view many tracks in the context of many existing annotations, then the online genome browsers are more suitable. + +## - Viewing the raw alignment data in IGV +1. Download the following files from the server onto your computer + * data/Escherichia_coli_K12.fasta + * data/Escherichia_coli_K_12_MG1655.annotation.fixed.gtf + * 02-Mapping/bam/FNR_IP_ChIP-seq_Anaerobic_A.bam + * 02-Mapping/bam/FNR_IP_ChIP-seq_Anaerobic_A.bam.bai + * 02-Mapping/bam/FNR_IP_ChIP-seq_Anaerobic_B.bam + * 02-Mapping/bam/FNR_IP_ChIP-seq_Anaerobic_B.bam.bai + * 02-Mapping/bam/Anaerobic_INPUT_DNA.bam + * 02-Mapping/bam/Anaerobic_INPUT_DNA.bam.bai +2. Open IGV on your computer +3. Load the genome + * Genomes / Load Genome from File... + * Select the fasta file Escherichia_coli_K12.fasta located into the data directory +4. Load an annotation file named Escherichia_coli_K_12_MG1655.annotation.fixed.gtf into IGV + * File / Load from File... + * Select the annotation file. The positions of the genes are now loaded. +5. Load the three bam files (FNR_IP_ChIP-seq_Anaerobic_A.bam, FNR_IP_ChIP-seq_Anaerobic_B.bam and Anaerobic_INPUT_DNA.bam) in IGV. + * File / Load from File... + * Select the bam files. +![alt text][igvbam] + +**Browse around in the genome. Specifically go to the following genes: pepT (geneID:b1127), ycfP (geneID:b1108). Do you see peaks (add screenshots to this report).** + +However, looking at BAM file as such does not allow to directly compare the two samples as data are not normalized. Let's generate normalized data for visualization. + +## - Viewing scaled data +[bamCoverage](https://deeptools.readthedocs.io/en/latest/content/tools/bamCoverage.html) from deepTools generates BigWigs out of BAM files +1. 
Try it out +```{bash eval=FALSE} +## Load deeptools in your environment +module load deeptools/3.5.4 +## run bamCoverage +bamCoverage --help +``` +2. Create a directory named **04-Visualization** to store bamCoverage outputs +```{bash eval=FALSE} +mkdir 04-Visualization +``` +3. Go to the newly created directory +```{bash eval=FALSE} +cd 04-Visualization +``` + +Your directory structure should be like this: +``` +/shared/projects//EBAII2024_chipseq +│ +└───data +│ +└───01-QualityControl +│ +└───02-Mapping +| └───index +| └───bam +│ +└───03-ChIPQualityControls +│ +└───04-Visualization <- you should be in this folder +``` + +4. Generate a scaled bigwig file on the IP with bamCoverage + * --bam: BAM file to process + * --outFileName: output file name + * --outFileFormat: output file type + * --effectiveGenomeSize : size of the mappable genome + * --normalizeUsing : different overall normalization methods; the command below uses the CPM method (counts per million mapped reads). The RPGC method, corresponding to 1x average coverage, is the alternative that makes use of --effectiveGenomeSize + * --skipNonCoveredRegions: skip non-covered regions + * --extendReads 200: Extend reads to fragment size + * --ignoreDuplicates: reads that have the same orientation and start position will be considered only once +```{bash eval=FALSE} +bamCoverage \ + --bam ../02-Mapping/bam/Marked_FNR_IP_ChIP-seq_Anaerobic_A.bam \ + --outFileName FNR_IP_ChIP-seq_Anaerobic_A_nodup.bw \ + --outFileFormat bigwig \ + --effectiveGenomeSize 4639675 \ + --normalizeUsing CPM \ + --skipNonCoveredRegions \ + --extendReads 200 \ + --ignoreDuplicates +``` + +5. Do it for the replicate and the control. +6. Download the three bigwig files you have just generated + * 04-Visualization/FNR_IP_ChIP-seq_Anaerobic_A_nodup.bw + * 04-Visualization/FNR_IP_ChIP-seq_Anaerobic_B_nodup.bw + * 04-Visualization/Anaerobic_INPUT_DNA_nodup.bw +7. Load the three bigwig files in IGV + * File / Load from File... + * Select the three bigwig files. +8. 
Set the visualization of the three bigwig files to be autoscaled + * Right-click on the name of the tracks and select **Autoscale** + * Right-click on the name of the tracks and set the windowing function to Maximum + +**Go back to the genes we looked at earlier: pepT, ycfP (add screenshots to this report). Look at the shape of the signal.** +**Keep IGV open.** + +Go back to working home directory (i.e. /shared/projects//EBAII2024_chipseq) +```{bash eval=FALSE} +## If you are in 04-Visualization +cd .. +``` + +# - Peak calling with MACS2 +**Goal**: Detect the peaks which are regions with high densities of reads and that correspond to where the studied factor was bound + +## - Choosing a peak-calling program +There are multiple programs to perform the peak-calling step. Some are more directed towards histone marks (broad peaks) while others are specific to transcription factors which present narrow peaks. Here we will use the callpeak function of MACS2 (version 2.2.7.1) because it's known to produce generally good results, and it is well-maintained by the developer. + +## - Calling the peaks +1. Create a directory named **05-PeakCalling** and one directory named **replicates** within to store peak coordinates. +```{bash eval=FALSE} +mkdir 05-PeakCalling +mkdir 05-PeakCalling/replicates +``` +2. Go to the newly created directory replicates +```{bash eval=FALSE} +cd 05-PeakCalling/replicates +``` +3. Try out MACS2 +```{bash eval=FALSE} +## Load macs2 in your environment +module load macs2/2.2.7.1 +macs2 callpeak --help +``` +This prints the help of the program. + +4. Let's see the parameters of MACS before launching the peak calling: + * ChIP-seq tag file (-t) is the name of our experiment (treatment) mapped read file FNR_IP_ChIP-seq_Anaerobic_A.bam + * ChIP-seq control file (-c) is the name of our input (control) mapped read file Anaerobic_INPUT_DNA.bam + * --format BAM indicates that the input files are in BAM format. Other formats can be specified (SAM,BED...) 
+ * --gsize Effective genome size: this is the size of the genome considered "usable" for peak calling. This value is given by the MACS developers on their website. It is smaller than the complete genome because many regions are excluded (telomeres, highly repeated regions...). The default value is for human (2700000000.0), so we need to change it. As the value for E. coli is not provided, we will take the complete genome size 4639675. + * --name provides a prefix for the output files. We set this to FNR_Anaerobic_A, but it could be any name. + * --bw The bandwidth is the size of the fragment extracted from the gel electrophoresis or expected from sonication. By default, this value is 300bp. Usually, this value is indicated in the Methods section of publications. In the studied publication, a sentence mentions "400bp fragments (FNR libraries)". We thus set this value to 400. + * --fix-bimodal indicates that in the case where macs2 cannot find enough paired peaks between the plus strand and minus strand to build the shifting model, it can bypass this step and use an extension size of 200bp by default. + * -p 1e-2 indicates that we report the peaks if their associated p-value is lower than 1e-2. This is a relaxed threshold as we want to keep a high number of false positives in our peak set to later compute the IDR analysis. + * &> repA_MACS.out will output the verbosity (=information) in the file repA_MACS.out +```{bash eval=FALSE} +macs2 callpeak \ + -t ../../02-Mapping/bam/FNR_IP_ChIP-seq_Anaerobic_A.bam \ + -c ../../02-Mapping/bam/Anaerobic_INPUT_DNA.bam \ + --format BAM \ + --gsize 4639675 \ + --name 'FNR_Anaerobic_A' \ + --bw 400 \ + --fix-bimodal \ + -p 1e-2 \ + &> repA_MACS.out +``` +5. Run macs2 for replicate A and replicate B. + +6. In a new directory called pool, run macs2 for the pooled replicates A and B by giving both bam files as input treatment files (-t). +```{bash eval=FALSE} +# You should be in 05-PeakCalling +cd .. 
+mkdir pool +cd pool + +# Run macs2 for pooled replicates +macs2 callpeak \ + -t ../../02-Mapping/bam/FNR_IP_ChIP-seq_Anaerobic_A.bam \ + ../../02-Mapping/bam/FNR_IP_ChIP-seq_Anaerobic_B.bam \ + -c ../../02-Mapping/bam/Anaerobic_INPUT_DNA.bam \ + --format BAM \ + --gsize 4639675 \ + --name 'FNR_Anaerobic_pool' \ + --bw 400 \ + --fix-bimodal \ + -p 1e-2 \ + &> pool_MACS.out +``` + +## - Analyzing MACS results +**Look at the files that were created by MACS. Explain the content of the result files.** +**How many peaks were detected by MACS2 for each sample and in the pool of samples?** + +## - Calling peaks in a replicate-aware method (IDR) +In order to take advantage of having biological replicates, we will create a combined set of peaks based on the reproducibility of each individual replicate peak calling. We will use the **Irreproducible Discovery Rate** (IDR) algorithm. + +1. Create a new directory to store the peak coordinates resulting from the IDR analysis +```{bash eval=FALSE} +## You should be 05-PeakCalling +cd .. +mkdir idr +cd idr +``` +Your directory structure should be like this: +``` +/shared/projects//EBAII2024_chipseq +│ +└───data +│ +└───01-QualityControl +│ +└───02-Mapping +| └───index +| └───bam +│ +└───03-ChIPQualityControls +│ +└───04-Visualization +| +└───05-PeakCalling +| └───replicates +| └───pool +| └───idr <- you should be in this folder +``` + +2. Load the module idr and have a look at its parameters +```{bash eval=FALSE} +## Load idr in your environment +module load idr/2.0.4.2 +idr --help +``` +* --samples : peak files of each individual replicate +* --peak-list : the peak file of the pooled replicates, it will be used as a master peak set to compare with the regions from each replicate +* --input-file-type : format of the peak file, in our case it is narrowPeak +* --output-file : name of the result file +* --plot : draw additional diagnostic plots + +3. 
Run idr +```{bash eval=FALSE} +idr \ + --samples ../replicates/FNR_Anaerobic_A_peaks.narrowPeak \ + ../replicates/FNR_Anaerobic_B_peaks.narrowPeak \ + --peak-list ../pool/FNR_Anaerobic_pool_peaks.narrowPeak \ + --input-file-type narrowPeak \ + --output-file FNR_anaerobic_idr_peaks.bed \ + --plot +``` + +**Add the IDR graph to this report. How many peaks are found with the IDR method?** + +4. Remove IDR and MACS2 from your environment and go back to working home directory (i.e. /shared/projects//EBAII2024_chipseq) +```{bash eval=FALSE} +module unload macs2/2.2.7.1 +module unload idr/2.0.4.2 + +## If you are in 05-PeakCalling/idr +cd ../.. +``` + +## - Visualize peaks in IGV + +1. Download the following BED files from the server onto your computer to visualise them in IGV: +* 05-PeakCalling/replicates/FNR_Anaerobic_A_peaks.narrowPeak +* 05-PeakCalling/replicates/FNR_Anaerobic_B_peaks.narrowPeak +* 05-PeakCalling/pool/FNR_Anaerobic_pool_peaks.narrowPeak +* 05-PeakCalling/idr/FNR_anaerobic_idr_peaks.bed + +**Go back again to the genes we looked at earlier: pepT, ycfP. Do you see peaks (add the 2 screenshots to this report)?** +**Navigate through the genome to find peaks detected in the replicates (peak calling per replicate) and not found/kept with the IDR method** + +**From now on, the peak set we keep is the IDR peak set.** + +# - Motif analysis +**Goal**: Define binding motif(s) for the ChIPed transcription factor and identify potential cofactors + +## - Retrieve the peak sequences corresponding to the peak coordinate file (BED) + +For the motif analysis, you first need to extract the sequences corresponding to the peaks. There are several ways to do this (as usual...). If you work on a UCSC-supported organism, the easiest is to use RSAT fetch-sequences or Galaxy. Here, we will use Bedtools [@bedtools], as we have the genome of interest on our computer (Escherichia_coli_K12.fasta). +1. 
Create a directory named **06-MotifAnalysis** to store data needed for motif analysis +```{bash eval=FALSE} +mkdir 06-MotifAnalysis +``` +2. Go to the newly created directory +```{bash eval=FALSE} +cd 06-MotifAnalysis +``` + +Your directory structure should be like this: +``` +/shared/projects//EBAII2024_chipseq +│ +└───data +│ +└───01-QualityControl +│ +└───02-Mapping +| └───index +| └───bam +│ +└───03-ChIPQualityControls +│ +└───04-Visualization +│ +└───05-PeakCalling +│ +└───06-MotifAnalysis <- you should be in this folder +``` + +3. Extract peak sequence in fasta format +```{bash eval=FALSE} +## First load samtools +module load samtools/1.18 +## Create an index of the genome fasta file +samtools faidx ../data/Escherichia_coli_K12.fasta + +## First load bedtools +module load bedtools/2.30.0 +## Extract fasta sequence from genomic coordinate of peaks +bedtools getfasta \ + -fi ../data/Escherichia_coli_K12.fasta \ + -bed ../05-PeakCalling/idr/FNR_anaerobic_idr_peaks.bed \ + -fo FNR_anaerobic_idr_peaks.fa +``` +4. Download the file FNR_anaerobic_idr_peaks.fa on your computer + +## - Motif discovery with RSAT +1. Open a connection to a Regulatory Sequence Analysis Tools server. You can choose between various website mirrors. + * Teaching Server (recommended for this training) [https://rsat.france-bioinformatique.fr/teaching/](https://rsat.france-bioinformatique.fr/teaching/) +2. In the left menu, click on **NGS ChIP-seq** and then click on **peak-motifs**. A new page opens, with a form +3. The default peak-motifs web form only displays the essential options. There are only two mandatory parameters. + * The **title box**, which you will set as **FNR Anaerobic** . The **sequences**, that you will **upload from your computer**, by clicking on the button Choose file, and select the file **FNR_anaerobic_idr_peaks.fa** from your computer. +4. We will now modify some of the advanced options in order to fine-tune the analysis according to your data set. 
+ * Open the "Reduce peak sequences" title, and make sure the **Cut peak sequences: +/- ** option is set to **0** (we wish to analyze our full dataset) + * Open the “Motif Discovery parameters” title, and check the **oligomer sizes 6 and 7** (but not 8). Check "Discover over-represented spaced word pairs **[dyad-analysis]**" + * Under “Compare discovered motifs with databases”, **add RegulonDB prokaryotes** (2015_08) as the studied organism is the bacterium E. coli. +5. Click “**GO**”. +6. The Web page displays a link. You can already click on this link. The report will be progressively updated during the processing of the workflow. + +**Is there anything interesting in RSAT results? If so, which motif is of interest and why (add screenshot of the results).** + +# - Peak annotation + +**Goals**: Associate ChIP-seq peaks to genomic features, identify closest genes and run ontology analyses + +1. Create a directory named **07-PeakAnnotation** +```{bash eval=FALSE} +# aller dans le répertoire si besoin +cd .. + +mkdir 07-PeakAnnotation +``` +2. Go to the newly created directory +```{bash eval=FALSE} +cd 07-PeakAnnotation +``` + +## - Associate peaks to closest genes + +[annotatePeaks.pl](http://homer.ucsd.edu/homer/ngs/annotation.html) from the Homer suite [@heinz_simple_2010] associates peaks with nearby genes. + +1. Create a file suitable for annotatePeaks.pl. To run, the tool needs a peak bed file composed of 6 fields (chr, start, end, name, score, strand). The first 5 columns of the file ../05-PeakCalling/idr/FNR_anaerobic_idr_peaks.bed are good but all other columns are of no use and the strand is missing. To generate a file with a correct format, we are using the tool cut to select fields 1 to 5 of the peak file and we add a "+" to every line using awk (*this is a code example that can do what we want, not the only solution to do so.*). 
+```{bash eval=FALSE} +cut \ + -f 1-5 \ + ../05-PeakCalling/idr/FNR_anaerobic_idr_peaks.bed | \ + awk -F "\t" '{print $0"\t+"}' \ + > FNR_anaerobic_idr_peaks.bed +``` +2. Try annotatePeaks.pl +```{bash eval=FALSE} +## First load bedtools +module load homer/4.11 + +## run Homer annotatePeaks +annotatePeaks.pl --help +``` +Let's see the parameters: + +annotatePeaks.pl peak/BEDfile genome > outputfile + User defined annotation files (default is UCSC refGene annotation): + annotatePeaks.pl accepts GTF (gene transfer formatted) files to annotate positions relative + to custom annotations, such as those from de novo transcript discovery or Gencode. + + -gtf (Use -gff and -gff3 if appropriate, but GTF is better) + + +3. Annotation peaks with nearby genes with Homer +```{bash eval=FALSE} +annotatePeaks.pl \ + FNR_anaerobic_idr_peaks.bed \ + ../data/Escherichia_coli_K12.fasta \ + -gtf ../data/Escherichia_coli_K_12_MG1655.annotation.fixed.gtf \ + > FNR_anaerobic_idr_annotated_peaks.tsv +``` + +**Look at the file you generated. Gene symbols are not present. Let's add them with some R code.** + +4. Launch Rstudio in [ondemand](https://ondemand.cluster.france-bioinformatique.fr/) +![alt text][launchrstudio] + +5. Add gene symbol annotation using R with Rstudio +```{R eval=FALSE, include=TRUE} + +## set working directory +setwd("/shared/projects//EBAII2024_chipseq/07-PeakAnnotation") +## Or navigate using the "Files" tab and click on "More">"Set as Working Directory" + +## read the file with peaks annotated with homer +## data are loaded into a data frame +## sep="\t": this is a tab separated file +## header=TRUE: there is a line with headers (ie. 
column names) +d <- read.table("FNR_anaerobic_idr_annotated_peaks.tsv", sep="\t", header=TRUE) + +## Load a 2-columns files which contains in the first column gene IDs +## and in the second column gene symbols +## data are loaded into a data frame +## header=FALSE: there is no header line +gene.symbol <- read.table("../data/Escherichia_coli_K_12_MG1655.annotation.tsv.gz", header=FALSE) + +## Merge the 2 data frames based on a common field +## by.x gives the columns name in which the common field is for the d data frame +## by.y gives the columns name in which the common field is for the gene.symbol data frame +## d contains several columns with no information. We select only interesting columns +d.annot <- merge(d[,c(1,2,3,4,5,6,8,10,11)], gene.symbol, by.x="Nearest.PromoterID", by.y="V1") + +## Change column names of the resulting data frame +colnames(d.annot)[2] <- "PeakID" # name the 2d column of the new file "PeakID" +colnames(d.annot)[dim(d.annot)[2]] <- "Gene.Symbol" + +## output the merged data frame to a file named "FNR_anaerobic_idr_final_peaks_annotation.tsv" +## col.names=TRUE: output column names +## row.names=FALSE: don't output row names +## sep="\t": table fields are separated by tabs +## quote=FALSE: don't put quote around text. +write.table(d.annot, "FNR_anaerobic_idr_final_peaks_annotation.tsv", col.names=TRUE, row.names=FALSE, sep="\t", quote=FALSE) + +``` + +**What information is listed in each column of the file? (print column names and explain them)** + +**How many genes are associated to the "promoter-TSS" feature?** + +**What are all the possible gene features? (see in column Annotation - extract information like promoter-TSS, TSS, ...). Create a plot (pie chart, barplot...) showing the proportion of each of them (include both the plot and the code that created it in the report).** + +6. Go back to working home directory (i.e /shared/projects/training/\/M2.2-BIMS-epigenomique) +```{bash eval=FALSE} +## If you are in 07-PeakAnnotation +cd .. 
+``` + +## - Search for Biological Processes, Molecular Functions or Cellular Compartments enrichment +Use the official gene symbols from the file FNR_anaerobic_idr_final_peaks_annotation.tsv to search for enriched gene ontologies with the tool DAVID (Database for Annotation, Visualization and Integrated Discovery). Input your gene list on the DAVID website: https://david.ncifcrf.gov/. **Use DAVID convert ID tool if needed** + +**Are there biological processes enriched in the list of genes associated with the peaks? Show the top results of the Functional Annotation Clustering.** +**Are these genes enriched in some KEGG pathway? Which ones?** + +# - Bonus: Annotation of ChIP-peaks using R tools + +**In this part, we will use a different set of peaks obtained using a peak caller from a set of p300 ChIP-seq experiments in different mouse embryonic tissues (midbrain, forebrain and limb).** + +## - Obtain the bed files from GEO + +1. We will download the already called peak files in bed format from GEO. +Create a new folder and go into it. +```{bash eval=FALSE} +cd /shared/projects//EBAII2024_chipseq +mkdir 07-PeakAnnotation-bonus +cd 07-PeakAnnotation-bonus +``` +2. Search for the dataset **GSE13845** either using Google or from the front page of [GEO](https://www.ncbi.nlm.nih.gov/geo/) +3. On the description page, find the three GSM files, and click on each of them +4. 
On each page, select and download the `GSMxxxxx_p300_peaks.txt.gz` file to the newly created folder (where `xxxxx` represents the GSM number) +You should now have downloaded 3 files: +> GSM348064_p300_peaks.txt.gz (Forebrain) +> GSM348065_p300_peaks.txt.gz (Midbrain) +> GSM348066_p300_peaks.txt.gz (limb) + +```{bash eval=TRUE, include=FALSE} +mkdir 07-PeakAnnotation-bonus +cd 07-PeakAnnotation-bonus +curl -O https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM348nnn/GSM348064/suppl/GSM348064_p300_peaks.txt.gz +curl -O https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM348nnn/GSM348065/suppl/GSM348065_p300_peaks.txt.gz +curl -O https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM348nnn/GSM348066/suppl/GSM348066_p300_peaks.txt.gz +``` + +*Beware: Make sure to check which genome version was used to call the peaks (remember: this is mouse data!)* + +## - Performing a first evaluation of peak sets using R + +Now, we will use **RStudio** to perform the rest of the analysis in R. For the analysis, we will need some R/Bioconductor libraries + + * [ChIPSeeker](https://bioconductor.org/packages/release/bioc/html/ChIPseeker.html) [@chipseeker] + * [mouse gene annotation](http://bioconductor.org/packages/release/data/annotation/html/TxDb.Mmusculus.UCSC.mm9.knownGene.html) + * [mouse functional annotation](http://bioconductor.org/packages/release/data/annotation/html/org.Mm.eg.db.html) + * [clusterProfiler: Gene set annotation tool](http://bioconductor.org/packages/release/bioc/html/clusterProfiler.html) [@clusterprofiler] + +1. Go to Rstudio and execute the R code below (show results in the report) +```{r eval=TRUE, message=FALSE, warning=FALSE, include=TRUE} +# load the required libraries +library(RColorBrewer) +library(ChIPseeker) +library(TxDb.Mmusculus.UCSC.mm9.knownGene) +library(org.Mm.eg.db) +# define the annotation of the mouse genome +txdb = TxDb.Mmusculus.UCSC.mm9.knownGene +# define colors +col = brewer.pal(9,'Set1') +``` + +2. 
Read the peak files for the three datasets: + +```{r eval=FALSE, include=TRUE} +# set the working directory to the folder in which the peaks are stored +setwd("/shared/projects//EBAII2024_chipseq/07-PeakAnnotation-bonus") +# read the peaks for each dataset +peaks.forebrain = readPeakFile('GSM348064_p300_peaks.txt.gz') +peaks.midbrain = readPeakFile('GSM348065_p300_peaks.txt.gz') +peaks.limb = readPeakFile('GSM348066_p300_peaks.txt.gz') +``` + +```{r eval=TRUE, message=FALSE, warning=FALSE, include=FALSE} +peaks.forebrain = readPeakFile('07-PeakAnnotation-bonus/GSM348064_p300_peaks.txt.gz') +peaks.midbrain = readPeakFile('07-PeakAnnotation-bonus/GSM348065_p300_peaks.txt.gz') +peaks.limb = readPeakFile('07-PeakAnnotation-bonus/GSM348066_p300_peaks.txt.gz') +``` + +```{r eval=TRUE, include=TRUE} +# create a list containing all the peak sets +all.peaks = list(forebrain=peaks.forebrain, +midbrain=peaks.midbrain, +limb=peaks.limb) +``` + +The peaks are stored as a **GenomicRanges** object; this is an R format which looks like the bed format, but is optimized in terms of memory requirements and speed of execution. + +We can start by computing some basic statistics on the peak sets. + +### - How many peaks? + +```{r eval=TRUE, include=TRUE} +# check the number of peaks for the forebrain dataset +length(peaks.forebrain) + +# compute the number of peaks for all datasets using the list object +sapply(all.peaks,length) + +# display this as a barplot +barplot(sapply(all.peaks,length),col=col) +``` + +### - How large are these peaks? +```{r eval=TRUE, include=TRUE} +# statistics on the peak length for forebrain +summary(width(peaks.forebrain)) + +# size distribution of the peaks +peaks.width = lapply(all.peaks,width) +lapply(peaks.width,summary) + +# boxplot of the sizes +boxplot(peaks.width,col=col) +``` + +### - What is the score of these peaks? 
+ +Can you adapt the previous code to display a boxplot of the peak score distribution for the Forebrain peak set (column `Maximum.Peak.Height`)? + +### - Where are the peaks located? + +We can now display the genomic distribution of the peaks along the chromosomes, including the peak scores, using the `covplot` function from `ChIPSeeker`: +```{r eval=TRUE, include=TRUE} +# genome wide distribution +covplot(peaks.forebrain, weightCol="Maximum.Peak.Height") +``` + +**Exercise: use the option "lower" in covplot to display only the peaks with a score (Maximum.Peak.Height) above 10** + +### - What does the signal look like at TSS? + +In addition to the genome wide plot, we can check if there is a tendency for the peaks to be located close to gene promoters. +```{r eval=TRUE, include=TRUE} +# define gene promoters +promoter = getPromoters(TxDb=txdb, upstream=5000, downstream=5000) + +# compute the density of peaks within the promoter regions +tagMatrix = getTagMatrix(peaks.limb, windows=promoter) + +# plot the density +tagHeatmap(tagMatrix, palette = "RdYlBu") +``` + +## - Functional annotation of the peaks + +We can now assign the peaks to the closest genes and genomic compartments (introns, exons, promoters, distal regions, etc...) +This is done using the function `annotatePeak` which compares the peak files with the annotation file of the mouse genome. This function returns +a complex object which contains all this information. + +```{r eval=TRUE, include=TRUE} +peakAnno.forebrain = annotatePeak(peaks.forebrain, tssRegion=c(-3000, 3000), TxDb=txdb, annoDb="org.Mm.eg.db") +peakAnno.midbrain = annotatePeak(peaks.midbrain, tssRegion=c(-3000, 3000), TxDb=txdb, annoDb="org.Mm.eg.db") +peakAnno.limb = annotatePeak(peaks.limb, tssRegion=c(-3000, 3000), TxDb=txdb, annoDb="org.Mm.eg.db") +``` + +### - genomic localization + +We can now analyze in more detail the localization of the peaks (introns, exons, promoters, distal regions,...) 
+ +```{r eval=TRUE, include=TRUE} +# distribution of genomic compartments for forebrain peaks +plotAnnoPie(peakAnno.forebrain) + +# for all the peaks +plotAnnoBar(list(forebrain=peakAnno.forebrain, midbrain=peakAnno.midbrain,limb=peakAnno.limb)) +``` + +**Question: do you see differences between the three peak sets?** + + +### - functional annotation + +An important step in ChIP-seq analysis is to interpret genes that are located close to the ChIP peaks. Hence, we need to +1. assign genes to peaks +2. compute functional enrichments of the target genes. + +**Beware:** +By doing so, we assume that the target gene of the peak is always the closest one. Hi-C/4C analyses have shown that in higher eukaryotes, this is not always the case. However, in the absence of data on the real target gene of ChIP-peaks, we can work with this approximation. + +We will compute the enrichment of the Gene Ontology "Biological Process" categories in the set of putative target genes. + +```{r eval=TRUE, message=FALSE, warning=FALSE, include=TRUE} +# load the library +library(clusterProfiler) +``` +```{r eval=TRUE, include=TRUE} +# define the list of all mouse genes as a universe for the enrichment analysis +universe = mappedkeys(org.Mm.egACCNUM) + +## extract the gene IDs of the forebrain target genes +genes.forebrain = peakAnno.forebrain@anno$geneId +ego.forebrain = enrichGO(gene = genes.forebrain, + universe = universe, + OrgDb = org.Mm.eg.db, + ont = "BP", + pAdjustMethod = "BH", + pvalueCutoff = 0.01, + qvalueCutoff = 0.05, + readable = TRUE) + +# display the results as barplots +barplot(ego.forebrain,showCategory=10) +``` + +**Question: do you see an enrichment of the expected categories? What does the x-axis mean? What does the color mean?** + +**Exercise:** redo this analysis for the limb dataset and check if the enriched categories make sense. 
+ + +## FAQ +### How to download the data +**Goal**: Identify the datasets corresponding to the studied article and retrieve the data (reads as FASTQ files) corresponding to 2 replicates of a condition and the corresponding control. + +#### - Obtaining an identifier for a chosen dataset +NGS datasets are (usually) made freely accessible for other scientists, by depositing these datasets into specialized databanks. [Sequence Read Archive (SRA)](http://www.ncbi.nlm.nih.gov/sra) located in USA hosted by NCBI, and its European equivalent [European Nucleotide Archive (ENA)](http://www.ebi.ac.uk/ena) located in England hosted by EBI both contains **raw reads**. + +Functional genomic datasets (transcriptomics, genome-wide binding such as ChIP-seq,...) are deposited in the databases [Gene Expression Omnibus (GEO)](http://www.ncbi.nlm.nih.gov/geo/) or its European equivalent [ArrayExpress](https://www.ebi.ac.uk/arrayexpress/). + +Within an article of interest, search for a sentence mentioning the deposition of the data in a database. Here, the following sentence can be found at the end of the Materials and Methods section: +*"All genome-wide data from this publication have been deposited in NCBI’s Gene Expression Omnibus (**GSE41195**)."* +We will thus use the **GSE41195** identifier to retrieve the dataset from the **NCBI GEO** (Gene Expression Omnibus) database. + +#### - Accessing GSE41195 from GEO +1. The GEO database hosts processed data files and many details related to the experiments. SRA (Sequence Read Archive) stores the actual raw sequence data. +2. Search in Google **GSE41195**. Click on the first link to directly access the correct page on the GEO database. +![alt text][geo] +3. This GEO entry is a mixture of expression analysis (Nimblegen Gene Expression Array), chip-chip and chip-seq. At the bottom of the page, click on the subseries related to the chip-seq datasets. (this subseries has its own identifier: **GSE41187**). +![alt text][geo2] +4. 
From this page, we will focus on the experiment **FNR IP ChIP-seq Anaerobic A**. At the bottom of the page, click on the link "**GSM1010219** - FNR IP ChIP-seq Anaerobic A". +5. In the new page, go to the bottom to find the SRA identifier. This is the identifier of the raw dataset stored in the SRA database. +![alt text][geo3] +6. Copy the identifier **SRX189773** (do not click on the link that would take you to the SRA database, see below why) + +#### - Downloading FASTQ file from the ENA database +Although direct access to the SRA database at the NCBI is doable, SRA does not store sequences in a FASTQ format. So, in practice, it's simpler (and quicker!!) to download datasets from the ENA database (European Nucleotide Archive) hosted by EBI (European Bioinformatics Institute) in UK. ENA encompasses the data from SRA. + +1. Go to the [EBI](http://www.ebi.ac.uk/) website. Paste your SRA identifier (SRX189773) and click on the button "search". +![alt text][ebi4] +2. Click on the first result. On the next page, there is a link to the FASTQ file. For efficiency, this file has already been downloaded and is available in the "data" folder (FNR_IP_ChIP-seq_Anaerobic_A.fastq.gz) +![alt text][ebi5] + +**tip**: To download the replicate and control datasets, we should redo the same steps starting from the GEO web page specific to the chip-seq datasets (see step 2.4) and choose **FNR IP ChIP-seq Anaerobic B** and **anaerobic INPUT DNA**. Downloaded FASTQ files are available in the data folder (FNR_IP_ChIP-seq_Anaerobic_B.fastq.gz and Anaerobic_INPUT_DNA.fastq.gz respectively) + +**At this point, you have three FASTQ files, two IPs, one control (INPUT).** + +### How to extract peaks from the supplementary data of a publication ? +The processed peaks (BED file) is sometimes available on the GEO website, or in supplementary data. Unfortunately, most of the time, the peak coordinates are embedded into supplementary tables and thus not usable "as is". 
+This is the case for the studied article. To be able to use these peaks (visualize them in a genome browser, compare them with the peaks found with another program, perform downstream analyses...), you will need to (re)-create a BED file from the information available. +Here, Table S5 provides the coordinates of the summit of the peaks. The coordinates are for the same assembly as we used. + +1. copy/paste the first column into a new file, and save it as retained_peaks.txt +2. use a PERL command (or awk if you know this language) to create a BED-formatted file. As we need start and end coordinates, we will arbitrarily take +/-50bp around the summit. +```{bash eval=FALSE} +perl -lane 'print "gi|49175990|ref|NC_000913.2|\t".($F[0]-50)."\t".($F[0]+50)."\t" ' retained_peaks.txt > retained_peaks.bed +``` +3. The BED file looks like this: +> gi|49175990|ref|NC_000913.2| 120 220 +> gi|49175990|ref|NC_000913.2| 20536 20636 +> gi|49175990|ref|NC_000913.2| 29565 29665 +> gi|49175990|ref|NC_000913.2| 34015 34115 +4. Depending on the available information, the command will be different. + +### - How to obtain the annotation (=Gene) GTF file for IGV? +Annotation files can be found on genome websites, NCBI FTP server, Ensembl, ... However, IGV required GFF format, or BED format, which are often not directly available. +Here, I downloaded the annotation from the [UCSC Table browser](http://microbes.ucsc.edu/cgi-bin/hgTables?org=Escherichia+coli+K12&db=eschColi_K12&hgsid=1465191&hgta_doMainPage=1) as "Escherichia_coli_K_12_MG1655.annotation.gtf". 
Then, I changed the "chr" to the name of our genome with the following PERL command: + +```{bash eval=FALSE} +perl -pe 's/^chr/gi\|49175990\|ref\|NC_000913.2\|/' Escherichia_coli_K_12_MG1655.annotation.gtf > Escherichia_coli_K_12_MG1655.annotation.fixed.gtf +``` +This file will work directly in IGV + +# References + +[geo]: ../images/1_GEO.png "GEO" +[geo2]: ../images/2_GEO.png "GEO2" +[geo3]: ../images/3_GEO.png "GEO3" +[ebi4]: ../images/4_EBI.png "EBI" +[ebi5]: ../images/5_EBI.png "EBI" +[genome6]: ../images/6_Genomes.png "E. Coli K-12" +[selectjupyterlab]: ../images/selectJupyterLab.png "Select Jupyterlab" +[jupyterlabform]: ../images/jupyterlabForm.png "Jupyerlab Form" +[launchjupyterlab]: ../images/launchJupyterhub.png "launch Jupyterlab" +[fastqc]: ../images/fastqc.png "Jupyterlab FastQC" +[igvbam]: ../images/IGVbam.png "IGV BAM files" +[launchrstudio]: ../images/launchRstudio.png "Launch Rstudio" diff --git a/2024/ebaiin1/chip-seq/hands-on/hands-on.html b/2024/ebaiin1/chip-seq/hands-on/hands-on.html new file mode 100644 index 0000000..c1e3314 --- /dev/null +++ b/2024/ebaiin1/chip-seq/hands-on/hands-on.html @@ -0,0 +1,3155 @@ + + + + + + + + + + + + + + +ChIP-seq workshop - Hands-on Roscoff 2024 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+
+
+
+
+ +
+ + + + + + + +
+

1 - Introduction

+
+

1.1 - Goal

+

The aim is to :

+
    +
  • understand the nature of ChIP-Seq data
  • +
  • perform a complete analysis workflow including quality check (QC), +read mapping, visualization in a genome browser and peak-calling. Use +command line and open source software for each step of the workflow and +feel the complexity of the task
  • +
  • have an overview of some possible downstream analyses
  • +
  • perform a motif analysis with online web programs
  • +
+
+
+

1.2 - Summary

+

This training gives an introduction to ChIP-seq data analysis, +covering the processing steps starting from the reads to the peaks. +Among all possible downstream analyses, the practical aspect will focus +on motif analyses. A particular emphasis will be put on deciding which +downstream analyses to perform depending on the biological question. +This training does not cover all methods available today. It does not +aim at bringing users to a professional NGS analyst level but provides +enough information to allow biologists understand what is DNA sequencing +in practice and to communicate with NGS experts for more in-depth +needs.

+
+
+

1.3 - Dataset +description

+

For this training, we will use two datasets:

+
    +
  • a dataset produced by Myers et al Pubmed involved +in the regulation of gene expression under anaerobic conditions in +bacteria. We will focus on one factor: FNR. The +advantage of this dataset is its small size, allowing real time +execution of all steps of the dataset
  • +
  • a dataset of ChIP-seq peaks obtained in different mouse tissues for +the p300 co-activator protein by Visel et al. Pubmed; +we will use this dataset to illustrate downstream annotation of peaks +using R.
  • +
+
+
+
+

2 - Downloading ChIP-seq +reads from NCBI

+

Goal: Identify the datasets corresponding to the +studied article and retrieve the data (reads as FASTQ files) +corresponding to 2 replicates of a condition and the corresponding +control.

+
+

2.1 - Obtaining an +identifier for a chosen dataset

+

NGS datasets are (usually) made freely accessible for other +scientists, by depositing these datasets into specialized databanks. Sequence Read Archive (SRA) +located in USA hosted by NCBI, and its European equivalent European Nucleotide Archive (ENA) +located in England hosted by EBI both contain raw +reads.

+

Functional genomic datasets (transcriptomics, genome-wide binding +such as ChIP-seq,…) are deposited in the databases Gene Expression Omnibus +(GEO) or its European equivalent ArrayExpress.

+

Within an article of interest, search for a sentence mentioning the +deposition of the data in a database. Here, the following sentence can +be found at the end of the Materials and Methods section: “All +genome-wide data from this publication have been deposited in NCBI’s +Gene Expression Omnibus (GSE41195).” We will thus +use the GSE41195 identifier to retrieve the dataset +from the NCBI GEO (Gene Expression Omnibus) +database.

+
+
+

2.2 - Accessing GSE41195 +from GEO

+
    +
  1. The GEO database hosts processed data files and many details related +to the experiments. SRA (Sequence Read Archive) stores the actual raw +sequence data.
  2. +
  3. Search in Google GSE41195. Click on the first link +to directly access the correct page on the GEO database. alt text
  4. +
  5. This GEO entry is a mixture of expression analysis (Nimblegen Gene +Expression Array), chip-chip and chip-seq. At the bottom of the page, +click on the subseries related to the chip-seq datasets. (this subseries +has its own identifier: GSE41187). alt text
  6. +
  7. From this page, we will focus on the experiment FNR IP +ChIP-seq Anaerobic A. At the bottom of the page, click on the +link “GSM1010219 - FNR IP ChIP-seq Anaerobic A”.
  8. +
  9. In the new page, go to the bottom to find the SRA identifier. This +is the identifier of the raw dataset stored in the SRA database.
    +alt text
  10. +
  11. Click on the identifier SRX189773
  12. +
+
+
+

2.3 - Downloading FASTQ +file from the SRA database

+

SRA stores sequences in a FASTQ format.

+
    +
  1. Click on SRR576933 in SRA ![alt text][sra1]
  2. +
  3. There are statistics on the run that generated the data. ![alt +text][sra2]
  4. +
  5. Click on FASTA/FASTQ download. On the next page, there is a link to +the FASTQ file. For efficiency, this file has already been downloaded +and is available in the “data” folder (SRR576933.fastq.gz)
    +![alt text][sra3]
  6. +
+

tip: To download the replicate and control datasets, +we should redo the same steps starting from the GEO web page specific to +the chip-seq datasets (see step 2.4) and choose FNR IP ChIP-seq +Anaerobic B and anaerobic INPUT DNA. +Downloaded FASTQ files are available in the data folder +(SRR576934.fastq.gz and SRR576938.fastq.gz respectively)

+

At this point, you have three FASTQ files, two IPs, one +control (INPUT).

+
+
+
+

3 - Connect to the server +and set up your environment

+

During this training, we will work on the cluster provided by the +Institut Français de Bioinformatique (IFB) using JupyterLab through the +ondemand system.

+
    +
  1. Go to ondemand

  2. +
  3. Select JupyterLab: Core alt text

  4. +
  5. Fill the form as such:

  6. +
+
    +
  • account: 2422_ebaii_n1,
  • +
  • CPUS: 2
  • +
  • Amount of memory: 10G
  • +
  • Number of hours: 7 alt text
  • +
+
    +
  1. Once the job is running, click on Connect to Jupyter alt text
  2. +
+
+

3.1 - Set up your working +environment

+
    +
  1. Go to your project directory
  2. +
+
cd /shared/projects/<your_project>
+
    +
  1. Create a directory that will contain all results of the upcoming +analyses.
  2. +
+
mkdir EBAII2024_chipseq
+
    +
  1. Go to the newly created directory
  2. +
+
cd EBAII2024_chipseq
+
    +
  1. Copy the directory containing data
  2. +
+
cp -r /shared/projects/2422_ebaii_n1/chipseq/EBAII2024_chipseq/data .
+
    +
  1. Your directory structure should be like this
  2. +
+
/shared/projects/<your_project>/EBAII2024_chipseq
+│
+└───data
+

If you wish, you can check your directory structure:

+
 tree
+
+
+
+

4 - Quality control of +the reads and statistics

+

Goal: Get some basic information on the data (read +length, number of reads, global quality of datasets)

+
+

4.1 - Generating the +FASTQC report

+

Before you analyze the data, it is crucial to check the quality of +the data. We will use the standard tool for checking the quality of data +generated on the Illumina platform: FASTQC.

+
    +
  1. Create a directory named 01-QualityControl in which +to output results from fastqc
  2. +
+
mkdir 01-QualityControl
+
    +
  1. Go to the directory you’ve just created
  2. +
+
cd 01-QualityControl
+

Your directory structure should be like this

+
/shared/projects/<your_project>/EBAII2024_chipseq
+│
+└───data
+│   
+└───01-QualityControl <- you should be in this folder
+
    +
  1. Get FastQC available in your environment
  2. +
+
module load fastqc/0.12.1
+
    +
  1. Check the help page of the program to see its usage and +parameters.
  2. +
+
fastqc --help
+
    +
  1. Launch the FASTQC program on the experiment file +(FNR_IP_ChIP-seq_Anaerobic_A.fastq.gz)
  2. +
+
    +
  • -o: creates all output files in the specified output directory. ‘.’ +means current directory.
  • +
+
fastqc ../data/FNR_IP_ChIP-seq_Anaerobic_A.fastq.gz -o .
+
    +
  1. Wait until the analysis is finished. Check the FastQC result +files.
  2. +
+
ls
+
+

FNR_IP_ChIP-seq_Anaerobic_A_fastqc.html +FNR_IP_ChIP-seq_Anaerobic_A_fastqc.zip

+
+
    +
  1. Go to the directory +/shared/projects/<your_project>/EBAII2024_chipseq/01-QualityControl in +the directory tree on the left of the JupyterLab window and double click +on FNR_IP_ChIP-seq_Anaerobic_A_fastqc.html to visualize the file. alt text

  2. +
  3. Launch the FASTQC program on the replicate +(FNR_IP_ChIP-seq_Anaerobic_B.fastq.gz) and on the control file +(Anaerobic_INPUT_DNA.fastq.gz)

  4. +
+

Analyze the result of the FASTQC program:

+
    +
  • How many reads are present in each file ?
  • +
  • What is the read length ?
  • +
  • Is the overall quality good for the three samples ? +
  • +
  • Are there any concerns raised by the report ? If so, can you +tell where the problem might come from ?
  • +
+
    +
  1. Once you are done with FastQC, unload it
  2. +
+
module unload fastqc/0.12.1
+
+
+
+

5 - Mapping the reads +with Bowtie

+

Goal: Obtain the coordinates of each read to the +reference genome.

+
+

5.1 - Choosing a mapping +program

+

There are multiple programs to perform the mapping step. For reads +produced by an Illumina machine for ChIP-seq, the current “standard” +program is Bowtie (versions 1 and 2)(Langmead et +al. 2009) (Langmead and Salzberg +2012). We will use Bowtie version 2.5.1 for this +exercise.

+
+
+

5.2 - Bowtie

+
    +
  1. Load Bowtie
  2. +
+
module load bowtie2/2.5.1
+
    +
  1. Try out bowtie
  2. +
+
bowtie2
+

This prints the help of the program. However, this is a bit difficult +to read ! If you need to know more about the program, it’s easier to +directly check out the manual on the website.

+
    +
  1. Bowtie needs the reference genome to align each read on it. The +genome needs to be in a specific format (=index) for bowtie to be able +to use it. Several pre-built indexes are available for download on +bowtie webpage, but our genome is not there. You will need to make this +index file.

  2. +
  3. Create a directory named 02-Mapping in which to +output mapping results

  4. +
+
cd ..
+mkdir 02-Mapping
+
    +
  1. Go to the directory you’ve just created
  2. +
+
cd 02-Mapping
+
+
+

5.3 - Prepare the index +file

+
    +
  1. To make the index file, you will need the complete genome, in +FASTA format. It has already been downloaded to gain time +(Escherichia_coli_K12.fasta in the course folder) (The genome was +downloaded from the NCBI).

  2. +
  3. Create a directory named index in which to +output bowtie indexes

  4. +
+
mkdir index
+
    +
  1. Go to the newly created directory
  2. +
+
cd index
+
    +
  1. Try out bowtie2-build
  2. +
+
bowtie2-build
+
    +
  1. Build the index for bowtie
  2. +
+
## Creating genome index : provide the path to the genome file and the name to give to the index (Escherichia_coli_K12)
+bowtie2-build ../../data/Escherichia_coli_K12.fasta Escherichia_coli_K12
+
    +
  1. Go back to upper directory i.e 02-Mapping
  2. +
+
cd ..
+
+
+

5.4 - Mapping the +samples

+
    +
  1. Create a directory named bam to put mapping +results
  2. +
+
mkdir bam
+
    +
  1. Go to the newly created directory bam
  2. +
+
cd bam
+

Your directory structure should be like this:

+
/shared/projects/<your_project>/EBAII2024_chipseq
+│
+└───data
+│   
+└───01-QualityControl
+│   
+└───02-Mapping
+|    └───index
+|    └───bam <- you should be here
+
    +
  1. Let’s see the parameters of bowtie before launching the +mapping:
  2. +
+
    +
  • -x to specify genome index prefix
  • +
  • -U to specify file with reads to be mapped
  • +
  • -3 will trim x base from the end of the read. As our last position +is of low quality, we’ll trim 1 base.
  • +
  • -S will output the result in SAM format
  • +
  • --mm allows many concurrent bowtie processes on the same computer to +share the same memory image of the index
  • +
  • -o FNR_IP_ChIP-seq_Anaerobic_A.mapping.out (an sbatch option, not a bowtie2 one) will write the job output, including some statistics +about the mapping, in the file FNR_IP_ChIP-seq_Anaerobic_A.mapping.out
  • +
+
## Run alignment
+## Tip: first type bowtie command line then add quotes around and prefix it with "sbatch --cpus-per-task 10 --wrap="
+module load formation/2421
+
+sbatch -p fast -o FNR_IP_ChIP-seq_Anaerobic_A.mapping.out --cpus-per-task 10 --wrap="bowtie2 -p 10 --mm -3 1 -x ../index/Escherichia_coli_K12 -U ../../data/FNR_IP_ChIP-seq_Anaerobic_A.fastq.gz -S FNR_IP_ChIP-seq_Anaerobic_A.sam"
+

This should take a few minutes as we work with a small genome. For the +human genome, we would need both more time and more resources.

+

Analyze the result of the mapped reads:
+Open the file FNR_IP_ChIP-seq_Anaerobic_A.mapping.out (for example using +the less command), which contains some statistics about the +mapping. How many reads were mapped? How many multi-mapped reads were +originally present in the sample? To quit less press ‘q’

+

Bowtie output is a SAM file. The +SAM format corresponds to large text files, that can be compressed +(“zipped”) into a BAM format. BAM files take up to 4 times less disk +space and are usually sorted and indexed for fast access to the data they +contain. The index of a given .bam file is named +.bam.bai or .bai file. Some tools require the +index of the bam file to process it.

+
    +
  1. multimapped reads are given a very low mapping quality (below 10). +Remove reads which mapping quality is below 10, sort the sam file and +create a bam file using samtools (Li et al. +2009). samtools view is used to filter data based on mapping +quality and samtools sort is used to sort data based on genomic +coordinates.
  2. +
+
    +
  • -@: number of processors to use
  • +
  • -q: to set a threshold to the mapping quality
  • +
  • -b: to output a BAM file (it is a SAM file by default)
  • +
  • -o: to specify a output file name
  • +
+
## First load samtools
+module load samtools/1.18
+## Then run samtools
+samtools view -@ 2 -q 10 -b FNR_IP_ChIP-seq_Anaerobic_A.sam | samtools sort -@ 2 - -o FNR_IP_ChIP-seq_Anaerobic_A.bam 
+
    +
  1. Create an index for the bam file
  2. +
+
samtools index FNR_IP_ChIP-seq_Anaerobic_A.bam
+
    +
  1. Compress the .sam file (you could also delete the file)
  2. +
+
gzip FNR_IP_ChIP-seq_Anaerobic_A.sam
+
    +
  1. Once it’s done, unload the tools you used
  2. +
+
module unload samtools/1.18 bowtie2/2.5.1
+
+
+

5.5 - Map the second +replicate and the control

+
    +
  1. Repeat the steps above (3 -> 6 - Mapping) for the files +FNR_IP_ChIP-seq_Anaerobic_B.fastq.gz and Anaerobic_INPUT_DNA.fastq.gz in +the directory named “bam” within the directory +02-Mapping.
  2. +
+

Analyze the result of the mapped reads:
+How many reads were mapped for samples Anaerobic_INPUT_DNA and +FNR_IP_ChIP-seq_Anaerobic_B?

+
+
+
+

6 - Estimating the number +of duplicated reads

+

Goal: Duplicated reads, i.e. reads mapped at the same +positions in the genome, are present in ChIP-seq results. They can arise +for several reasons including a biased amplification during the PCR +step of the library prep, DNA fragments coming from repetitive elements +of the genome, sequencing saturation or the same clusters read several +times on the flowcell (i.e. optical duplicates). As analyzing ChIP-Seq +data consists in detecting signal enrichment, we cannot keep duplicated +reads for subsequent analysis. So let’s detect them using Picard (“Picard Tools - By +Broad Institute” n.d.).

+
    +
  1. Go to the directory with alignment files
  2. +
+
cd /shared/projects/<your_project>/EBAII2024_chipseq/02-Mapping/bam
+
    +
  1. Run Picard markDuplicates to mark duplicated reads (= reads mapping +at the exact same location on the genome)
  2. +
+
    +
  • CREATE_INDEX: Create .bai file for the result bam file with marked +duplicate reads
  • +
  • INPUT: input file name to mark for duplicate reads
  • +
  • OUTPUT: output file name
  • +
  • METRICS_FILE: file in which to write the duplicate-marking statistics
  • +
  • VALIDATION_STRINGENCY: Validation stringency for all SAM files read +by picard.
  • +
+
## Load picard
+module load picard/2.23.5
+
+## Run picard
+picard MarkDuplicates \
+-CREATE_INDEX true \
+-INPUT FNR_IP_ChIP-seq_Anaerobic_A.bam \
+-OUTPUT Marked_FNR_IP_ChIP-seq_Anaerobic_A.bam \
+-METRICS_FILE metric
+
+

To determine the number of duplicated reads marked by Picard, we can +run the samtools flagstat command:

+
## Add samtools to your environment
+module load samtools/1.18
+## run samtools
+samtools flagstat Marked_FNR_IP_ChIP-seq_Anaerobic_A.bam
+

Run picard MarkDuplicates on the 2 other samples. How many +duplicates are found in each sample?

+

Go back to the working home directory (i.e. +/shared/projects/<your_project>/EBAII2024_chipseq/)

+
## Unload picard and samtools
+module unload samtools/1.18 picard/2.23.5
+## If you are in 02-Mapping/bam
+cd ../..
+
+
+

7 - ChIP quality +controls

+

Goal: This exercise aims at plotting the +Lorenz curve to assess the quality of the chIP.

+
+

7.1 - Plot the Lorenz +curve with Deeptools

+
    +
  1. Create a directory named 03-ChIPQualityControls in +which to put the ChIP quality control results
  2. +
+
mkdir 03-ChIPQualityControls
+
    +
  1. Go to the newly created directory
  2. +
+
cd 03-ChIPQualityControls
+
    +
  1. Run Deeptools plotFingerprint +(Ramírez et al. 2016) to draw the Lorenz +curve
  2. +
+
    +
  • -b: List of indexed BAM files
  • +
  • -plot: File name of the output figure (extension can be either +“png”, “eps”, “pdf” or “svg”)
  • +
  • --numberOfSamples: how many regions are used to plot the graph
  • +
  • -p: Number of processors to use (2 processors)
  • +
+
## Load deeptools in your environment
+module load deeptools/3.5.4
+## Run deeptools fingerprint
+plotFingerprint \
+  -p 2 \
+  --numberOfSamples 10000 \
+  -b ../02-Mapping/bam/FNR_IP_ChIP-seq_Anaerobic_A.bam \
+     ../02-Mapping/bam/FNR_IP_ChIP-seq_Anaerobic_B.bam \
+     ../02-Mapping/bam/Anaerobic_INPUT_DNA.bam \
+  -plot fingerprint_10000.png
+
    +
  1. If plotFingerprint takes too much time to run, take the file that has +already been prepared for the training.
  2. +
+
cp /shared/home/slegras/2421_m22_bims/slegras/03-ChIPQualityControls/fingerprint.png .
+
    +
  1. Go find the file using the directory tree on the left of the +Jupyterlab panel and click on the fingerprint.png file to display it in +Jupyterlab.
  2. +
+

Look at the result files fingerprint.png (add the plot to +this report). Give an explanation of the curves?

+

Go back to the working home directory (i.e. +/shared/projects/<your_project>/EBAII2024_chipseq)

+
## Unload deepTools
+module unload deeptools/3.5.4
+## If you are in 03-ChIPQualityControls
+cd ..
+
+
+
+

8 - Visualizing the data +in a genome browser

+

Goal: Check whether the IP worked: visualize the +data in their genomic context.

+
+

8.1 - Choosing a genome +browser

+

There are several options for genome browsers, divided between the +local browsers (which you need to install on your computer, eg. IGV) and +the online genome browsers (eg. UCSC genome browser, Ensembl). We often +use both types, depending on the aim and the localization of the data. +If the data are on your computer, to prevent data transfer, it’s easier +to visualize the data locally (IGV). Note that if you’re working on a +non-model organism, the local viewer will be the only choice. If the aim +is to share the results with your collaborators, view many tracks in the +context of many existing annotations, then the online genome browsers +are more suitable.

+
+
+

8.2 - Viewing the raw +alignment data in IGV

+
    +
  1. Download the following files from the server onto your computer
  2. +
+
    +
  • data/Escherichia_coli_K12.fasta
  • +
  • data/Escherichia_coli_K_12_MG1655.annotation.fixed.gtf
  • +
  • 02-Mapping/bam/FNR_IP_ChIP-seq_Anaerobic_A.bam
  • +
  • 02-Mapping/bam/FNR_IP_ChIP-seq_Anaerobic_A.bam.bai
  • +
  • 02-Mapping/bam/FNR_IP_ChIP-seq_Anaerobic_B.bam
  • +
  • 02-Mapping/bam/FNR_IP_ChIP-seq_Anaerobic_B.bam.bai
    +
  • +
  • 02-Mapping/bam/Anaerobic_INPUT_DNA.bam
  • +
  • 02-Mapping/bam/Anaerobic_INPUT_DNA.bam.bai
  • +
+
    +
  1. Open IGV on your computer
  2. +
  3. Load the genome
  4. +
+
    +
  • Genomes / Load Genome from File…
  • +
  • Select the fasta file Escherichia_coli_K12.fasta located into the +data directory
  • +
+
    +
  1. Load an annotation file named +Escherichia_coli_K_12_MG1655.annotation.fixed.gtf into IGV
  2. +
+
    +
  • File / Load from File…
  • +
  • Select the annotation file. The positions of the genes are now +loaded.
  • +
+
    +
  1. Load the three bam files (FNR_IP_ChIP-seq_Anaerobic_A.bam, +FNR_IP_ChIP-seq_Anaerobic_B.bam and Anaerobic_INPUT_DNA.bam) in +IGV.
  2. +
+
    +
  • File / Load from File…
  • +
  • Select the bam files. alt text
  • +
+

Browse around in the genome. Specifically go to the following +genes: pepT (geneID:b1127), ycfP (geneID:b1108). Do you see peaks (add +screenshots to this report).

+

However, looking at BAM file as such does not allow to directly +compare the two samples as data are not normalized. Let’s generate +normalized data for visualization.

+
+
+

8.3 - Viewing scaled +data

+

bamCoverage +from deepTools generates BigWigs out of BAM files. Try it out:

+
## Load deeptools in your environment
+module load deeptools/3.5.4
+## run bamCoverage
+bamCoverage --help
+
    +
  1. Create a directory named 04-Visualization to store +bamCoverage outputs
  2. +
+
mkdir 04-Visualization
+
    +
  1. Go to the newly created directory
  2. +
+
cd 04-Visualization
+

Your directory structure should be like this:

+
/shared/projects/<your_project>/EBAII2024_chipseq
+│
+└───data
+│   
+└───01-QualityControl
+│   
+└───02-Mapping
+|    └───index
+|    └───bam
+│   
+└───03-ChIPQualityControls
+│   
+└───04-Visualization <- you should be in this folder
+
    +
  1. Generate a scaled bigwig file on the IP with bamCoverage
  2. +
+
    +
  • --bam: BAM file to process
  • +
  • --outFileName: output file name
  • +
  • --outFileFormat: output file type
  • +
  • --effectiveGenomeSize: size of the mappable genome
  • +
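  • --normalizeUsing: different overall normalization methods; the command below uses CPM (counts per million mapped reads). The +RPGC method, corresponding to 1x average coverage, is the one that relies on --effectiveGenomeSize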
  • +
  • --skipNonCoveredRegions: skip non-covered regions
  • +
  • --extendReads 200: Extend reads to fragment size
  • +
  • --ignoreDuplicates: reads that have the same orientation and start +position will be considered only once
  • +
+
bamCoverage \
+  --bam ../02-Mapping/bam/Marked_FNR_IP_ChIP-seq_Anaerobic_A.bam \
+  --outFileName FNR_IP_ChIP-seq_Anaerobic_A_nodup.bw \
+  --outFileFormat bigwig \
+  --effectiveGenomeSize 4639675 \
+  --normalizeUsing CPM \
+  --skipNonCoveredRegions \
+  --extendReads 200 \
+  --ignoreDuplicates
+
    +
  1. Do it for the replicate and the control.
  2. +
  3. Download the three bigwig files you have just generated
  4. +
+
    +
  • 04-Visualization/FNR_IP_ChIP-seq_Anaerobic_A_nodup.bw
  • +
  • 04-Visualization/FNR_IP_ChIP-seq_Anaerobic_B_nodup.bw
    +
  • +
  • 04-Visualization/Anaerobic_INPUT_DNA_nodup.bw
  • +
+
    +
  1. Load the three bigwig files in IGV
  2. +
+
    +
  • File / Load from File…
  • +
  • Select the three bigwig files.
  • +
+
    +
  1. Set the visualization of the three bigwig files to be +autoscaled
  2. +
+
    +
  • Click right on the name of the tracks and select +Autoscale
  • +
  • Click right on the name of the tracks and set the windowing function +to Maximum
  • +
+

Go back to the genes we looked at earlier: pepT, ycfP (add +screenshots to this report). Look at the shape of the +signal.
+Keep IGV opened.

+

Go back to the working home directory (i.e. +/shared/projects/<your_project>/EBAII2024_chipseq)

+
## If you are in 04-Visualization
+cd ..
+
+
+
+

9 - Peak calling with +MACS2

+

Goal: Detect the peaks which are regions with high +densities of reads and that correspond to where the studied factor was +bound

+
+

9.1 - Choosing a +peak-calling program

+

There are multiple programs to perform the peak-calling step. Some +are more directed towards histone marks (broad peaks) while others are +specific to transcription factors which present narrow peaks. Here we +will use the callpeak function of MACS2 (version 2.2.7.1) because it’s +known to produce generally good results, and it is well-maintained by +the developer.

+
+
+

9.2 - Calling the +peaks

+
    +
  1. Create a directory named 05-PeakCalling and one +directory named replicates within to store peaks +coordinates.
  2. +
+
mkdir 05-PeakCalling
+mkdir 05-PeakCalling/replicates
+
    +
  1. Go to the newly created directory replicates
  2. +
+
cd 05-PeakCalling/replicates
+
    +
  1. Try out MACS2
  2. +
+
## Load macs2 in your environment
+module load macs2/2.2.7.1
+macs2 callpeak --help
+

This prints the help of the program.

+
    +
  1. Let’s see the parameters of MACS before launching the peak calling:
  2. +
+
    +
  • ChIP-seq tag file (-t) is the name of our experiment (treatment) +mapped read file FNR_IP_ChIP-seq_Anaerobic_A.bam
  • +
  • ChIP-seq control file (-c) is the name of our input (control) mapped +read file Anaerobic_INPUT_DNA.bam
  • +
  • --format BAM indicates the input files are in BAM format. Other +formats can be specified (SAM,BED…)
  • +
  • --gsize Effective genome size: this is the size of the genome +considered “usable” for peak calling. This value is given by the MACS +developers on their website. It is smaller than the complete genome +because many regions are excluded (telomeres, highly repeated regions…). +The default value is for human (2700000000.0), so we need to change it. +As the value for E. coli is not provided, we will take the complete +genome size 4639675.
  • +
  • --name provides a prefix for the output files. We set this to +FNR_Anaerobic_A, but it could be any name.
  • +
  • –bw The bandwidth is the size of the fragment extracted from the gel +electrophoresis or expected from sonication. By default, this value is +300bp. Usually, this value is indicated in the Methods section of +publications. In the studied publication, a sentence mentions “400bp +fragments (FNR libraries)”. We thus set this value to 400.
  • +
  • –fix-bimodal indicates that in the case where macs2 cannot find +enough paired peaks between the plus strand and minus strand to build +the shifting model, it can bypass this step and use an extension size of +200bp by default.
  • +
  • -p 1e-2 indicates that we report the peaks if their associated +p-value is lower than 1e-2. This is a relaxed threshold as we want to +keep a high number of false positives in our peak set to later compute +the IDR analysis.
  • +
  • &> repA_MACS.out will output the verbosity (=information) in the +file repA_MACS.out
  • +
+
macs2 callpeak \
+  -t ../../02-Mapping/bam/FNR_IP_ChIP-seq_Anaerobic_A.bam \
+  -c ../../02-Mapping/bam/Anaerobic_INPUT_DNA.bam \
+  --format BAM \
+  --gsize 4639675 \
+  --name 'FNR_Anaerobic_A' \
+  --bw 400 \
+  --fix-bimodal \
+  -p 1e-2 \
+  &> repA_MACS.out
+
    +
  1. Run macs2 for replicate A and replicate B.

  2. +
  3. In a new directory called pool, run macs2 for the pooled +replicates A and B by giving both bam files as input treatment files +(-t).

  4. +
+
# Go back up to 05-PeakCalling (you should currently be in 05-PeakCalling/replicates)
+cd ..
+mkdir pool
+cd pool
+
+# Run macs2 for pooled replicates
+macs2 callpeak \
+  -t ../../02-Mapping/bam/FNR_IP_ChIP-seq_Anaerobic_A.bam \
+     ../../02-Mapping/bam/FNR_IP_ChIP-seq_Anaerobic_B.bam \
+  -c ../../02-Mapping/bam/Anaerobic_INPUT_DNA.bam \
+  --format BAM \
+  --gsize 4639675 \
+  --name 'FNR_Anaerobic_pool' \
+  --bw 400 \
+  --fix-bimodal \
+  -p 1e-2 \
+  &> pool_MACS.out
+
+
+

9.3 - Analyzing MACS +results

+

Look at the files that were created by MACS. Explain the +content of the result files ?
+How many peaks were detected by MACS2 for each sample and in the +pool of samples ?

+
+
+

9.4 - Calling peaks in a +replicate-aware method (IDR)

+

In order to take advantage of having biological replicates, we will +create a combined set of peaks based on the reproducibility of each +individual replicate peak calling. We will use the +Irreproducible Discovery Rate (IDR) algorithm.

+
    +
  1. Create a new directory to store the peak coordinates resulting after +idr analysis
  2. +
+
## Go back up to 05-PeakCalling (you should currently be in 05-PeakCalling/pool)
+cd ..
+mkdir idr
+cd idr
+

Your directory structure should be like this:

+
/shared/projects/<your_project>/EBAII2024_chipseq
+│
+└───data
+│   
+└───01-QualityControl
+│   
+└───02-Mapping
+|    └───index
+|    └───bam
+│   
+└───03-ChIPQualityControls
+│   
+└───04-Visualization
+|
+└───05-PeakCalling
+|    └───replicates
+|    └───pool
+|    └───idr <- you should be in this folder
+
    +
  1. Load the module idr and have a look at its parameters
  2. +
+
## Load idr in your environment
+module load idr/2.0.4.2
+idr --help
+
    +
  • –samples : peak files of each individual replicate
  • +
  • –peak-list : the peak file of the pooled replicates, it will be used +as a master peak set to compare with the regions from each +replicate
  • +
  • –input-file-type : format of the peak file, in our case it is +narrowPeak
  • +
  • –output-file : name of the result file
  • +
  • –plot : produce additional diagnostic plots
  • +
+
    +
  1. Run idr
  2. +
+
idr \
+  --samples ../replicates/FNR_Anaerobic_A_peaks.narrowPeak \
+            ../replicates/FNR_Anaerobic_B_peaks.narrowPeak \
+  --peak-list ../pool/FNR_Anaerobic_pool_peaks.narrowPeak \
+  --input-file-type narrowPeak \
+  --output-file FNR_anaerobic_idr_peaks.bed \
+  --plot
+

Add the IDR graph to this report. How many peaks are found +with the IDR method?

+
    +
  1. Remove IDR and MACS2 from your environment and go back to working +home directory (i.e +/shared/projects/<your_project>/EBAII2024_chipseq)
  2. +
+
module unload macs2/2.2.7.1
+module unload idr/2.0.4.2
+
+## If you are in 05-PeakCalling/idr
+cd ../..
+
+
+

9.5 - Visualize peaks +into IGV

+
    +
  1. Download the following BED files from the server into your computer +to visualise in IGV :
  2. +
+
    +
  • 05-PeakCalling/replicates/FNR_Anaerobic_A_peaks.narrowPeak
  • +
  • 05-PeakCalling/replicates/FNR_Anaerobic_B_peaks.narrowPeak
  • +
  • 05-PeakCalling/pool/FNR_Anaerobic_pool_peaks.narrowPeak
  • +
  • 05-PeakCalling/idr/FNR_anaerobic_idr_peaks.bed
  • +
+

Go back again to the genes we looked at earlier: pepT, ycfP. +Do you see peaks (add the 2 screenshots to this report)? +Navigate through the genome to find peaks detected in the +replicates (peak calling per replicate) and not found/kept with the IDR +method

+

From now on, the peak set we keep is the IDR peak +set.

+
+
+
+

10 - Motif analysis

+

Goal: Define binding motif(s) for the ChIPed +transcription factor and identify potential cofactors

+
+

10.1 - Retrieve the peak +sequences corresponding to the peak coordinate file (BED)

+

For the motif analysis, you first need to extract the sequences +corresponding to the peaks. There are several ways to do this (as +usual…). If you work on a UCSC-supported organism, the easiest is to use +RSAT fetch-sequences or Galaxy. Here, we will use Bedtools (Quinlan and Hall 2010), as we have the genome +of interest on our computer (Escherichia_coli_K12.fasta). 1. Create a +directory named 06-MotifAnalysis to store data needed +for motif analysis

+
mkdir 06-MotifAnalysis
+
    +
  1. Go to the newly created directory
  2. +
+
cd 06-MotifAnalysis
+

Your directory structure should be like this:

+
/shared/projects/<your_project>/EBAII2024_chipseq
+│
+└───data
+│   
+└───01-QualityControl
+│   
+└───02-Mapping
+|    └───index
+|    └───bam
+│   
+└───03-ChIPQualityControls
+│   
+└───04-Visualization
+│   
+└───05-PeakCalling
+│   
+└───06-MotifAnalysis <- you should be in this folder
+
    +
  1. Extract peak sequence in fasta format
  2. +
+
## First load samtools
+module load samtools/1.18
+## Create an index of the genome fasta file
+samtools faidx ../data/Escherichia_coli_K12.fasta
+
+## First load bedtools
+module load bedtools/2.30.0
+## Extract fasta sequence from genomic coordinate of peaks
+bedtools getfasta \
+  -fi ../data/Escherichia_coli_K12.fasta \
+  -bed ../05-PeakCalling/idr/FNR_anaerobic_idr_peaks.bed \
+  -fo FNR_anaerobic_idr_peaks.fa
+
    +
  1. Download the file FNR_anaerobic_idr_peaks.fa on your computer
  2. +
+
+
+

10.2 - Motif discovery +with RSAT

+
    +
  1. Open a connection to a Regulatory Sequence Analysis Tools server. +You can choose between various website mirrors.
  2. +
+ +
    +
  1. In the left menu, click on NGS ChIP-seq and then +click on peak-motifs. A new page opens, with a +form
  2. +
  3. The default peak-motifs web form only displays the essential +options. There are only two mandatory parameters.
  4. +
+
    +
  • The title box, which you will set as FNR +Anaerobic . The sequences, that you will +upload from your computer, by clicking on the button +Choose file, and select the file +FNR_anaerobic_idr_peaks.fa from your computer.
  • +
+
    +
  1. We will now modify some of the advanced options in order to +fine-tune the analysis according to your data set.
  2. +
+
    +
  • Open the “Reduce peak sequences” title, and make sure the +Cut peak sequences: +/- option is set to +0 (we wish to analyze our full dataset)
  • +
  • Open the “Motif Discovery parameters” title, and check the +oligomer sizes 6 and 7 (but not 8). Check “Discover +over-represented spaced word pairs +[dyad-analysis]”
  • +
  • Under “Compare discovered motifs with databases”, add +RegulonDB prokaryotes (2015_08) as the studied organism is the +bacteria E. coli.
  • +
+
    +
  1. Click “GO”.
  2. +
  3. The Web page displays a link. You can already click on this link. +The report will be progressively updated during the processing of the +workflow.
  4. +
+

Is there anything interesting in RSAT results? If so, which +motif is of interest and why (add screenshot of the +results).

+
+
+
+

11 - Peak annotation

+

Goals: Associate ChIP-seq peaks to genomic features, +identify closest genes and run ontology analyses

+
    +
  1. Create a directory named 07-PeakAnnotation
  2. +
+
# go back to the working home directory if needed
+cd ..
+
+mkdir 07-PeakAnnotation
+
    +
  1. Go to the newly created directory
  2. +
+
cd 07-PeakAnnotation
+
+

11.1 - Associate peaks to +closest genes

+

annotatePeaks.pl +from the Homer suite (Heinz et al. 2010) +associates peaks with nearby genes.

+
    +
  1. Create a file suitable for annotatePeaks.pl. To run, the tool needs a +peak bed file composed of 6 fields (chr, start, end, name, score, +strand). The first 5 columns of the file +../05-PeakCalling/idr/FNR_anaerobic_idr_peaks.bed are good but all other +columns are of no use and the strand is missing. To generate a file with +a correct format, we are using the tool cut to select fields 1 to 5 of +the peak file and we add a “+” to every line using awk (this is a code +example that can do what we want, not the only solution to do +so).
  2. +
+
cut \
+  -f 1-5 \
+  ../05-PeakCalling/idr/FNR_anaerobic_idr_peaks.bed | \
+  awk -F "\t" '{print $0"\t+"}' \
+  > FNR_anaerobic_idr_peaks.bed
+
    +
  1. Try annotatePeaks.pl
  2. +
+
## First load homer
+module load homer/4.11
+
+## run Homer annotatePeaks
+annotatePeaks.pl --help
+

Let’s see the parameters:

+

annotatePeaks.pl peak/BEDfile genome > outputfile User defined +annotation files (default is UCSC refGene annotation): annotatePeaks.pl +accepts GTF (gene transfer formatted) files to annotate positions +relative to custom annotations, such as those from de novo transcript +discovery or Gencode.

+
    -gtf <gtf format file> (Use -gff and -gff3 if appropriate, but GTF is better)
+
    +
  1. Annotate peaks with nearby genes with Homer
  2. +
+
annotatePeaks.pl \
+  FNR_anaerobic_idr_peaks.bed \
+  ../data/Escherichia_coli_K12.fasta \
+  -gtf ../data/Escherichia_coli_K_12_MG1655.annotation.fixed.gtf \
+  > FNR_anaerobic_idr_annotated_peaks.tsv
+

Look at the file you generated. Gene symbols are not present. +Let’s add them with some R code.

+
    +
  1. Launch Rstudio in ondemand +alt text

  2. +
  3. Add gene symbol annotation using R with Rstudio

  4. +
+
## set working directory
+setwd("/shared/projects/<your_project>/EBAII2024_chipseq/07-PeakAnnotation")
+## Or navigate using the "Files" tab and click on "More">"Set as Working Directory"
+
+## read the file with peaks annotated with homer
+## data are loaded into a data frame
+## sep="\t": this is a tab separated file
+## header=TRUE: there is a line with headers (ie. column names)
+d <- read.table("FNR_anaerobic_idr_annotated_peaks.tsv", sep="\t", header=TRUE)
+
+## Load a 2-columns files which contains in the first column gene IDs
+## and in the second column gene symbols
+## data are loaded into a data frame
+## header=FALSE: there is no header line
+gene.symbol <- read.table("../data/Escherichia_coli_K_12_MG1655.annotation.tsv.gz", header=FALSE)
+
+## Merge the 2 data frames based on a common field
+## by.x gives the columns name in which the common field is for the d data frame
+## by.y gives the columns name in which the common field is for the gene.symbol data frame
+## d contains several columns with no information. We select only interesting columns
+d.annot <- merge(d[,c(1,2,3,4,5,6,8,10,11)], gene.symbol, by.x="Nearest.PromoterID", by.y="V1")
+
+## Change column names of the resulting data frame
+colnames(d.annot)[2] <- "PeakID"  # name the 2d column of the new file "PeakID"
+colnames(d.annot)[dim(d.annot)[2]] <- "Gene.Symbol"
+
+## output the merged data frame to a file named "FNR_anaerobic_idr_final_peaks_annotation.tsv"
+## col.names=TRUE: output column names
+## row.names=FALSE: don't output row names
+## sep="\t": table fields are separated by tabs
+## quote=FALSE: don't put quote around text.
+write.table(d.annot, "FNR_anaerobic_idr_final_peaks_annotation.tsv", col.names=TRUE, row.names=FALSE, sep="\t", quote=FALSE)
+

What information is listed in each column of the file? (print +column names and explain them)

+

How many genes are associated to the “promoter-TSS” +feature?

+

What are all the possible gene features? (see in column +Annotation - extract information like promoter-TSS, TSS, …). Create a +plot (pie chart, barplot…) showing the proportion of each of them +(include both the plot and the code that created it in the +report).

+
    +
  1. Go back to working home directory (i.e +/shared/projects/<your_project>/EBAII2024_chipseq)
  2. +
+
## If you are in 07-PeakAnnotation
+cd ..
+
+
+

11.2 - Search for +Biological Processes, Molecular Functions or Cellular Compartments +enrichment

+

Use Official gene symbols of the file +FNR_anaerobic_idr_final_peaks_annotation.tsv to search for enriched gene +ontologies with the tool DAVID (Database for Annotation, Visualization +and Integrated Discovery). Input your gene list on the DAVID website: https://david.ncifcrf.gov/. Use DAVID convert ID +tool if needed

+

Are there biological processes enriched in the list of genes +associated to the peaks? Show the top results of the Functional +Annotation Clustering. Are these genes enriched in some +KEGG pathway? Which ones?

+
+
+
+

12 - Bonus: Annotation of +ChIP-peaks using R tools

+

In this part, we will use a different set of peaks obtained +using a peak caller from a set of p300 ChIP-seq experiments in different +mouse embryonic tissues (midbrain, forebrain and limb).

+
+

12.1 - Obtain the bed +files from GEO

+
    +
  1. We will download the already called peak files in bed format from +GEO. Create a new folder and go in it.
  2. +
+
cd /shared/projects/<your_project>/EBAII2024_chipseq
+mkdir 07-PeakAnnotation-bonus
+cd 07-PeakAnnotation-bonus
+
    +
  1. Search for the dataset GSE13845 either using Google +or from the front page of GEO
  2. +
  3. On the description page, find the three GSM files, and click on each +of them
  4. +
  5. On each page, select and download the +GSMxxxxx_p300_peaks.txt.gz file to the newly created folder +(where xxxxx represents the GSM number) You should now have +downloaded 3 files: > GSM348064_p300_peaks.txt.gz (Forebrain) > +GSM348065_p300_peaks.txt.gz (Midbrain) > GSM348066_p300_peaks.txt.gz +(limb)
  6. +
+

Beware: Make sure to check which genome version was used to call +the peaks (remember: this is mouse data!)

+
+
+

12.2 - Performing a first +evaluation of peak sets using R

+

Now, we will use RStudio to perform the rest of the +analysis in R. For the analysis, we will need some R/Bioconductor +libraries

+ +
    +
  1. Go to Rstudio and execute the R code below (show results in the +report)
  2. +
+
# load the required libraries
+library(RColorBrewer)
+library(ChIPseeker)
+library(TxDb.Mmusculus.UCSC.mm9.knownGene)
+library(org.Mm.eg.db)
+# define the annotation of the mouse genome
+txdb = TxDb.Mmusculus.UCSC.mm9.knownGene
+# define colors
+col = brewer.pal(9,'Set1')
+
    +
  1. read the peak files for the three datasets:
  2. +
+
# set the working directory to the folder in which the peaks are stored
+setwd("/shared/projects/<your_project>/EBAII2024_chipseq/07-PeakAnnotation-bonus")
+# read the peaks for each dataset
+peaks.forebrain = readPeakFile('GSM348064_p300_peaks.txt.gz')
+peaks.midbrain = readPeakFile('GSM348065_p300_peaks.txt.gz')
+peaks.limb = readPeakFile('GSM348066_p300_peaks.txt.gz')
+
# create a list containing all the peak sets
+all.peaks = list(forebrain=peaks.forebrain,
+midbrain=peaks.midbrain,
+limb=peaks.limb)
+

The peaks are stored as GenomicRanges object; this +is an R format which looks like the bed format, but is optimized in terms +of memory requirements and speed of execution.

+

We can start by computing some basic statistics on the peak sets.

+
+

12.2.1 - How many +peaks?

+
# check the number of peaks for the forebrain dataset
+length(peaks.forebrain)
+
## [1] 2453
+
# compute the number of peaks for all datasets using the list object
+sapply(all.peaks,length)
+
## forebrain  midbrain      limb 
+##      2453       561      2105
+
# display this as a barplot
+barplot(sapply(all.peaks,length),col=col)
+

+
+
+

12.2.2 - How large are +these peaks?

+
# statistics on the peak length for forebrain
+summary(width(peaks.forebrain))
+
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##   276.0   551.0   751.0   815.9  1001.0  2701.0
+
# size distribution of the peaks
+peaks.width = lapply(all.peaks,width)
+lapply(peaks.width,summary)
+
## $forebrain
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##   276.0   551.0   751.0   815.9  1001.0  2701.0 
+## 
+## $midbrain
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##     276     526     676     717     876    2126 
+## 
+## $limb
+##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##   276.0   476.0   601.0   682.6   826.0  2301.0
+
# boxplot of the sizes
+boxplot(peaks.width,col=col)
+

+
+
+

12.2.3 - What is the +score of these peaks?

+

Can you adapt the previous code to display a boxplot of the peak +score distribution for the Forebrain peak set (column +Maximum.Peak.Height)?

+
+
+

12.2.4 - Where are the +peaks located?

+

We can now display the genomic distribution of the peaks along the +chromosomes, including the peak scores, using the covplot +function from ChIPSeeker:

+
# genome wide distribution
+covplot(peaks.forebrain, weightCol="Maximum.Peak.Height")
+

+

Exercise: use the option “lower” in covplot to display only +the peaks with a score (Maximum.Peak.Height) above 10

+
+
+

12.2.5 - How does the +signal look like at TSS?

+

In addition to the genome wide plot, we can check if there is a +tendency for the peaks to be located close to gene promoters.

+
# define gene promoters
+promoter = getPromoters(TxDb=txdb, upstream=5000, downstream=5000)
+
+# compute the density of peaks within the promoter regions
+tagMatrix = getTagMatrix(peaks.limb, windows=promoter)
+
## >> preparing start_site regions by gene... 2024-11-19 09:52:26
+## >> preparing tag matrix...  2024-11-19 09:52:26
+
# plot the density
+tagHeatmap(tagMatrix, palette = "RdYlBu")
+

+
+
+
+

12.3 - Functional +annotation of the peaks

+

We can now assign the peaks to the closest genes and genomic +compartments (introns, exons, promoters, distal regions, etc…) This is +done using the function annotatePeak which compares the +peak files with the annotation file of the mouse genome. This function +returns a complex object which contains all this information.

+
peakAnno.forebrain = annotatePeak(peaks.forebrain, tssRegion=c(-3000, 3000), TxDb=txdb, annoDb="org.Mm.eg.db")
+
## >> preparing features information...      2024-11-19 09:52:47 
+## >> identifying nearest features...        2024-11-19 09:52:47 
+## >> calculating distance from peak to TSS...   2024-11-19 09:52:47 
+## >> assigning genomic annotation...        2024-11-19 09:52:47 
+## >> adding gene annotation...          2024-11-19 09:52:52
+
## 'select()' returned 1:many mapping between keys and columns
+
## >> assigning chromosome lengths           2024-11-19 09:52:52 
+## >> done...                    2024-11-19 09:52:52
+
peakAnno.midbrain = annotatePeak(peaks.midbrain, tssRegion=c(-3000, 3000), TxDb=txdb, annoDb="org.Mm.eg.db")
+
## >> preparing features information...      2024-11-19 09:52:52 
+## >> identifying nearest features...        2024-11-19 09:52:52 
+## >> calculating distance from peak to TSS...   2024-11-19 09:52:52 
+## >> assigning genomic annotation...        2024-11-19 09:52:52 
+## >> adding gene annotation...          2024-11-19 09:52:52
+
## 'select()' returned 1:1 mapping between keys and columns
+
## >> assigning chromosome lengths           2024-11-19 09:52:52 
+## >> done...                    2024-11-19 09:52:52
+
peakAnno.limb = annotatePeak(peaks.limb, tssRegion=c(-3000, 3000), TxDb=txdb, annoDb="org.Mm.eg.db")
+
## >> preparing features information...      2024-11-19 09:52:52 
+## >> identifying nearest features...        2024-11-19 09:52:52 
+## >> calculating distance from peak to TSS...   2024-11-19 09:52:52 
+## >> assigning genomic annotation...        2024-11-19 09:52:52 
+## >> adding gene annotation...          2024-11-19 09:52:53
+
## 'select()' returned 1:many mapping between keys and columns
+
## >> assigning chromosome lengths           2024-11-19 09:52:53 
+## >> done...                    2024-11-19 09:52:53
+
+

12.3.1 - genomic +localization

+

We can now analyze in more detail the localization of the peaks +(introns, exons, promoters, distal regions,…)

+
# distribution of genomic compartments for forebrain peaks
+plotAnnoPie(peakAnno.forebrain)
+

+
# for all the peaks
+plotAnnoBar(list(forebrain=peakAnno.forebrain, midbrain=peakAnno.midbrain,limb=peakAnno.limb))
+

+

Question: do you see differences between the three peak +sets?

+
+
+

12.3.2 - functional +annotation

+

An important step in ChIP-seq analysis is to interpret genes that are +located close to the ChIP peaks. Hence, we need to 1. assign genes to +peaks 2. compute functional enrichments of the target genes.

+

Beware: By doing so, we assume that the target gene +of the peak is always the closest one. Hi-C/4C analyses have shown that +in higher eukaryotes, this is not always the case. However, in the +absence of data on the real target gene of ChIP-peaks, we can work with +this approximation.

+

We will compute the enrichment of the Gene Ontology “Biological +Process” categories in the set of putative target genes.

+
# load the library
+library(clusterProfiler)
+
# define the list of all mouse genes as a universe for the enrichment analysis
+universe = mappedkeys(org.Mm.egACCNUM)
+
+## extract the gene IDs of the forebrain target genes
+genes.forebrain = peakAnno.forebrain@anno$geneId
+ego.forebrain = enrichGO(gene          = genes.forebrain,
+                universe      = universe,
+                OrgDb         = org.Mm.eg.db,
+                ont           = "BP",
+                pAdjustMethod = "BH",
+                pvalueCutoff  = 0.01,
+                qvalueCutoff  = 0.05,
+        readable      = TRUE)
+
+# display the results as barplots        
+barplot(ego.forebrain,showCategory=10)
+

+

Question: do you see an enrichment of the expected +categories? What does the x-axis mean? What does the color +mean?

+

Exercise: redo this analysis for the limb dataset +and check if the enriched categories make sense.

+
+
+
+

12.4 FAQ

+
+

12.4.1 How to download +the data

+

Goal: Identify the datasets corresponding to the +studied article and retrieve the data (reads as FASTQ files) +corresponding to 2 replicates of a condition and the corresponding +control.

+
+

12.4.1.1 - Obtaining an +identifier for a chosen dataset

+

NGS datasets are (usually) made freely accessible for other +scientists, by depositing these datasets into specialized databanks. Sequence Read Archive (SRA) +located in USA hosted by NCBI, and its European equivalent European Nucleotide Archive (ENA) +located in England hosted by EBI both contain raw +reads.

+

Functional genomic datasets (transcriptomics, genome-wide binding +such as ChIP-seq,…) are deposited in the databases Gene Expression Omnibus +(GEO) or its European equivalent ArrayExpress.

+

Within an article of interest, search for a sentence mentioning the +deposition of the data in a database. Here, the following sentence can +be found at the end of the Materials and Methods section: “All +genome-wide data from this publication have been deposited in NCBI’s +Gene Expression Omnibus (GSE41195).” We will thus +use the GSE41195 identifier to retrieve the dataset +from the NCBI GEO (Gene Expression Omnibus) +database.

+
+
+

12.4.1.2 - Accessing +GSE41195 from GEO

+
    +
  1. The GEO database hosts processed data files and many details related +to the experiments. SRA (Sequence Read Archive) stores the actual raw +sequence data.
  2. +
  3. Search in Google GSE41195. Click on the first link +to directly access the correct page on the GEO database. alt text
  4. +
  5. This GEO entry is a mixture of expression analysis (Nimblegen Gene +Expression Array), chip-chip and chip-seq. At the bottom of the page, +click on the subseries related to the chip-seq datasets. (this subseries +has its own identifier: GSE41187). alt text
  6. +
  7. From this page, we will focus on the experiment FNR IP +ChIP-seq Anaerobic A. At the bottom of the page, click on the +link “GSM1010219 - FNR IP ChIP-seq Anaerobic A”.
  8. +
  9. In the new page, go to the bottom to find the SRA identifier. This +is the identifier of the raw dataset stored in the SRA database.
    +alt text
  10. +
  11. Copy the identifier SRX189773 (do not click on the +link that would take you to the SRA database, see below why)
  12. +
+
+
+

12.4.1.3 - Downloading +FASTQ file from the ENA database

+

Although direct access to the SRA database at the NCBI is doable, SRA +does not store sequences in a FASTQ format. So, in practice, it’s +simpler (and quicker!!) to download datasets from the ENA database +(European Nucleotide Archive) hosted by EBI (European Bioinformatics +Institute) in UK. ENA encompasses the data from SRA.

+
    +
  1. Go to the EBI website. Paste +your SRA identifier (SRX189773) and click on the button “search”. alt text
  2. +
  3. Click on the first result. On the next page, there is a link to the +FASTQ file. For efficiency, this file has already been downloaded and is +available in the “data” folder +(FNR_IP_ChIP-seq_Anaerobic_A.fastq.gz)
    +alt text
  4. +
+

tip: To download the replicate and control datasets, +we should redo the same steps starting from the GEO web page specific to +the chip-seq datasets (see step 2.4) and choose FNR IP ChIP-seq +Anaerobic B and anaerobic INPUT DNA. +Downloaded FASTQ files are available in the data folder +(FNR_IP_ChIP-seq_Anaerobic_B.fastq.gz and Anaerobic_INPUT_DNA.fastq.gz +respectively)

+

At this point, you have three FASTQ files, two IPs, one +control (INPUT).

+
+
+
+

12.4.2 How to extract +peaks from the supplementary data of a publication ?

+

The processed peaks (BED file) are sometimes available on the GEO +website, or in supplementary data. Unfortunately, most of the time, the +peak coordinates are embedded into supplementary tables and thus not +usable “as is”. This is the case for the studied article. To be able to +use these peaks (visualize them in a genome browser, compare them with +the peaks found with another program, perform downstream analyses…), you +will need to (re)-create a BED file from the information available. +Here, Table S5 provides the coordinates of the summit of the peaks. The +coordinates are for the same assembly as we used.

+
    +
  1. copy/paste the first column into a new file, and save it as +retained_peaks.txt
  2. +
  3. use a PERL command (or awk if you know this language) to create a +BED-formatted file. As we need start and end coordinates, we will +arbitrarily take +/-50bp around the summit.
  4. +
+
perl -lane 'print "gi|49175990|ref|NC_000913.2|\t".($F[0]-50)."\t".($F[0]+50)."\t" ' retained_peaks.txt > retained_peaks.bed
+
    +
  1. The BED file looks like this: > gi|49175990|ref|NC_000913.2| 120 +220 > gi|49175990|ref|NC_000913.2| 20536 20636 > +gi|49175990|ref|NC_000913.2| 29565 29665 > +gi|49175990|ref|NC_000913.2| 34015 34115
  2. +
  3. Depending on the available information, the command will be +different.
  4. +
+
+
+

12.4.3 - How to obtain +the annotation (=Gene) GTF file for IGV?

+

Annotation files can be found on genome websites, NCBI FTP server, +Ensembl, … However, IGV requires GFF format, or BED format, which are +often not directly available. Here, I downloaded the annotation from the +UCSC +Table browser as “Escherichia_coli_K_12_MG1655.annotation.gtf”. +Then, I changed the “chr” to the name of our genome with the following +PERL command:

+
perl -pe 's/^chr/gi\|49175990\|ref\|NC_000913.2\|/' Escherichia_coli_K_12_MG1655.annotation.gtf > Escherichia_coli_K_12_MG1655.annotation.fixed.gtf
+

This file will work directly in IGV

+
+
+
+
+

References

+
+
+Heinz, Sven, Christopher Benner, Nathanael Spann, Eric Bertolino, Yin C. +Lin, Peter Laslo, Jason X. Cheng, Cornelis Murre, Harinder Singh, and +Christopher K. Glass. 2010. “Simple Combinations of +Lineage-Determining Transcription Factors Prime Cis-Regulatory Elements +Required for Macrophage and b Cell Identities.” Molecular +Cell 38 (4): 576–89. https://doi.org/10.1016/j.molcel.2010.05.004. +
+
+Langmead, Ben, and Steven L Salzberg. 2012. “Fast Gapped-Read +Alignment with Bowtie 2.” Nature Methods 9 +(4): 357–59. https://doi.org/10.1038/nmeth.1923. +
+
+Langmead, Ben, Cole Trapnell, Mihai Pop, and Steven L. Salzberg. 2009. +“Ultrafast and Memory-Efficient Alignment of Short +DNA Sequences to the Human Genome.” Genome +Biology 10 (3): R25. https://doi.org/10.1186/gb-2009-10-3-r25. +
+
+Li, Heng, Bob Handsaker, Alec Wysoker, Tim Fennell, Jue Ruan, Nils +Homer, Gabor Marth, Goncalo Abecasis, and Richard Durbin. 2009. +“The Sequence Alignment/Map +Format and SAMtools.” Bioinformatics 25 +(16): 2078–79. https://doi.org/10.1093/bioinformatics/btp352. +
+
+“Picard Tools - By Broad +Institute.” n.d. Accessed August 26, 2016. http://broadinstitute.github.io/picard/. +
+
+Quinlan, Aaron R, and Ira M Hall. 2010. “BEDTools: A Flexible +Suite of Utilities for Comparing Genomic Features.” +Bioinformatics 26 (6): 841–42. https://doi.org/10.1093/bioinformatics/btq033. +
+
+Ramírez, Fidel, Devon P. Ryan, Björn Grüning, Vivek Bhardwaj, Fabian +Kilpert, Andreas S. Richter, Steffen Heyne, Friederike Dündar, and +Thomas Manke. 2016. “deepTools2: A +Next Generation Web Server for Deep-Sequencing Data Analysis.” +Nucleic Acids Research 44 (W1): W160–65. https://doi.org/10.1093/nar/gkw257. +
+
+Yu, Guangchuang, Li-Gen Wang, Yanyan Han, and Qing-Yu He. 2012. +“clusterProfiler: An r Package for Comparing Biological Themes +Among Gene Clusters.” OMICS: A Journal of Integrative +Biology 16 (5): 284–87. https://doi.org/10.1089/omi.2011.0118. +
+
+Yu, Guangchuang, Li-Gen Wang, and Qing-Yu He. 2015. “ChIPseeker: +An r/Bioconductor Package for ChIP Peak Annotation, Comparison and +Visualization.” Bioinformatics 31 (14): 2382–83. https://doi.org/10.1093/bioinformatics/btv145. +
+
+
+ + + +
+
+ +
+ + + + + + + + + + + + + + + + diff --git a/2024/ebaiin1/chip-seq/hands-on/references.bib b/2024/ebaiin1/chip-seq/hands-on/references.bib new file mode 100755 index 0000000..544c3f5 --- /dev/null +++ b/2024/ebaiin1/chip-seq/hands-on/references.bib @@ -0,0 +1,746 @@ +@Article{AND2010, +author = {Anders and Huber}, +title = {Differential expression analysis for sequence count data}, +journal = {Genome Biology}, +year = {2010}, +volume = {11} +} + +@Article{BEN1995, +author = {Benjamini and Hochberg}, +title = {Controlling the false discovery rate: a practical and powerful approach to multiple testing}, +journal = {Journal of the Royal Statistical Society}, +year = {1995}, +volume = {57}, +pages = {289-300} +} + +@Article{LAN2009, +author = {Langmead and Trapnell and Pop and Salzberg}, +title = {Ultrafast and memory-efficient alignment of short DNA sequences to the human genome}, +journal = {Genome Biology}, +year = {2009}, +volume = {10} +} + +@Article{TRA2009, +author = {Trapnell and Pachter and Salzberg}, +title = {TopHat: discovering splice junctions with RNA-Seq}, +journal = {Bioinformatics}, +year = {2009}, +volume = {25}, +pages = {1105-1111} +} + + +@article{Schulze2012, + Abstract = {Assessing the reliability of experimental replicates (or global alterations corresponding to different experimental conditions) is a critical step in analyzing RNA-Seq data. Pearson's correlation coefficient r has been widely used in the RNA-Seq field even though its statistical characteristics may be poorly suited to the task.}, + Author = {Schulze, Stefan K. and Kanwar, Rahul and G{\"o}lzenleuchter, Meike and Therneau, Terry M. 
and Beutler, Andreas S.}, + Doi = {10.1186/1471-2164-13-524}, + Issn = {1471-2164}, + Journal = {BMC Genomics}, + Number = {1}, + Pages = {524}, + Title = {SERE: Single-parameter quality control and sample comparison for RNA-Seq}, + Url = {http://dx.doi.org/10.1186/1471-2164-13-524}, + Volume = {13}, + Year = {2012}, + Bdsk-Url-1 = {http://dx.doi.org/10.1186/1471-2164-13-524}} + + +@article{Love2014, + Abstract = {In comparative high-throughput sequencing assays, a fundamental task is the analysis of count data, such as read counts per gene in RNA-seq, for evidence of systematic changes across experimental conditions. Small replicate numbers, discreteness, large dynamic range and the presence of outliers require a suitable statistical approach. We present DESeq2, a method for differential analysis of count data, using shrinkage estimation for dispersions and fold changes to improve stability and interpretability of estimates. This enables a more quantitative analysis focused on the strength rather than the mere presence of differential expression. The DESeq2 package is available at http://www.bioconductor.org/packages/release/bioc/html/DESeq2.html .}, + Author = {Love, Michael I. and Huber, Wolfgang and Anders, Simon}, + Doi = {10.1186/s13059-014-0550-8}, + Issn = {1474-760X}, + Journal = {Genome Biology}, + Number = {12}, + Pages = {550}, + Title = {Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2}, + Url = {http://dx.doi.org/10.1186/s13059-014-0550-8}, + Volume = {15}, + Year = {2014}, + Bdsk-Url-1 = {http://dx.doi.org/10.1186/s13059-014-0550-8}} + + +@article{Anders2015, + Abstract = {Motivation: A large choice of tools exists for many standard tasks in the analysis of high-throughput sequencing (HTS) data. However, once a project deviates from standard workflows, custom scripts are needed. Results: We present HTSeq, a Python library to facilitate the rapid development of such scripts. 
HTSeq offers parsers for many common data formats in HTS projects, as well as classes to represent data, such as genomic coordinates, sequences, sequencing reads, alignments, gene model information and variant calls, and provides data structures that allow for querying via genomic coordinates. We also present htseq-count, a tool developed with HTSeq that preprocesses RNA-Seq data for differential expression analysis by counting the overlap of reads with genes. Availability and implementation: HTSeq is released as an open-source software under the GNU General Public Licence and available from http://www-huber.embl.de/HTSeq or from the Python Package Index at https://pypi.python.org/pypi/HTSeq. Contact: sanders@fs.tum.de}, + An = {PMC4287950}, + Author = {Anders, Simon and Pyl, Paul Theodor and Huber, Wolfgang}, + Db = {PMC}, + Doi = {10.1093/bioinformatics/btu638}, + Isbn = {1367-4803; 1367-4811}, + J1 = {Bioinformatics}, + Journal = {Bioinformatics}, + Month = {01}, + Number = {2}, + Pages = {166--169}, + Publisher = {Oxford University Press}, + Title = {HTSeq---a Python framework to work with high-throughput sequencing data}, + Ty = {JOUR}, + U1 = {btu638{$[$}PII{$]$}; 25260700{$[$}pmid{$]$}}, + Url = {http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4287950/}, + Volume = {31}, + Year = {2015}, + Year1 = {2014/09/25}, + Year2 = {2014/02/27/received}, + Year3 = {2014/08/18/revised}, + Year4 = {2014/09/21/accepted}, + Bdsk-Url-1 = {http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4287950/}, + Bdsk-Url-2 = {http://dx.doi.org/10.1093/bioinformatics/btu638} +} + +@article{li_fast_2009, + title = {Fast and accurate short read alignment with {Burrows}–{Wheeler} transform}, + volume = {25}, + issn = {1367-4803, 1460-2059}, + url = {http://bioinformatics.oxfordjournals.org/content/25/14/1754}, + doi = {10.1093/bioinformatics/btp324}, + abstract = {Motivation: The enormous amount of short reads generated by the new DNA sequencing technologies call for the development 
of fast and accurate read alignment programs. A first generation of hash table-based methods has been developed, including MAQ, which is accurate, feature rich and fast enough to align short reads from a single individual. However, MAQ does not support gapped alignment for single-end reads, which makes it unsuitable for alignment of longer reads where indels may occur frequently. The speed of MAQ is also a concern when the alignment is scaled up to the resequencing of hundreds of individuals. +Results: We implemented Burrows-Wheeler Alignment tool (BWA), a new read alignment package that is based on backward search with Burrows–Wheeler Transform (BWT), to efficiently align short sequencing reads against a large reference sequence such as the human genome, allowing mismatches and gaps. BWA supports both base space reads, e.g. from Illumina sequencing machines, and color space reads from AB SOLiD machines. Evaluations on both simulated and real data suggest that BWA is ∼10–20× faster than MAQ, while achieving similar accuracy. In addition, BWA outputs alignment in the new standard SAM (Sequence Alignment/Map) format. Variant calling and other downstream analyses after the alignment can be achieved with the open source SAMtools software package. 
+Availability: http://maq.sourceforge.net +Contact: rd@sanger.ac.uk}, + language = {en}, + number = {14}, + urldate = {2013-02-15}, + journal = {Bioinformatics}, + author = {Li, Heng and Durbin, Richard}, + month = jul, + year = {2009}, + keywords = {Aligner, DNA-seq, Tool}, + pages = {1754--1760}, + file = {Full Text PDF:/Users/steph/Documents/Zotero/storage/W4ZZ72KE/Li et Durbin - 2009 - Fast and accurate short read alignment with Burrow.pdf:application/pdf;Snapshot:/Users/steph/Documents/Zotero/storage/HEV5PCIK/1754.html:text/html} +} + +@article{fadloun_chromatin_2013, + title = {Chromatin signatures and retrotransposon profiling in mouse embryos reveal regulation of {LINE}-1 by {RNA}}, + copyright = {© 2013 Nature Publishing Group, a division of Macmillan Publishers Limited. All Rights Reserved.}, + issn = {1545-9993}, + url = {http://www.nature.com/nsmb/journal/vaop/ncurrent/full/nsmb.2495.html}, + doi = {10.1038/nsmb.2495}, + abstract = {How a more plastic chromatin state is maintained and reversed during development is unknown. Heterochromatin-mediated silencing of repetitive elements occurs in differentiated cells. Here, we used repetitive elements, including retrotransposons, as model loci to address how and when heterochromatin forms during development. RNA sequencing throughout early mouse embryogenesis revealed that repetitive-element expression is dynamic and stage specific, with most repetitive elements becoming repressed before implantation. We show that LINE-1 and IAP retrotransposons become reactivated from both parental genomes after fertilization. Chromatin immunoprecipitation for H3K4me3 and H3K9me3 in 2- and 8-cell embryos indicates that their developmental silencing follows loss of activating marks rather than acquisition of conventional heterochromatic marks. Furthermore, short LINE-1 RNAs regulate LINE-1 transcription in vivo. 
Our data indicate that reprogramming after mammalian fertilization comprises a robust transcriptional activation of retrotransposons and that repetitive elements are initially regulated through RNA.}, + language = {en}, + urldate = {2013-02-12}, + journal = {Nature Structural \& Molecular Biology}, + author = {Fadloun, Anas and Gras, Stéphanie Le and Jost, Bernard and Ziegler-Birling, Céline and Takahashi, Hazuki and Gorab, Eduardo and Carninci, Piero and Torres-Padilla, Maria-Elena}, + year = {2013}, + file = {Full Text PDF:/Users/steph/Documents/Zotero/storage/NXT2D3UC/Fadloun et al. - 2013 - Chromatin signatures and retrotransposon profiling.pdf:application/pdf} +} + +@article{jurka_repbase_2005, + title = {Repbase {Update}, a database of eukaryotic repetitive elements}, + volume = {110}, + issn = {1424-859X, 1424-8581}, + url = {http://www.karger.com/Article/FullText/84979}, + doi = {10.1159/000084979}, + language = {en}, + number = {1-4}, + urldate = {2014-11-06}, + journal = {Cytogenetic and Genome Research}, + author = {Jurka, J. and Kapitonov, V.V. and Pavlicek, A. and Klonowski, P. and Kohany, O. and Walichiewicz, J.}, + year = {2005}, + pages = {462--467}, + file = {PayPerView\: Repbase Update, a database of eukaryotic repetitive elements - Karger Publishers:/Users/steph/Documents/Zotero/storage/NQE6UA3H/84979.html:text/html} +} + + +@article{bedtools, + Abstract = {Motivation: Testing for correlations between different sets of genomic features is a fundamental task in genomics research. However, searching for overlaps between features with existing web-based methods is complicated by the massive datasets that are routinely produced with current sequencing technologies. Fast and flexible tools are therefore required to ask complex questions of these data in an efficient manner. 
Results: This article introduces a new software suite for the comparison, manipulation and annotation of genomic features in Browser Extensible Data (BED) and General Feature Format (GFF) format. BEDTools also supports the comparison of sequence alignments in BAM format to both BED and GFF features. The tools are extremely efficient and allow the user to compare large datasets (e.g. next-generation sequencing data) with both public and custom genome annotation tracks. BEDTools can be combined with one another as well as with standard UNIX commands, thus facilitating routine genomics tasks as well as pipelines that can quickly answer intricate questions of large genomic datasets. Availability and implementation: BEDTools was written in C++. Source code and a comprehensive user manual are freely available at http://code.google.com/p/bedtools Contact: aaronquinlan{\char64}gmail.com; imh4y{\char64}virginia.edu Supplementary information: Supplementary data are available at Bioinformatics online.}, + An = {PMC2832824}, + Author = {Quinlan, Aaron R and Hall, Ira M}, + Date = {2010/03/15}, + Date-Added = {2017-04-07 12:57:16 +0000}, + Date-Modified = {2017-04-07 12:57:16 +0000}, + Db = {PMC}, + Doi = {10.1093/bioinformatics/btq033}, + Isbn = {1367-4803; 1367-4811}, + J1 = {Bioinformatics}, + Journal = {Bioinformatics}, + Month = {03}, + Number = {6}, + Pages = {841--842}, + Publisher = {Oxford University Press}, + Title = {BEDTools: a flexible suite of utilities for comparing genomic features}, + Ty = {JOUR}, + U1 = {btq033{$[$}PII{$]$}; 20110278{$[$}pmid{$]$}}, + Url = {http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2832824/}, + Volume = {26}, + Year = {2010}, + Year1 = {2010/01/28}, + Year2 = {2009/11/24/received}, + Year3 = {2010/01/11/revised}, + Year4 = {2010/01/21/accepted}, + Bdsk-Url-1 = {http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2832824/}, + Bdsk-Url-2 = {http://dx.doi.org/10.1093/bioinformatics/btq033}} + +%% MACS +@article{zhang_model-based_2008, + title = 
{Model-based Analysis of {ChIP-Seq} ({MACS})}, + volume = {9}, + issn = {1465-6906}, + url = {http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2592715/}, + doi = {10.1186/gb-2008-9-9-r137}, + abstract = {{MACS} performs model-based analysis of {ChIP-Seq} data generated by short read sequencers. We present Model-based Analysis of {ChIP-Seq} data, {MACS}, which analyzes data generated by short read sequencers such as Solexa's Genome Analyzer. {MACS} empirically models the shift size of {ChIP-Seq} tags, and uses it to improve the spatial resolution of predicted binding sites. {MACS} also uses a dynamic Poisson distribution to effectively capture local biases in the genome, allowing for more robust predictions. {MACS} compares favorably to existing {ChIP-Seq} peak-finding algorithms, and is freely available.}, + number = {9}, + urldate = {2013-07-11}, + journal = {Genome Biology}, + author = {Zhang, Yong and Liu, Tao and Meyer, Clifford A and Eeckhoute, Jerome and Johnson, David S and Bernstein, Bradley E and Nusbaum, Chad and Myers, Richard M and Brown, Myles and Li, Wei and Liu, X Shirley}, + year = {2008}, + note = {{PMID:} 18798982 +{PMCID:} {PMC2592715}}, + pages = {R137}, + file = {PubMed Central Full Text PDF:/Users/steph/Documents/Zotero/storage/QNZPAM8X/Zhang et al. - 2008 - Model-based Analysis of ChIP-Seq (MACS).pdf:application/pdf} +} + +%% Homer +@article{heinz_simple_2010, + title = {Simple combinations of lineage-determining transcription factors prime cis-regulatory elements required for macrophage and B cell identities}, + volume = {38}, + issn = {1097-2765}, + url = {http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2898526/}, + doi = {10.1016/j.molcel.2010.05.004}, + abstract = {Genome-scale studies have revealed extensive, cell type-specific co-localization of transcription factors, but the mechanisms underlying this phenomenon remain poorly understood. 
Here we demonstrate in macrophages and B cells that collaborative interactions of the common factor {PU.1} with small sets of macrophage- or B cell lineage-determining transcription factors establish cell-specific binding sites that are associated with the majority of promoter-distal {H3K4me1-marked} genomic regions. {PU.1} binding initiates nucleosome remodeling followed by {H3K4} monomethylation at large numbers of genomic regions associated with both broadly and specifically expressed genes. These locations serve as beacons for additional factors, exemplified by liver X receptors, which drive both cell-specific gene expression and signal-dependent responses. Together with analyses of transcription factor binding and {H3K4me1} patterns in other cell types, these studies suggest that simple combinations of lineage-determining transcription factors can specify the genomic sites ultimately responsible for both cell identity and cell type-specific responses to diverse signaling inputs.}, + number = {4}, + urldate = {2013-07-11}, + journal = {Molecular cell}, + author = {Heinz, Sven and Benner, Christopher and Spann, Nathanael and Bertolino, Eric and Lin, Yin C. and Laslo, Peter and Cheng, Jason X. and Murre, Cornelis and Singh, Harinder and Glass, Christopher K.}, + month = may, + year = {2010}, + note = {{PMID:} 20513432 +{PMCID:} {PMC2898526}}, + pages = {576--589}, + file = {PubMed Central Full Text PDF:/Users/steph/Documents/Zotero/storage/DUM3FHZC/Heinz et al. 
- 2010 - Simple combinations of lineage-determining transcr.pdf:application/pdf} +} + +%% seqMiner +@article{ye_seqminer:_2011, + title = {{seqMINER:} an integrated {ChIP-seq} data interpretation platform}, + volume = {39}, + issn = {0305-1048, 1362-4962}, + shorttitle = {{seqMINER}}, + url = {http://nar.oxfordjournals.org/content/39/6/e35}, + doi = {10.1093/nar/gkq1287}, + abstract = {In a single experiment, chromatin immunoprecipitation combined with high throughput sequencing ({ChIP-seq)} provides genome-wide information about a given covalent histone modification or transcription factor occupancy. However, time efficient bioinformatics resources for extracting biological meaning out of these gigabyte-scale datasets are often a limiting factor for data interpretation by biologists. We created an integrated portable {ChIP-seq} data interpretation platform called {seqMINER}, with optimized performances for efficient handling of multiple genome-wide datasets. {seqMINER} allows comparison and integration of multiple {ChIP-seq} datasets and extraction of qualitative as well as quantitative information. {seqMINER} can handle the biological complexity of most experimental situations and proposes methods to the user for data classification according to the analysed features. In addition, through multiple graphical representations, {seqMINER} allows visualization and modelling of general as well as specific patterns in a given dataset. To demonstrate the efficiency of {seqMINER}, we have carried out a comprehensive analysis of genome-wide chromatin modification data in mouse embryonic stem cells to understand the global epigenetic landscape and its change through cellular differentiation.}, + language = {en}, + number = {6}, + urldate = {2014-03-14}, + journal = {Nucleic Acids Research}, + author = {Ye, Tao and Krebs, Arnaud R. 
and Choukrallah, Mohamed-Amin and Keime, Celine and Plewniak, Frederic and Davidson, Irwin and Tora, Laszlo}, + month = jan, + year = {2011}, + note = {{PMID:} 21177645}, + pages = {e35--e35}, + file = {Full Text PDF:/Users/steph/Documents/Zotero/storage/S4Z8XJTB/Ye et al. - 2011 - seqMINER an integrated ChIP-seq data interpretati.pdf:application/pdf;Snapshot:/Users/steph/Documents/Zotero/storage/4I4GW65F/e35.html:text/html} +} + +%% MEME +@article{bailey_fitting_1994, + title = {Fitting a mixture model by expectation maximization to discover motifs in biopolymers}, + volume = {2}, + issn = {1553-0833}, + abstract = {The algorithm described in this paper discovers one or more motifs in a collection of {DNA} or protein sequences by using the technique of expectation maximization to fit a two-component finite mixture model to the set of sequences. Multiple motifs are found by fitting a mixture model to the data, probabilistically erasing the occurrences of the motif thus found, and repeating the process to find successive motifs. The algorithm requires only a set of unaligned sequences and a number specifying the width of the motifs as input. It returns a model of each motif and a threshold which together can be used as a Bayes-optimal classifier for searching for occurrences of the motif in other databases. The algorithm estimates how many times each motif occurs in each sequence in the dataset and outputs an alignment of the occurrences of the motif. The algorithm is capable of discovering several different motifs with differing numbers of occurrences in a single dataset.}, + language = {eng}, + journal = {Proceedings / ... 
International Conference on Intelligent Systems for Molecular Biology ; {ISMB.} International Conference on Intelligent Systems for Molecular Biology}, + author = {Bailey, T L and Elkan, C}, + year = {1994}, + note = {{PMID:} 7584402}, + keywords = {Algorithms, Animals, Biopolymers, Humans, Models, Theoretical, Sequence Analysis}, + pages = {28--36} +} + +%% ChipPeakAnno +@article{zhu_chippeakanno:_2010, + title = {{ChIPpeakAnno:} a Bioconductor package to annotate {ChIP-seq} and {ChIP-chip} data}, + volume = {11}, + copyright = {2010 Zhu et al; licensee {BioMed} Central Ltd.}, + issn = {1471-2105}, + shorttitle = {{ChIPpeakAnno}}, + url = {http://www.biomedcentral.com/1471-2105/11/237/abstract}, + doi = {10.1186/1471-2105-11-237}, + abstract = {Chromatin immunoprecipitation ({ChIP)} followed by high-throughput sequencing ({ChIP-seq)} or {ChIP} followed by genome tiling array analysis ({ChIP-chip)} have become standard technologies for genome-wide identification of {DNA-binding} protein target sites. A number of algorithms have been developed in parallel that allow identification of binding sites from {ChIP-seq} or {ChIP-chip} datasets and subsequent visualization in the University of California Santa Cruz ({UCSC)} Genome Browser as custom annotation tracks. However, summarizing these tracks can be a daunting task, particularly if there are a large number of binding sites or the binding sites are distributed widely across the genome. +{PMID:} 20459804}, + language = {en}, + number = {1}, + urldate = {2014-03-14}, + journal = {{BMC} Bioinformatics}, + author = {Zhu, Lihua J. and Gazin, Claude and Lawson, Nathan D. and Pagès, Hervé and Lin, Simon M. and Lapointe, David S. and Green, Michael R.}, + month = may, + year = {2010}, + note = {{PMID:} 20459804}, + pages = {237}, + file = {Full Text PDF:/Users/steph/Documents/Zotero/storage/B9JRGMAI/Zhu et al. 
- 2010 - ChIPpeakAnno a Bioconductor package to annotate C.pdf:application/pdf;Snapshot:/Users/steph/Documents/Zotero/storage/GKV3IQMX/237.html:text/html} +} + +@article{zang_clustering_2009, + title = {A clustering approach for identification of enriched domains from histone modification {ChIP-Seq} data}, + volume = {25}, + issn = {1367-4803, 1460-2059}, + url = {http://bioinformatics.oxfordjournals.org/content/25/15/1952}, + doi = {10.1093/bioinformatics/btp340}, + abstract = {Motivation: Chromatin states are the key to gene regulation and cell identity. Chromatin immunoprecipitation ({ChIP)} coupled with high-throughput sequencing ({ChIP-Seq)} is increasingly being used to map epigenetic states across genomes of diverse species. Chromatin modification profiles are frequently noisy and diffuse, spanning regions ranging from several nucleosomes to large domains of multiple genes. Much of the early work on the identification of {ChIP-enriched} regions for {ChIP-Seq} data has focused on identifying localized regions, such as transcription factor binding sites. Bioinformatic tools to identify diffuse domains of {ChIP-enriched} regions have been lacking. +Results: Based on the biological observation that histone modifications tend to cluster to form domains, we present a method that identifies spatial clusters of signals unlikely to appear by chance. This method pools together enrichment information from neighboring nucleosomes to increase sensitivity and specificity. By using genomic-scale analysis, as well as the examination of loci with validated epigenetic states, we demonstrate that this method outperforms existing methods in the identification of {ChIP-enriched} signals for histone modification profiles. We demonstrate the application of this unbiased method in important issues in {ChIP-Seq} data analysis, such as data normalization for quantitative comparison of levels of epigenetic modifications across cell types and growth conditions. 
+Availability: {http://home.gwu.edu/∼wpeng/Software.htm} +Contact: wpeng@gwu.edu +Supplementary information: Supplementary data are available at Bioinformatics online.}, + language = {en}, + number = {15}, + urldate = {2013-07-11}, + journal = {Bioinformatics}, + author = {Zang, Chongzhi and Schones, Dustin E. and Zeng, Chen and Cui, Kairong and Zhao, Keji and Peng, Weiqun}, + month = jan, + year = {2009}, + note = {{PMID:} 19505939}, + pages = {1952--1958}, + file = {Full Text PDF:/Users/steph/Documents/Zotero/storage/4C5JR2VZ/Zang et al. - 2009 - A clustering approach for identification of enrich.pdf:application/pdf;Snapshot:/Users/steph/Documents/Zotero/storage/HGTE2465/1952.html:text/html} +} + + +@article{love_moderated_2014, + title = {Moderated estimation of fold change and dispersion for {RNA}-seq data with {DESeq}2}, + volume = {15}, + copyright = {2014 Love et al.; licensee BioMed Central.}, + issn = {1465-6906}, + url = {http://genomebiology.com/2014/15/12/550/abstract}, + doi = {10.1186/s13059-014-0550-8}, + abstract = {PMID: 25516281}, + language = {en}, + number = {12}, + urldate = {2015-06-10}, + journal = {Genome Biology}, + author = {Love, Michael I. and Huber, Wolfgang and Anders, Simon}, + month = dec, + year = {2014}, + pmid = {25516281}, + pages = {550}, + file = {Full Text PDF:/Users/steph/Documents/Zotero/storage/2CJB4PJW/Love et al. 
- 2014 - Moderated estimation of fold change and dispersion.pdf:application/pdf;Snapshot:/Users/steph/Documents/Zotero/storage/5TNWDT5A/550.html:text/html} +} + +@misc{broadinstitute_picard, + title = {Picard {Tools} - {By} {Broad} {Institute}}, + url = {http://broadinstitute.github.io/picard/}, + urldate = {2016-08-26} +} + +@misc{anshulkundaje_2014, + title = {(2014) mod/mouse/{humanENCODE}: {Blacklisted} genomic regions for functional genomics analysis - {Anshul} {Kundaje}}, + url = {https://sites.google.com/site/anshulkundaje/projects/blacklists}, + urldate = {2016-08-26}, + file = {(2014) mod/mouse/humanENCODE\: Blacklisted genomic regions for functional genomics analysis - Anshul Kundaje:/Users/steph/Documents/Zotero/storage/BD4QQH8K/blacklists.html:text/html} +} + +@article{langmead_ultrafast_2009, + title = {Ultrafast and memory-efficient alignment of short {DNA} sequences to the human genome}, + volume = {10}, + copyright = {2009 Langmead et al.; licensee BioMed Central Ltd.}, + issn = {1465-6906}, + url = {http://genomebiology.com/2009/10/3/R25/abstract}, + doi = {10.1186/gb-2009-10-3-r25}, + abstract = {Bowtie is an ultrafast, memory-efficient alignment program for aligning short DNA sequence reads to large genomes. For the human genome, Burrows-Wheeler indexing allows Bowtie to align more than 25 million reads per CPU hour with a memory footprint of approximately 1.3 gigabytes. Bowtie extends previous Burrows-Wheeler techniques with a novel quality-aware backtracking algorithm that permits mismatches. Multiple processor cores can be used simultaneously to achieve even greater alignment speeds. 
Bowtie is open source http://bowtie.cbcb.umd.edu.}, + language = {en}, + number = {3}, + urldate = {2013-07-11}, + journal = {Genome Biology}, + author = {Langmead, Ben and Trapnell, Cole and Pop, Mihai and Salzberg, Steven L.}, + month = mar, + year = {2009}, + pmid = {19261174}, + pages = {R25}, + file = {Full Text PDF:/Users/steph/Documents/Zotero/storage/UBJ7QSJI/Langmead et al. - 2009 - Ultrafast and memory-efficient alignment of short .pdf:application/pdf;Snapshot:/Users/steph/Documents/Zotero/storage/Q3UVVUK6/R25.html:text/html} +} + + +@article{encode_2012, + Annote = {10.1038/nature11247}, + Date = {2012/09/06/print}, + Date-Added = {2017-04-25 10:19:47 +0000}, + Date-Modified = {2017-04-25 10:19:47 +0000}, + Day = {06}, + Isbn = {0028-0836}, + Journal = {Nature}, + L3 = {http://www.nature.com/nature/journal/v489/n7414/abs/nature11247.html#supplementary-information}, + M3 = {10.1038/nature11247}, + Month = {09}, + Number = {7414}, + Pages = {57--74}, + Publisher = {Nature Publishing Group, a division of Macmillan Publishers Limited. 
All Rights Reserved.}, + Title = {An integrated encyclopedia of DNA elements in the human genome}, + Ty = {JOUR}, + Url = {http://dx.doi.org/10.1038/nature11247}, + Volume = {489}, + Year = {2012}, + Bdsk-Url-1 = {http://dx.doi.org/10.1038/nature11247}} + +@Article{AND2010, +author = {Anders and Huber}, +title = {Differential expression analysis for sequence count data}, +journal = {Genome Biology}, +year = {2010}, +volume = {11} +} + +@Article{BEN1995, +author = {Benjamini and Hochberg}, +title = {Controlling the false discovery rate: a practical and powerful approach to multiple testing}, +journal = {Journal of the Royal Statistical Society}, +year = {1995}, +volume = {57}, +pages = {289-300} +} + +@article{Love2014, + Abstract = {In comparative high-throughput sequencing assays, a fundamental task is the analysis of count data, such as read counts per gene in RNA-seq, for evidence of systematic changes across experimental conditions. Small replicate numbers, discreteness, large dynamic range and the presence of outliers require a suitable statistical approach. We present DESeq2, a method for differential analysis of count data, using shrinkage estimation for dispersions and fold changes to improve stability and interpretability of estimates. This enables a more quantitative analysis focused on the strength rather than the mere presence of differential expression. The DESeq2 package is available at http://www.bioconductor.org/packages/release/bioc/html/DESeq2.html .}, + Author = {Love, Michael I. 
and Huber, Wolfgang and Anders, Simon}, + Doi = {10.1186/s13059-014-0550-8}, + Issn = {1474-760X}, + Journal = {Genome Biology}, + Number = {12}, + Pages = {550}, + Title = {Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2}, + Url = {http://dx.doi.org/10.1186/s13059-014-0550-8}, + Volume = {15}, + Year = {2014}, + Bdsk-Url-1 = {http://dx.doi.org/10.1186/s13059-014-0550-8}} + + +@article{mFuzz, + Abstract = { For the analysis of microarray data, clustering techniques are frequently used. Most of such methods are based on hard clustering of data wherein one gene (or sample) is assigned to exactly one cluster. Hard clustering, however, suffers from several drawbacks such as sensitivity to noise and information loss. In contrast, soft clustering methods can assign a gene to several clusters. They can overcome shortcomings of conventional hard clustering techniques and offer further advantages. Thus, we constructed an R package termed Mfuzz implementing soft clustering tools for microarray data analysis. The additional package Mfuzzgui provides a convenient TclTk based graphical user interface. AVAILABILITY: The R package Mfuzz and Mfuzzgui are available at http://itb1.biologie.hu-berlin.de/~futschik/software/R/Mfuzz/index.html. Their distribution is subject to GPL version 2 license.}, + An = {PMC2139991}, + Author = {Kumar, Lokesh and E. 
Futschik, Matthias}, + Date-Added = {2017-05-19 09:52:49 +0000}, + Date-Modified = {2017-05-19 09:52:49 +0000}, + Db = {PMC}, + Isbn = {0973-2063}, + J1 = {Bioinformation}, + Journal = {Bioinformation}, + Number = {1}, + Pages = {5--7}, + Publisher = {Biomedical Informatics Publishing Group}, + Title = {Mfuzz: A software package for soft clustering of microarray data}, + Ty = {JOUR}, + U1 = {000200022007{$[$}PII{$]$}; 18084642{$[$}pmid{$]$}}, + Url = {http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2139991/}, + Volume = {2}, + Year = {2007}, + Year1 = {2007/05/20}, + Year2 = {2007/04/12/received}, + Year3 = {2007/05/01/accepted}, + Bdsk-Url-1 = {http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2139991/}} + + +@article{chromhmm, + An = {PMC3577932}, + Author = {Ernst, Jason and Kellis, Manolis}, + Date = {2012/02/28}, + Date-Added = {2017-05-19 11:27:09 +0000}, + Date-Modified = {2017-05-19 11:27:09 +0000}, + Db = {PMC}, + Doi = {10.1038/nmeth.1906}, + Isbn = {1548-7091; 1548-7105}, + J1 = {Nat Methods}, + Journal = {Nature methods}, + Month = {02}, + Number = {3}, + Pages = {215--216}, + Title = {ChromHMM: automating chromatin state discovery and characterization}, + Ty = {JOUR}, + U1 = {22373907{$[$}pmid{$]$}}, + Url = {http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3577932/}, + Volume = {9}, + Year = {2012}, + Bdsk-Url-1 = {http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3577932/}, + Bdsk-Url-2 = {http://dx.doi.org/10.1038/nmeth.1906}} + +@article{mo_epigenomic_2016, + title = {Epigenomic landscapes of retinal rods and cones}, + volume = {5}, + copyright = {© 2016 Mo et al.. 
This article is distributed under the terms of the Creative Commons Attribution License, which permits unrestricted use and redistribution provided that the original author and source are credited.}, + issn = {2050-084X}, + url = {https://elifesciences.org/articles/11613}, + doi = {10.7554/eLife.11613}, + abstract = {Genome-wide analysis of DNA methylation and accessible chromatin shows that retinal rods and cones have distinct epigenomic features that reflect differences in their development and function.}, + language = {en}, + urldate = {2017-12-04}, + journal = {eLife}, + author = {Mo, Alisa and Luo, Chongyuan and Davis, Fred P. and Mukamel, Eran A. and Henry, Gilbert L. and Nery, Joseph R. and Urich, Mark A. and Picard, Serge and Lister, Ryan and Eddy, Sean R. and Beer, Michael A. and Ecker, Joseph R. and Nathans, Jeremy}, + month = mar, + year = {2016}, + pages = {e11613}, + file = {Full Text PDF:/Users/slegras/Library/Application Support/Zotero/Profiles/l2r21qzc.default/zotero/storage/KF8WKU5G/Mo et al. 
- 2016 - Epigenomic landscapes of retinal rods and cones.pdf:application/pdf;Snapshot:/Users/slegras/Library/Application Support/Zotero/Profiles/l2r21qzc.default/zotero/storage/DC5MPV3J/11613.html:text/html} +} + +@article{achour_neuronal_2015, + title = {Neuronal identity genes regulated by super-enhancers are preferentially down-regulated in the striatum of {Huntington}'s disease mice}, + volume = {24}, + issn = {0964-6906}, + url = {https://academic.oup.com/hmg/article/24/12/3481/623018/Neuronal-identity-genes-regulated-by-super}, + doi = {10.1093/hmg/ddv099}, + number = {12}, + urldate = {2017-06-13}, + journal = {Human Molecular Genetics}, + author = {Achour, Mayada and Le Gras, Stéphanie and Keime, Céline and Parmentier, Frédéric and Lejeune, François-Xavier and Boutillier, Anne-Laurence and Néri, Christian and Davidson, Irwin and Merienne, Karine}, + month = jun, + year = {2015}, + pages = {3481--3496}, + file = {Full Text PDF:/Users/slegras/Library/Application Support/Zotero/Profiles/l2r21qzc.default/zotero/storage/TCDC57AS/Achour et al. - 2015 - Neuronal identity genes regulated by super-enhance.pdf:application/pdf;Snapshot:/Users/slegras/Library/Application Support/Zotero/Profiles/l2r21qzc.default/zotero/storage/JWHXSQDK/ddv099.html:text/html} +} + +%% Fimo +@article{grant_fimo:_2011, + title = {{FIMO:} scanning for occurrences of a given motif}, + volume = {27}, + issn = {1367-4803, 1460-2059}, + shorttitle = {{FIMO}}, + url = {http://bioinformatics.oxfordjournals.org/content/27/7/1017}, + doi = {10.1093/bioinformatics/btr064}, + abstract = {Summary: A motif is a short {DNA} or protein sequence that contributes to the biological function of the sequence in which it resides. Over the past several decades, many computational methods have been described for identifying, characterizing and searching with sequence motifs. 
Critical to nearly any motif-based sequence analysis pipeline is the ability to scan a sequence database for occurrences of a given motif described by a position-specific frequency matrix. +Results: We describe Find Individual Motif Occurrences ({FIMO}), a software tool for scanning {DNA} or protein sequences with motifs described as position-specific scoring matrices. The program computes a log-likelihood ratio score for each position in a given sequence database, uses established dynamic programming methods to convert this score to a P-value and then applies false discovery rate analysis to estimate a q-value for each position in the given sequence. {FIMO} provides output in a variety of formats, including {HTML}, {XML} and several Santa Cruz Genome Browser formats. The program is efficient, allowing for the scanning of {DNA} sequences at a rate of 3.5 Mb/s on a single {CPU}. +Availability and Implementation: {FIMO} is part of the {MEME} Suite software toolkit. A web server and source code are available at http://meme.sdsc.edu. +Contact: t.bailey@imb.uq.edu.au; t.bailey@imb.uq.edu.au +Supplementary information: Supplementary data are available at Bioinformatics online.}, + language = {en}, + number = {7}, + urldate = {2013-07-11}, + journal = {Bioinformatics}, + author = {Grant, Charles E. and Bailey, Timothy L. and Noble, William Stafford}, + month = jan, + year = {2011}, + note = {{PMID}: 21330290}, + pages = {1017--1018}, + file = {Full Text PDF:/Users/steph/Documents/Zotero/storage/42F88CUZ/Grant et al. 
- 2011 - FIMO scanning for occurrences of a given motif.pdf:application/pdf;Snapshot:/Users/steph/Documents/Zotero/storage/B9TXDBFX/1017.html:text/html} +} + +%% meme-chip +@article{machanick_meme-chip:_2011, + title = {{MEME}-{ChIP}: motif analysis of large {DNA} datasets}, + volume = {27}, + issn = {1367-4803}, + shorttitle = {{MEME}-{ChIP}}, + url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3106185/}, + doi = {10.1093/bioinformatics/btr189}, + abstract = {Motivation: Advances in high-throughput sequencing have resulted in rapid growth in large, high-quality datasets including those arising from transcription factor (TF) ChIP-seq experiments. While there are many existing tools for discovering TF binding site motifs in such datasets, most web-based tools cannot directly process such large datasets., Results: The MEME-ChIP web service is designed to analyze ChIP-seq ‘peak regions’—short genomic regions surrounding declared ChIP-seq ‘peaks’. Given a set of genomic regions, it performs (i) ab initio motif discovery, (ii) motif enrichment analysis, (iii) motif visualization, (iv) binding affinity analysis and (v) motif identification. It runs two complementary motif discovery algorithms on the input data—MEME and DREME—and uses the motifs they discover in subsequent visualization, binding affinity and identification steps. MEME-ChIP also performs motif enrichment analysis using the AME algorithm, which can detect very low levels of enrichment of binding sites for TFs with known DNA-binding motifs. Importantly, unlike with the MEME web service, there is no restriction on the size or number of uploaded sequences, allowing very large ChIP-seq datasets to be analyzed. 
The analyses performed by MEME-ChIP provide the user with a varied view of the binding and regulatory activity of the ChIP-ed TF, as well as the possible involvement of other DNA-binding TFs., Availability: MEME-ChIP is available as part of the MEME Suite at http://meme.nbcr.net., Contact: t.bailey@uq.edu.au, Supplementary information: Supplementary data are available at Bioinformatics online.}, + number = {12}, + journal = {Bioinformatics}, + author = {Machanick, Philip and Bailey, Timothy L.}, + month = jun, + year = {2011}, + pmid = {21486936}, + pmcid = {PMC3106185}, + pages = {1696--1697}, + file = {PubMed Central Full Text PDF:/Users/slegras/Library/Application Support/Zotero/Profiles/l2r21qzc.default/zotero/storage/236TPDJZ/Machanick et Bailey - 2011 - MEME-ChIP motif analysis of large DNA datasets.pdf:application/pdf} +} + +%% Bowtie 2 +@article{langmead_fast_2012, + title = {Fast gapped-read alignment with {Bowtie} 2}, + volume = {9}, + issn = {1548-7091}, + url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3322381/}, + doi = {10.1038/nmeth.1923}, + abstract = {As the rate of sequencing increases, greater throughput is demanded from read aligners. The full-text minute index is often used to make alignment very fast and memory-efficient, but the approach is ill-suited to finding longer, gapped alignments. 
Bowtie 2 combines the strengths of the full-text minute index with the flexibility and speed of hardware-accelerated dynamic programming algorithms to achieve a combination of high speed, sensitivity and accuracy.}, + number = {4}, + journal = {Nature methods}, + author = {Langmead, Ben and Salzberg, Steven L}, + month = mar, + year = {2012}, + pmid = {22388286}, + pmcid = {PMC3322381}, + pages = {357--359} +} + +%% Samtools +@article{li_sequence_2009, + title = {The {Sequence} {Alignment}/{Map} format and {SAMtools}}, + volume = {25}, + issn = {1367-4803}, + url = {http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2723002/}, + doi = {10.1093/bioinformatics/btp352}, + abstract = {Summary: The Sequence Alignment/Map (SAM) format is a generic alignment format for storing read alignments against reference sequences, supporting short and long reads (up to 128 Mbp) produced by different sequencing platforms. It is flexible in style, compact in size, efficient in random access and is the format in which alignments from the 1000 Genomes Project are released. SAMtools implements various utilities for post-processing alignments in the SAM format, such as indexing, variant caller and alignment viewer, and thus provides universal tools for processing read alignments., Availability: http://samtools.sourceforge.net, Contact: rd@sanger.ac.uk}, + number = {16}, + journal = {Bioinformatics}, + author = {Li, Heng and Handsaker, Bob and Wysoker, Alec and Fennell, Tim and Ruan, Jue and Homer, Nils and Marth, Gabor and Abecasis, Goncalo and Durbin, Richard}, + month = aug, + year = {2009}, + pmid = {19505943}, + pmcid = {PMC2723002}, + pages = {2078--2079}, + file = {PubMed Central Full Text PDF:/Users/slegras/Library/Application Support/Zotero/Profiles/l2r21qzc.default/zotero/storage/RS8TNV6R/Li et al. 
- 2009 - The Sequence AlignmentMap format and SAMtools.pdf:application/pdf} +} + +%% Atac-seq +@article{buenrostro_transposition_2013, + title = {Transposition of native chromatin for fast and sensitive epigenomic profiling of open chromatin, {DNA}-binding proteins and nucleosome position}, + volume = {10}, + issn = {1548-7091}, + url = {http://dx.doi.org/10.1038/nmeth.2688}, + abstract = {We describe an assay for transposase-accessible chromatin using sequencing (ATAC-seq), based on direct in vitro transposition of sequencing adaptors into native chromatin, as a rapid and sensitive method for integrative epigenomic analysis. ATAC-seq captures open chromatin sites using a simple two-step protocol with 500-50,000 cells and reveals the interplay between genomic locations of open chromatin, DNA-binding proteins, individual nucleosomes and chromatin compaction at nucleotide resolution. We discovered classes of DNA-binding factors that strictly avoided, could tolerate or tended to overlap with nucleosomes. Using ATAC-seq maps of human CD4+ T cells from a proband obtained on consecutive days, we demonstrated the feasibility of analyzing an individual's epigenome on a timescale compatible with clinical decision-making.}, + number = {12}, + journal = {Nat Meth}, + author = {Buenrostro, Jason D and Giresi, Paul G and Zaba, Lisa C and Chang, Howard Y and Greenleaf, William J}, + month = dec, + year = {2013}, + pages = {1213--1218} +} + +%% deeptools +@article{ramirez_deeptools2:_2016, + title = {{deepTools}2: a next generation web server for deep-sequencing data analysis}, + volume = {44}, + issn = {0305-1048}, + shorttitle = {{deepTools}2}, + url = {https://academic.oup.com/nar/article/44/W1/W160/2499308/deepTools2-a-next-generation-web-server-for-deep}, + doi = {10.1093/nar/gkw257}, + abstract = {We present an update to our Galaxy-based web server for processing and visualizing deeply sequenced data. 
Its core tool set, deepTools, allows users to perform complete bioinformatic workflows ranging from quality controls and normalizations of aligned reads to integrative analyses, including clustering and visualization approaches. Since we first described our deepTools Galaxy server in 2014, we have implemented new solutions for many requests from the community and our users. Here, we introduce significant enhancements and new tools to further improve data visualization and interpretation. deepTools continue to be open to all users and freely available as a web service at deeptools.ie-freiburg.mpg.de. The new deepTools2 suite can be easily deployed within any Galaxy framework via the toolshed repository, and we also provide source code for command line usage under Linux and Mac OS X. A public and documented API for access to deepTools functionality is also available.}, + number = {W1}, + urldate = {2017-10-10}, + journal = {Nucleic Acids Research}, + author = {Ramírez, Fidel and Ryan, Devon P. and Grüning, Björn and Bhardwaj, Vivek and Kilpert, Fabian and Richter, Andreas S. and Heyne, Steffen and Dündar, Friederike and Manke, Thomas}, + month = jul, + year = {2016}, + pages = {W160--W165}, + file = {Full Text PDF:/Users/slegras/Library/Application Support/Zotero/Profiles/l2r21qzc.default/zotero/storage/J2VMQWUX/Ramírez et al. 
- 2016 - deepTools2 a next generation web server for deep-.pdf:application/pdf;Snapshot:/Users/slegras/Library/Application Support/Zotero/Profiles/l2r21qzc.default/zotero/storage/AHICP59C/deepTools2-a-next-generation-web-server-for-deep.html:text/html} +} + +%% bedtools +@article{quinlan_bedtools:_2010, + title = {{BEDTools}: a flexible suite of utilities for comparing genomic features}, + volume = {26}, + issn = {1367-4803}, + shorttitle = {{BEDTools}}, + url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2832824/}, + doi = {10.1093/bioinformatics/btq033}, + abstract = {Motivation: Testing for correlations between different sets of genomic features is a fundamental task in genomics research. However, searching for overlaps between features with existing web-based methods is complicated by the massive datasets that are routinely produced with current sequencing technologies. Fast and flexible tools are therefore required to ask complex questions of these data in an efficient manner., Results: This article introduces a new software suite for the comparison, manipulation and annotation of genomic features in Browser Extensible Data (BED) and General Feature Format (GFF) format. BEDTools also supports the comparison of sequence alignments in BAM format to both BED and GFF features. The tools are extremely efficient and allow the user to compare large datasets (e.g. next-generation sequencing data) with both public and custom genome annotation tracks. BEDTools can be combined with one another as well as with standard UNIX commands, thus facilitating routine genomics tasks as well as pipelines that can quickly answer intricate questions of large genomic datasets., Availability and implementation: BEDTools was written in C++. 
Source code and a comprehensive user manual are freely available at http://code.google.com/p/bedtools, Contact: aaronquinlan@gmail.com; imh4y@virginia.edu, Supplementary information: Supplementary data are available at Bioinformatics online.}, + number = {6}, + journal = {Bioinformatics}, + author = {Quinlan, Aaron R. and Hall, Ira M.}, + month = mar, + year = {2010}, + pmid = {20110278}, + pmcid = {PMC2832824}, + pages = {841--842}, + file = {PubMed Central Full Text PDF:/Users/slegras/Library/Application Support/Zotero/Profiles/l2r21qzc.default/zotero/storage/P26FE2CK/Quinlan et Hall - 2010 - BEDTools a flexible suite of utilities for compar.pdf:application/pdf} +} + +%% Diffbind +@article{ross-innes_differential_2012, + title = {Differential oestrogen receptor binding is associated with clinical outcome in breast cancer}, + volume = {481}, + copyright = {2012 Nature Publishing Group}, + issn = {1476-4687}, + url = {https://www.nature.com/articles/nature10730}, + doi = {10.1038/nature10730}, + abstract = {Genome-wide mapping of oestrogen receptor-α binding sites in primary breast cancer tissues shows that oestrogen receptor binding is dynamically regulated and that the expression of genes near differentially bound regulatory regions is associated with clinical outcome.}, + language = {en}, + number = {7381}, + urldate = {2018-01-12}, + journal = {Nature}, + author = {Ross-Innes, Caryn S. and Stark, Rory and Teschendorff, Andrew E. and Holmes, Kelly A. and Ali, H. Raza and Dunning, Mark J. and Brown, Gordon D. and Gojis, Ondrej and Ellis, Ian O. and Green, Andrew R. and Ali, Simak and Chin, Suet-Feung and Palmieri, Carlo and Caldas, Carlos and Carroll, Jason S.}, + month = jan, + year = {2012}, + pages = {389--393}, + file = {Full Text PDF:/Users/slegras/Library/Application Support/Zotero/Profiles/l2r21qzc.default/zotero/storage/S4IF4A2Q/Ross-Innes et al. 
- 2012 - Differential oestrogen receptor binding is associa.pdf:application/pdf;Snapshot:/Users/slegras/Library/Application Support/Zotero/Profiles/l2r21qzc.default/zotero/storage/77XDUB46/nature10730.html:text/html} +} + +@article{martin_cutadapt_2011, + title = {Cutadapt removes adapter sequences from high-throughput sequencing reads}, + volume = {17}, + copyright = {Authors who publish with this journal agree to the following terms: Authors retain copyright and grant the journal right of first publication with the work simultaneously licensed under a Creative Commons Attribution License that allows others to share the work with an acknowledgement of the work's authorship and initial publication in this journal. Authors are able to enter into separate, additional contractual arrangements for the non-exclusive distribution of the journal's published version of the work (e.g., post it to an institutional repository or publish it in a book), with an acknowledgement of its initial publication in this journal. Authors are permitted and encouraged to post their work online (e.g., in institutional repositories or on their website) prior to and during the submission process, as it can lead to productive exchanges, as well as earlier and greater citation of published work (See The Effect of Open Access).}, + issn = {2226-6089}, + url = {http://journal.embnet.org/index.php/embnetjournal/article/view/200}, + language = {en}, + number = {1}, + urldate = {2017-10-06}, + journal = {EMBnet.journal}, + author = {Martin, Marcel}, + month = may, + year = {2011}, + keywords = {adapter removal, microRNA, next generation sequencing, small RNA}, + pages = {
10--12}, + file = {Full Text PDF:/Users/slegras/Library/Application Support/Zotero/Profiles/l2r21qzc.default/zotero/storage/TP2MNUNP/Martin - 2011 - Cutadapt removes adapter sequences from high-throu.pdf:application/pdf;Snapshot:/Users/slegras/Library/Application Support/Zotero/Profiles/l2r21qzc.default/zotero/storage/687E34MS/479.html:text/html} +} + +%% Blacklist +@article{amemiya_encode_2019, + title = {The {ENCODE} {Blacklist}: {Identification} of {Problematic} {Regions} of the {Genome}}, + volume = {9}, + copyright = {2019 The Author(s)}, + issn = {2045-2322}, + shorttitle = {The {ENCODE} {Blacklist}}, + url = {https://www.nature.com/articles/s41598-019-45839-z}, + doi = {10.1038/s41598-019-45839-z}, + abstract = {Functional genomics assays based on high-throughput sequencing greatly expand our ability to understand the genome. Here, we define the ENCODE blacklist- a comprehensive set of regions in the human, mouse, worm, and fly genomes that have anomalous, unstructured, or high signal in next-generation sequencing experiments independent of cell line or experiment. The removal of the ENCODE blacklist is an essential quality measure when analyzing functional genomics data.}, + language = {en}, + number = {1}, + urldate = {2021-02-15}, + journal = {Scientific Reports}, + author = {Amemiya, Haley M. and Kundaje, Anshul and Boyle, Alan P.}, + month = jun, + year = {2019}, + note = {Number: 1 +Publisher: Nature Publishing Group}, + pages = {9354}, + file = {Full Text PDF:/Users/slegras/Zotero/storage/86B77NZE/Amemiya et al. 
- 2019 - The ENCODE Blacklist Identification of Problemati.pdf:application/pdf;Snapshot:/Users/slegras/Zotero/storage/ADBG3PBV/s41598-019-45839-z.html:text/html}, +} + +@article{dobin_star:_2013, + title = {{STAR}: ultrafast universal {RNA}-seq aligner}, + volume = {29}, + issn = {1367-4803}, + shorttitle = {{STAR}}, + url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3530905/}, + doi = {10.1093/bioinformatics/bts635}, + abstract = {Motivation: Accurate alignment of high-throughput RNA-seq data is a challenging and yet unsolved problem because of the non-contiguous transcript structure, relatively short read lengths and constantly increasing throughput of the sequencing technologies. Currently available RNA-seq aligners suffer from high mapping error rates, low mapping speed, read length limitation and mapping biases., Results: To align our large ({\textgreater}80 billon reads) ENCODE Transcriptome RNA-seq dataset, we developed the Spliced Transcripts Alignment to a Reference (STAR) software based on a previously undescribed RNA-seq alignment algorithm that uses sequential maximum mappable seed search in uncompressed suffix arrays followed by seed clustering and stitching procedure. STAR outperforms other aligners by a factor of {\textgreater}50 in mapping speed, aligning to the human genome 550 million 2 × 76 bp paired-end reads per hour on a modest 12-core server, while at the same time improving alignment sensitivity and precision. In addition to unbiased de novo detection of canonical junctions, STAR can discover non-canonical splices and chimeric (fusion) transcripts, and is also capable of mapping full-length RNA sequences. Using Roche 454 sequencing of reverse transcription polymerase chain reaction amplicons, we experimentally validated 1960 novel intergenic splice junctions with an 80–90\% success rate, corroborating the high precision of the STAR mapping strategy., Availability and implementation: STAR is implemented as a standalone C++ code. 
STAR is free open source software distributed under GPLv3 license and can be downloaded from http://code.google.com/p/rna-star/., Contact: +dobin@cshl.edu.}, + number = {1}, + journal = {Bioinformatics}, + author = {Dobin, Alexander and Davis, Carrie A. and Schlesinger, Felix and Drenkow, Jorg and Zaleski, Chris and Jha, Sonali and Batut, Philippe and Chaisson, Mark and Gingeras, Thomas R.}, + month = jan, + year = {2013}, + pmid = {23104886}, + pmcid = {PMC3530905}, + pages = {15--21}, + file = {PubMed Central Full Text PDF:/Users/slegras/Zotero/storage/J4M7GM4B/Dobin et al. - 2013 - STAR ultrafast universal RNA-seq aligner.pdf:application/pdf} +} + +%% Samtools +@article{li_sequence_2009, + title = {The {Sequence} {Alignment}/{Map} format and {SAMtools}}, + volume = {25}, + issn = {1367-4803}, + url = {http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2723002/}, + doi = {10.1093/bioinformatics/btp352}, + abstract = {Summary: The Sequence Alignment/Map (SAM) format is a generic alignment format for storing read alignments against reference sequences, supporting short and long reads (up to 128 Mbp) produced by different sequencing platforms. It is flexible in style, compact in size, efficient in random access and is the format in which alignments from the 1000 Genomes Project are released. 
SAMtools implements various utilities for post-processing alignments in the SAM format, such as indexing, variant caller and alignment viewer, and thus provides universal tools for processing read alignments., Availability: http://samtools.sourceforge.net, Contact: rd@sanger.ac.uk}, + number = {16}, + journal = {Bioinformatics}, + author = {Li, Heng and Handsaker, Bob and Wysoker, Alec and Fennell, Tim and Ruan, Jue and Homer, Nils and Marth, Gabor and Abecasis, Goncalo and Durbin, Richard}, + month = aug, + year = {2009}, + pmid = {19505943}, + pmcid = {PMC2723002}, + pages = {2078--2079}, + file = {PubMed Central Full Text PDF:/Users/slegras/Library/Application Support/Zotero/Profiles/l2r21qzc.default/zotero/storage/RS8TNV6R/Li et al. - 2009 - The Sequence AlignmentMap format and SAMtools.pdf:application/pdf} +} + +@Article{chipseeker, + title = {ChIPseeker: an R/Bioconductor package for ChIP peak annotation, comparison and visualization}, + author = {Guangchuang Yu and Li-Gen Wang and Qing-Yu He}, + journal = {Bioinformatics}, + year = {2015}, + volume = {31}, + number = {14}, + pages = {2382-2383}, + pmid = {25765347}, + doi = {10.1093/bioinformatics/btv145}, + } + +@Article{clusterprofiler, + title = {clusterProfiler: an R package for comparing biological themes among gene clusters}, + author = {Guangchuang Yu and Li-Gen Wang and Yanyan Han and Qing-Yu He}, + journal = {OMICS: A Journal of Integrative Biology}, + year = {2012}, + volume = {16}, + number = {5}, + pages = {284-287}, + pmid = {22455463}, + doi = {10.1089/omi.2011.0118}, +} diff --git a/2024/ebaiin1/chip-seq/images/1_GEO.png b/2024/ebaiin1/chip-seq/images/1_GEO.png new file mode 100644 index 0000000..72eb29b Binary files /dev/null and b/2024/ebaiin1/chip-seq/images/1_GEO.png differ diff --git a/2024/ebaiin1/chip-seq/images/1_SRA.png b/2024/ebaiin1/chip-seq/images/1_SRA.png new file mode 100644 index 0000000..0b7e27e Binary files /dev/null and b/2024/ebaiin1/chip-seq/images/1_SRA.png differ diff 
--git a/2024/ebaiin1/chip-seq/images/2_GEO.png b/2024/ebaiin1/chip-seq/images/2_GEO.png new file mode 100644 index 0000000..5acaec6 Binary files /dev/null and b/2024/ebaiin1/chip-seq/images/2_GEO.png differ diff --git a/2024/ebaiin1/chip-seq/images/2_SRA.png b/2024/ebaiin1/chip-seq/images/2_SRA.png new file mode 100644 index 0000000..698434e Binary files /dev/null and b/2024/ebaiin1/chip-seq/images/2_SRA.png differ diff --git a/2024/ebaiin1/chip-seq/images/3.1_ENA.png b/2024/ebaiin1/chip-seq/images/3.1_ENA.png new file mode 100644 index 0000000..216f0d1 Binary files /dev/null and b/2024/ebaiin1/chip-seq/images/3.1_ENA.png differ diff --git a/2024/ebaiin1/chip-seq/images/3_GEO.png b/2024/ebaiin1/chip-seq/images/3_GEO.png new file mode 100644 index 0000000..c767a90 Binary files /dev/null and b/2024/ebaiin1/chip-seq/images/3_GEO.png differ diff --git a/2024/ebaiin1/chip-seq/images/3_SRA.png b/2024/ebaiin1/chip-seq/images/3_SRA.png new file mode 100644 index 0000000..3837483 Binary files /dev/null and b/2024/ebaiin1/chip-seq/images/3_SRA.png differ diff --git a/2024/ebaiin1/chip-seq/images/4_EBI.png b/2024/ebaiin1/chip-seq/images/4_EBI.png new file mode 100644 index 0000000..3b4db5b Binary files /dev/null and b/2024/ebaiin1/chip-seq/images/4_EBI.png differ diff --git a/2024/ebaiin1/chip-seq/images/5_EBI.png b/2024/ebaiin1/chip-seq/images/5_EBI.png new file mode 100644 index 0000000..4a11514 Binary files /dev/null and b/2024/ebaiin1/chip-seq/images/5_EBI.png differ diff --git a/2024/ebaiin1/chip-seq/images/6_Genomes.png b/2024/ebaiin1/chip-seq/images/6_Genomes.png new file mode 100644 index 0000000..e7ffe51 Binary files /dev/null and b/2024/ebaiin1/chip-seq/images/6_Genomes.png differ diff --git "a/2024/ebaiin1/chip-seq/images/Capture d\342\200\231\303\251cran 2017-11-10 \303\240 12.35.33.png" "b/2024/ebaiin1/chip-seq/images/Capture d\342\200\231\303\251cran 2017-11-10 \303\240 12.35.33.png" new file mode 100644 index 0000000..0608775 Binary files /dev/null and 
"b/2024/ebaiin1/chip-seq/images/Capture d\342\200\231\303\251cran 2017-11-10 \303\240 12.35.33.png" differ diff --git "a/2024/ebaiin1/chip-seq/images/Capture d\342\200\231\303\251cran 2017-11-10 \303\240 12.36.08.png" "b/2024/ebaiin1/chip-seq/images/Capture d\342\200\231\303\251cran 2017-11-10 \303\240 12.36.08.png" new file mode 100644 index 0000000..59be324 Binary files /dev/null and "b/2024/ebaiin1/chip-seq/images/Capture d\342\200\231\303\251cran 2017-11-10 \303\240 12.36.08.png" differ diff --git "a/2024/ebaiin1/chip-seq/images/Capture d\342\200\231\303\251cran 2017-11-10 \303\240 13.10.02.png" "b/2024/ebaiin1/chip-seq/images/Capture d\342\200\231\303\251cran 2017-11-10 \303\240 13.10.02.png" new file mode 100644 index 0000000..8ad3dfd Binary files /dev/null and "b/2024/ebaiin1/chip-seq/images/Capture d\342\200\231\303\251cran 2017-11-10 \303\240 13.10.02.png" differ diff --git "a/2024/ebaiin1/chip-seq/images/Capture d\342\200\231\303\251cran 2017-11-10 \303\240 13.10.14.png" "b/2024/ebaiin1/chip-seq/images/Capture d\342\200\231\303\251cran 2017-11-10 \303\240 13.10.14.png" new file mode 100644 index 0000000..47988a8 Binary files /dev/null and "b/2024/ebaiin1/chip-seq/images/Capture d\342\200\231\303\251cran 2017-11-10 \303\240 13.10.14.png" differ diff --git a/2024/ebaiin1/chip-seq/images/IGVbam.png b/2024/ebaiin1/chip-seq/images/IGVbam.png new file mode 100644 index 0000000..02a8528 Binary files /dev/null and b/2024/ebaiin1/chip-seq/images/IGVbam.png differ diff --git a/2024/ebaiin1/chip-seq/images/fastqc.png b/2024/ebaiin1/chip-seq/images/fastqc.png new file mode 100644 index 0000000..5e9810a Binary files /dev/null and b/2024/ebaiin1/chip-seq/images/fastqc.png differ diff --git a/2024/ebaiin1/chip-seq/images/jupyterHub_profile.png b/2024/ebaiin1/chip-seq/images/jupyterHub_profile.png new file mode 100644 index 0000000..68dd2a7 Binary files /dev/null and b/2024/ebaiin1/chip-seq/images/jupyterHub_profile.png differ diff --git 
a/2024/ebaiin1/chip-seq/images/jupyterlabForm.png b/2024/ebaiin1/chip-seq/images/jupyterlabForm.png new file mode 100644 index 0000000..3aa40ce Binary files /dev/null and b/2024/ebaiin1/chip-seq/images/jupyterlabForm.png differ diff --git a/2024/ebaiin1/chip-seq/images/launchJupyterhub.png b/2024/ebaiin1/chip-seq/images/launchJupyterhub.png new file mode 100644 index 0000000..8a7213b Binary files /dev/null and b/2024/ebaiin1/chip-seq/images/launchJupyterhub.png differ diff --git a/2024/ebaiin1/chip-seq/images/launchRstudio.png b/2024/ebaiin1/chip-seq/images/launchRstudio.png new file mode 100644 index 0000000..887770c Binary files /dev/null and b/2024/ebaiin1/chip-seq/images/launchRstudio.png differ diff --git a/2024/ebaiin1/chip-seq/images/selectJupyterLab.png b/2024/ebaiin1/chip-seq/images/selectJupyterLab.png new file mode 100644 index 0000000..9512a19 Binary files /dev/null and b/2024/ebaiin1/chip-seq/images/selectJupyterLab.png differ