-
Notifications
You must be signed in to change notification settings - Fork 0
/
bioconductor2.R
3688 lines (2218 loc) · 96.3 KB
/
bioconductor2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#----------------------### 01 SHOWING UP : Biostrings exercises###----------------------------------------------------
### exercise from --> https://www.r-exercises.com/2017/01/22/bioinformatics-lessons-in-r-1-biostrings-exercises/
myDNA <- c("ATGTTGCATTCATTAATTAAGAACGACCCAATACA")
myDNA
mySequencing <- c("CTGATTT-GATGGTC-NAT")
mySequencing
length(mySequencing)
myDNASeq <- DNAString("CTGATTT-GATGGTC-NAT")
myDNASeq
length(myDNASeq)
class(myDNA)
class(myDNASeq)
pastedDNA<- paste(myDNA, myDNASeq)
class(pastedDNA)
DNAString(pastedDNA) # it doesn't work
#But here :
pastedDNA<- paste(myDNA, myDNASeq, sep = "")
DNAString(pastedDNA) # it works
myDNASeq
complement(myDNASeq)
reverse(myDNASeq)
reverseComplement(myDNASeq)
alphabetFrequency(myDNASeq)
translate(myDNASeq)
#__https://www.r-exercises.com/2017/05/28/manipulate-biological-data-using-biostrings-package-exercises-part-2/
# Sequence Alignment Techniques:
#Sequence Alignment is comparing the similarity between the sequences to check how much the DNA,RNA or Protein are related to each other.
#There are three types of Sequence Alignment:
#1. Global Alignment
#2. Local Alignment
#3. Overlap Alignment
lsf.str("package:Biostrings")
myd<- DNAString("CTGATTTCA")
complement(myd)
myr<- RNAString("CUGAUUUCA")
myr
reverseComplement(myd)
translate(RNAString("CUGAUUUCA"))
abc <- translate(RNAString("CUGAUUUCA"))
#Print the three letter of each amino acid:
Amino<-AMINO_ACID_CODE[strsplit(as.character(abc), NULL)[[1]]]
Amino
#N.B: strsplit() splits the sequences in a set of sequences according to a
#pattern.
DNAString(myDNA)
translate(DNAString(myDNA))
AMINO_ACID_CODE[strsplit(as.character(translate(DNAString(myDNA))), NULL)[[1]]]
# YOU'RE KILLING IT MAN ! KEEP GRINDING ! IT'LL BE WORTH IT !
# Frequency of each amino acid:
table(AMINO_ACID_CODE[strsplit(as.character(translate(DNAString(myDNA))), NULL)[[1]]] )
# Better visualization of that :
plot( table(AMINO_ACID_CODE[strsplit(as.character(translate(DNAString(myDNA))), NULL)[[1]]] ) )
# Global Alignment technique:
d1 <- DNAString("ATCGGAAGTT")
d2 <- DNAString("AGCTGAACCG")
PairwiseAlignmentsSingleSubject(d1, d2)
# Global Alignment technique after specifying your own score for match and mismatch among the sequence:
d1 <- DNAString("ATCGGAAGTT")
d2 <- DNAString("AGCTGAACCG")
matrix <- nucleotideSubstitutionMatrix(match = 1, mismatch = -3, baseOnly = TRUE)
pairwiseAlignment(d1, d2,substitutionMatrix = matrix)
# Local Alignment technique:
d1 <- DNAString("ATCGGAAGTT")
d2 <- DNAString("AGCTGAACCG")
matrix <- nucleotideSubstitutionMatrix(match = 1, mismatch = -3)
pairwiseAlignment(d1, d2,substitutionMatrix = matrix, type="local")
# Overlap Alignment technique
d1 <- DNAString("ATCGGAAGTT")
d2 <- DNAString("AGCTGAACCG")
matrix <- nucleotideSubstitutionMatrix(match = 1, mismatch = -3)
pairwiseAlignment(d1, d2,substitutionMatrix = matrix,type="overlap")
#-------------------------------------------------------------------------------------
# Exercise from --> https://compgenomr.github.io/book/exercises-4.html
library(rtracklayer)
library(AnnotationHub)
ahub <- AnnotationHub()
ahub
# Get hold of CpG islands from chr12 mouse mm9
fil <- subset(ahub, species == "Mus musculus")
fil
fjl <- query(fil, c("mm9", "UCSC", "CpG islands"))
fjl
CpG <- fjl[["AH6117"]] #Granges
CpG
seqlevels(CpG)
CpG_chr12 <- keepSeqlevels(CpG, "chr12",pruning.mode = "coarse")
CpG_chr12
mean(width(CpG_chr12))
range(width(CpG_chr12))
# Get hold of RefSeq genes from chr12 mouse mm9
fkl <- query(fil, c("mm9", "UCSC", "refGene"))
fkl
Refseq <- fkl[["AH6090"]]
Refseq
seqlevels(Refseq)
Rfsq_chr12 <- keepSeqlevels(Refseq, "chr12",pruning.mode = "coarse")
Rfsq_chr12
args(promoters)
prrfsq <- promoters(Rfsq_chr12, 1000, 1000 )
prrfsq
subsetByOverlaps(prrfsq, CpG_chr12)
# 2271 promoters that overlap CpG
length((subsetByOverlaps(prrfsq, CpG_chr12))$name)
length (prrfsq) # 5198 promoters
length (CpG_chr12) # 627 CpG
# Percentage of promoters which overlap with CpG islands
2271/5198 #= 0.44 = 44 %
subsetByOverlaps(CpG_chr12, prrfsq)
# Percentage of CpG islands overlap with promoters
a <- 425/627
b<- round(a, 2)
c <- b*100
c # 68 %
CpGoverlap <- subsetByOverlaps(CpG_chr12, prrfsq)
CpGoverlap
width(CpGoverlap)
plot( width(CpGoverlap))
#---- 02 SHOWING UP : Enrichment CpG, Promotors, TSS...---------------------------------
BiocManager::install("genomation")
library(genomation)
library("AnnotationHub")
ahub = AnnotationHub()
#______How transcription factor binding sites can be annotated using CpG islands
qhub = query(ahub, c("wgEncodeHaibTfbsGm12878Sp1Pcr1xPkRep1.broadPeak.gz"))
qhub
peaks <- qhub[["AH24939"]]
peaks # = TF binding sites / = transcription start sites (TSS)
# How far are promoters from TSS: Promoters are upstream of TSS.
# The majority of mammalian gene promoters are encompassed within regions
#of the genome called CpG islands that have an elevated level of
#non-methylated CpG dinucleotides.
# In vertebrates, DNA is pervasively methylated at CG dinucleotides, a modification
#that is repressive to transcription. However, approximately 70% of vertebrate gene
#promoters are associated with CpG islands (CGIs) that are refractory
#to DNA methylation.
peaks21 <- keepSeqlevels(peaks, "chr21",pruning.mode = "coarse")
peaks21
# 620 peaks (TSS)
qhub2 = query(ahub, c("cpgi", "hg19")) # Get CpG
qhub2
cpg <- qhub2[["AH5086"]]
cpg
cpg21 <- keepSeqlevels(cpg, "chr21",pruning.mode = "coarse")
cpg21
# 365 CpG
subsetByOverlaps(peaks21, cpg21)
subsetByOverlaps(cpg21, peaks21)
# 55 :
# 15% of CpG
# ~9 % of peaks
length(subsetByOverlaps(cpg21, peaks21, ignore.strand = TRUE))/ length(cpg21)
# 15%
length(subsetByOverlaps(peaks21, cpg21, ignore.strand = TRUE))/ length(peaks21)
# ~ 9%
# Check for each peak if it overlaps with a CpG:
countOverlaps(peaks21, cpg21)
#N.B. tabulates the number of overlaps for each element in the query.
# Doing it the other way around
countOverlaps(cpg21, peaks21)
# see one-to-one overlaps between peaks and CpG islands.
#It returns a matrix showing which peak overlaps which CpG island.
Overl <- findOverlaps(peaks21, cpg21)
Overl
mean(width(peaks21)) # 685
range(width(peaks21)) # 216 4319
mean(width(cpg21)) # 734
range(width(cpg21)) # 202 5698
which( (countOverlaps(peaks21, cpg21)) >= 1) # queryHits in overl
which( (countOverlaps(cpg21, peaks21)) >= 1) # subjectHits in overl
sum(countOverlaps(cpg21, peaks21)) #56 overlaps
sum(countOverlaps(peaks21, cpg21)) #56
which( (countOverlaps(peaks21,cpg21)) == 2) #462th position
# look at [9] & [10] : the same peak (n°462) overlaps with 2 different CpG.
Overl[15:25]
peaks21[462]
width(peaks21[462]) # 1062 bp
cpg21[93:94]
width(cpg21[93:94]) # 381 bp ; 969 bp
# " peak14901 " overlap with " CpG:_33 " and " CpG:_105 ".
# There's 56 overlaps but why subsetByOverlaps() shows only 55 ?
# Is because :
#subsetByOverlaps() returns a subsetted query containing only those
#elements overlapped by at least one element in subject
cpg21[12]
which(cpg21$name== "CpG:_33") #58 (93) 320
which(cpg21$name== "CpG:_105") # 94
length(unique(cpg21$name)) # Only 141 CpG !!!! /365
cpg21[c(58, 93, 320)]
length(unique(peaks21$name)) # 620 /620
cpg21[12] # why do we have 2 overlaps here??
Overl[4:20] # CpG in the 12th position overlaps with 2 different peaks
which(countOverlaps(cpg21, peaks21) == 2 ) # 12
width(cpg21[12]) # 487 bp
width(peaks[250:251]) # 397 bp ; 707 bp
#*** find nearest CpG to each peak (TSS):
nearest(peaks21, cpg21)
distanceToNearest(peaks21, cpg21)
distanceToNearest(peaks21, cpg21)[462] # distance O !
###__________________ TSS and CpG for autosomes :
TSS <- peaks
CpG <- cpg
granges(TSS)$chr
TSS_aut <- keepSeqlevels( TSS, (paste0("chr", 1:22)), pruning.mode = "coarse" )
TSS_aut
table(seqnames(TSS_aut))
CpG_aut <- keepSeqlevels( CpG, (paste0("chr", 1:22)), pruning.mode = "coarse" )
CpG_aut
fovr <- findOverlaps(TSS_aut, CpG_aut)
fovr # 5207 overlaps in total
sovr <- subsetByOverlaps(TSS_aut,CpG_aut , ignore.strand = TRUE)
sovr # 5156 of TSS & 4938 of CpG
which (countOverlaps(TSS_aut, CpG_aut) == 2) # 51 ( max overlaps = 2 )
# 5156 + 51 = 5207
length(subsetByOverlaps(TSS_aut, CpG_aut, ignore.strand = TRUE))/ length(TSS_aut)
# 20 % of TSS have a CpG
length(subsetByOverlaps(CpG_aut, TSS_aut, ignore.strand = TRUE))/ length(CpG_aut)
# 18 % of CpG are located inside TSS
# Is there any significant relationship between the two :
inOut <- matrix(0, ncol= 2, nrow = 2)
colnames(inOut) <- c("in", "out")
rownames(inOut) <- c("in", "out")
inOut
inOut[1,1] <- sum(width(intersect(TSS_aut, CpG_aut, ignore.strand = TRUE)))
inOut[1,2] <- sum(width(setdiff(TSS_aut,CpG_aut , ignore.strand = TRUE)))
inOut[2,1] <- sum(width(setdiff(CpG_aut, TSS_aut , ignore.strand = TRUE)))
inOut
inOut[2,2] = 3*10^9 - sum(inOut)
inOut
fisher.test(inOut)$statistic #ERROR
oddsRatio = inOut[1,1] * inOut[2,2] / (inOut[2,1] * inOut[1,2] )
oddsRatio # = 26 : (>1) there's a strong relationship = significant enrichment.
# FROM LITERATURE:
# CpG islands typically occur at or near the transcription start site of genes.
# CpG islands often include transcription initiation sites and display 'active' histone marks,
#notably histone H3 lysine 4 methylation ( H3K4me3 ).
###_____________ How promoters of refseq genes are enriched in these CpG islands ?
#1)
cpg
qhub3 = query(ahub, "RefSeq")
qhub3
genes= qhub3[[1]]
genes # SRY gene: NM_003140 (human) & NM_011564 (mouse) from UCSC
which(genes$name== "NM_003140" ) # WOOW ---> 24792
genes[24792 ]
width(genes[24792 ]) # 887 bp
seqlevels(cpg)
seqlevels(genes)
seqlevels(cpg) == seqlevels(genes) # We have the same seqlevels.
cpgstX <- keepStandardChromosomes(cpg, pruning.mode = "coarse")
cpgstX
genesstX <- keepStandardChromosomes(genes, pruning.mode = "coarse")
genesstX
seqlengths(cpgstX)/10^6
seqlengths(genesstX)/10^6
# Remove mitDNA
cpg_1Y <- dropSeqlevels(cpgstX, "chrM", pruning.mode = "coarse" )
cpg_1Y
seqlengths(cpg_1Y)
genes_1Y <- dropSeqlevels(genesstX, "chrM", pruning.mode = "coarse" )
genes_1Y
seqlengths(genes_1Y)
# N.B:
# chromosome M (chrM) = Mitochondrial DNA = 16500 base pairs
# ChrUn contains clone contigs that cannot
#be confidently placed on a specific chromosome.
width(cpg_1Y)
range(width(cpg_1Y)) # 201 bp ; 45712 bp
mean(width(cpg_1Y)) # 763 bp
width(genes_1Y)
range(width(genes_1Y)) # 20 bp ; 2320934 bp
mean(width(genes_1Y)) # 58336 bp
which(width(genes_1Y) == 20)
(genes_1Y)[c(17970, 18654, 18655, 29283, 32890)] #genes with only 20 bp
# How many transcripts we've got here ?
length(genes_1Y$name) # 47460 transcripts
length(unique(genes_1Y$name)) # Only 46308 genes ( - 1152 )
# Find out which genes have several transcripts:
# how many transcripts in all genes ?
table(table(genes_1Y$name ))
# Here are the 500 genes who have 2 different transcripts:
length(which(table(genes_1Y$name ) == 2 ) )
# e.g., this gene has 2 transcripts located in 13191 and 13199
which((genes_1Y$name) == "NM_000344" )
# We're gonna ask if these Promoters are enriched in CpG
length( promoters(genes_1Y) ) # 47,460 total promoters
length( cpg_1Y ) # 27,718 total CpG
prom_gen1Y <- promoters(genes_1Y)
prom_gen1Y
fov <- findOverlaps(prom_gen1Y, cpg_1Y)
fov # 30,973 overlaps
cov <- countOverlaps(prom_gen1Y, cpg_1Y)
cov
sum(cov) # 30,973 overlaps
(queryHits(fov))[ 561]
cov[561]
prom_gen1Y[762]
sov <- subsetByOverlaps(prom_gen1Y, cpg_1Y )
sov #29534
# Nbr of promoters that have a CpG in them :
length(unique(queryHits(fov))) # 29534 promoters
29534/ 47460*100
# 62 % of promoters have a CpG in them.
#** That makes sense 'cause:
#In humans, about 70% of promoters located near the TSS
# of a gene (proximal promoters) contain a CpG island.
# number of CpG that have a promoter in them.
length(unique(subjectHits(fov))) # 14077 CpG
14077/ 27718*100 # 50 % of CpG have a promoter in them.
# How many Megabases do the promoters call ?
sum(width(reduce(prom_gen1Y, ignore.strand = TRUE)))
# 59.44159 Mbp
# How many Megabases do the CpGs call ?
sum(width(reduce(cpg_1Y, ignore.strand = TRUE)))
# 21.15239 Mbp
# How big is the overlap ?
sum(width(intersect(prom_gen1Y, cpg_1Y, ignore.strand = TRUE)))/ 10^6
# 8.392124 Mbp
distanceToNearest(prom_gen1Y, cpg_1Y)
#explain that sh*t :
sum(width(prom_gen1Y)) /10^6 # 104.412 Mbp !!!!
sum(width(reduce(prom_gen1Y)))/10^6 # 61.23916 Mbp !!!!
# is 'cause some genes have many transcripts.
### THE RIGHT WAY TO DO IT :
sum(width(reduce(prom_gen1Y, ignore.strand = TRUE)))/10^6 # 59.44159 Mbp !!!!
#N.B.:
# ignore.strand = TRUE allows elements on the + strand
#to overlap elements on the - strand.
#ignore.strand When set to TRUE, the strand information is ignored in the overlap calculations.
# For set operations: If set to TRUE, then the strand of x and y is set to "*" prior to any computation.
# For parallel set operations: If set to TRUE, the strand information is ignored in the computation and the result has the strand information of x.
#Compute the Odd ratio:
# Is a measure of association. It quantifies the relationship.
inOut <- matrix(0, ncol= 2, nrow = 2)
colnames(inOut) <- c("in", "out")
rownames(inOut) <- c("in", "out")
inOut
inOut[1,1] <- sum(width(intersect(prom_gen1Y, cpg_1Y, ignore.strand = TRUE)))
inOut[1,2] <- sum(width(setdiff(prom_gen1Y,cpg_1Y , ignore.strand = TRUE)))
inOut[2,1] <- sum(width(setdiff(cpg_1Y, prom_gen1Y , ignore.strand = TRUE)))
inOut
inOut[2,2] = 3*10^9 - sum(inOut)
inOut
fisher.test(inOut)$statistic #ERROR
oddsRatio = inOut[1,1] * inOut[2,2] / (inOut[2,1] * inOut[1,2] )
oddsRatio # = 37 ; (>1) there's a significant enrichment.
##_____________________ what about CpG islands & H3K27me3 ???
#
# H3K4me1 and H3K27ac are associated with enhancer regions;
# H3K4me3 is associated with promoter regions;
# H3K36me3 is associated with transcribed regions in gene bodies;
# H3K27me3 is associated with Polycomb repression;
# H3K9me3 is associated with heterochromatin.
#
cpg_1Y
ahub = subset(ahub, species == "Homo sapiens")
qhub4 = query(ahub, c("H3K27me3")) # Get H3K27me3
qhub4
H3K27me3 <- qhub4[[ "AH23223" ]]
H3K27me3
table(seqlevels(H3K27me3))
seqlevels(H3K27me3)
seqlengths(H3K27me3)
hkm27st <- keepStandardChromosomes(H3K27me3, pruning.mode = "coarse" )
hkm27st
length(table(seqlevels(hkm27st )))
hkm27.1Y <-dropSeqlevels(hkm27st, "chrM", pruning.mode = "coarse" )
hkm27.1Y
width(hkm27.1Y)
range(width( hkm27.1Y )) # min 100 max 4611210
length(table(seqlevels(hkm27.1Y)))
length( cpg_1Y ) # 27 718 total CpGs
length( hkm27.1Y ) # 40 693 total histone marks
ovf<- findOverlaps(cpg_1Y, hkm27.1Y )
ovf # 7996 overlaps
ovc <- countOverlaps( cpg_1Y, hkm27.1Y )
ovc
ovs <- subsetByOverlaps(cpg_1Y, hkm27.1Y , ignore.strand = TRUE )
ovs
length(unique(queryHits(ovf))) # 7588 cpgs have that histone mark
length(unique(subjectHits(ovf))) # 3810 histone marks are within CpG
length(subsetByOverlaps(cpg_1Y, hkm27.1Y, ignore.strand = TRUE))/ length(cpg_1Y)
# 27 % of these CpG have a histone mark within them.
length(subsetByOverlaps(hkm27.1Y, cpg_1Y , ignore.strand = TRUE))/ length(hkm27.1Y)
# 9 % of these histone marks are located within CpG.
# How many Megabases do the histone marks call ?
sum(width(reduce(hkm27.1Y, ignore.strand = TRUE)))/ 10^6
# 861.3961 Mbp
# How many Megabases do the CpGs call ?
sum(width(reduce(cpg_1Y, ignore.strand = TRUE)))/10^6
# 21.15239 Mbp
# How big is the overlap ?
sum(width(intersect(hkm27.1Y, cpg_1Y, ignore.strand = TRUE)))/ 10^6
# 5.86855 Mbp
#Compute the Odd ratio:
# Is a measure of association. It quantifies the relationship.
inOut <- matrix(0, ncol= 2, nrow = 2)
colnames(inOut) <- c("in", "out")
rownames(inOut) <- c("in", "out")
inOut
inOut[1,1] <- sum(width(intersect(hkm27.1Y, cpg_1Y, ignore.strand = TRUE)))
inOut[1,2] <- sum(width(setdiff(hkm27.1Y,cpg_1Y , ignore.strand = TRUE)))
inOut[2,1] <- sum(width(setdiff(cpg_1Y , hkm27.1Y, ignore.strand = TRUE)))
inOut
inOut[2,2] = 3*10^9 - sum(inOut)
inOut
fisher.test(inOut)$statistic #ERROR
oddsRatio = inOut[1,1] * inOut[2,2] / (inOut[2,1] * inOut[1,2] )
oddsRatio # = 0.95 ; (< 1) weak association = no significant enrichment.
# H3K27me3-only promoters are CpG poor (from literature).
##_____________________ How about prom_gen1Y & hkm27.1Y ???
length(subsetByOverlaps(prom_gen1Y, hkm27.1Y, ignore.strand = TRUE))/ length(prom_gen1Y)
# 28 %
length(subsetByOverlaps(hkm27.1Y, prom_gen1Y, ignore.strand = TRUE))/ length(hkm27.1Y)
# 12 %
inOut <- matrix(0, ncol= 2, nrow = 2)
colnames(inOut) <- c("in", "out")
rownames(inOut) <- c("in", "out")
inOut
inOut[1,1] <- sum(width(intersect(hkm27.1Y, prom_gen1Y, ignore.strand = TRUE)))
inOut[1,2] <- sum(width(setdiff(hkm27.1Y,prom_gen1Y , ignore.strand = TRUE)))
inOut[2,1] <- sum(width(setdiff(prom_gen1Y , hkm27.1Y, ignore.strand = TRUE)))
inOut
inOut[2,2] = 3*10^9 - sum(inOut)
inOut
oddsRatio = inOut[1,1] * inOut[2,2] / (inOut[2,1] * inOut[1,2] )
oddsRatio # = 0.93 ; (< 1) weak association = no significant enrichment.
##_____________________ What about prom_gen1Y & H3K4me3 ???
prom_gen1Y
ahub = subset(ahub, species == "Homo sapiens")
qhub5 = query(ahub, c("H3K4me3", "Gm12878"))
qhub5
H3K4me3 <- qhub5[[1]]
H3K4me3
table(seqlevels(H3K4me3 ))
H3K4me3.aut <- keepSeqlevels(H3K4me3, paste0("chr", 1:22), pruning.mode = "coarse" )
H3K4me3.aut
length(subsetByOverlaps(prom_gen1Y, H3K4me3.aut, ignore.strand = TRUE))/ length(prom_gen1Y)
# 61 %
length(subsetByOverlaps(H3K4me3.aut, prom_gen1Y, ignore.strand = TRUE))/ length(H3K4me3.aut)
# 26 %
inOut <- matrix(0, ncol= 2, nrow = 2)
colnames(inOut) <- c("in", "out")
rownames(inOut) <- c("in", "out")
inOut
inOut[1,1] <- sum(width(intersect(H3K4me3.aut, prom_gen1Y, ignore.strand = TRUE)))
inOut[1,2] <- sum(width(setdiff(H3K4me3.aut,prom_gen1Y , ignore.strand = TRUE)))
inOut[2,1] <- sum(width(setdiff(prom_gen1Y , H3K4me3.aut, ignore.strand = TRUE)))
inOut
inOut[2,2] = 3*10^9 - sum(inOut)
inOut
oddsRatio = inOut[1,1] * inOut[2,2] / (inOut[2,1] * inOut[1,2] )
oddsRatio # = 14.5 ; (> 1) Strong association = significant enrichment.
##_____________________ what about CpG islands & H3K4me3 ???
length(subsetByOverlaps(cpg_1Y, H3K4me3.aut, ignore.strand = TRUE))/ length(cpg_1Y)
# 51 %
length(subsetByOverlaps(H3K4me3.aut, cpg_1Y, ignore.strand = TRUE))/ length(H3K4me3.aut)
# 26 %
inOut <- matrix(0, ncol= 2, nrow = 2)
colnames(inOut) <- c("in", "out")
rownames(inOut) <- c("in", "out")
inOut
inOut[1,1] <- sum(width(intersect(H3K4me3.aut, cpg_1Y, ignore.strand = TRUE)))
inOut[1,2] <- sum(width(setdiff(H3K4me3.aut,cpg_1Y , ignore.strand = TRUE)))
inOut[2,1] <- sum(width(setdiff(cpg_1Y , H3K4me3.aut, ignore.strand = TRUE)))
inOut
inOut[2,2] = 3*10^9 - sum(inOut)
inOut
oddsRatio = inOut[1,1] * inOut[2,2] / (inOut[2,1] * inOut[1,2] )
oddsRatio # = 22 ; ( >1) Strong association = significant enrichment.
#----03 SHOWING UP : Sequence Matching and Aligning using Biostrings------------
# BioConductor contains a number of string matching/pairwise
#alignment tools in the Biostrings package that can be invaluable
#in answering complex scientific questions.
dnaseq <- DNAString("ACGTACGT")
dnaseq2 <- DNAString("AGGTATCGT")
pairwiseAlignment(dnaseq, dnaseq2) # by default: type= "global"
pairwiseAlignment(dnaseq, dnaseq2, type= "local")
pairwiseAlignment(dnaseq, dnaseq2, scoreOnly= T ) #Get only the score.
pairwiseAlignment(dnaseq,Scerevisiae$chrV, type= "local")
pairwiseAlignment(dnaseq,Scerevisiae$chrI)
# generate a set of 16 sequences in order to map them to a sequence in Scerevesiae genome.
which(LETTERS== "A") #1
which(LETTERS== "C") # 3
which(LETTERS== "G") #7
which(LETTERS== "T") # 20
nuc <- LETTERS[c(1,3,7,20 )]
sample(nuc, 8, replace= T )
paste( sample(nuc, 8, replace= T ) , collapse="")
# A much quicker way to do it:
sample (DNA_BASES, 8, replace= T ) # create a generator of sequences
paste( sample (DNA_BASES, 8, replace= T ) , collapse="")
# BAM ! Here are my sequences :
setseq <- DNAStringSet( )
for (i in 1:16) {
setseq[i] <- paste( sample (DNA_BASES, 8, replace= T ) , collapse="")
}
setseq
# Alignment of these sequences to ScScerevisiae genome:
pairwiseAlignment( c( "ACAB","ABAC" ), "ACBA ") # If u wanna play with it !
pairwiseAlignment( setseq, Scerevisiae$chrI )
#About pairwiseAlignment():
# It aligns one or more strings specified in the pattern argument with a single
# string specified in the subject argument.
# It returns only the first occurrence of the best scoring alignment.
# If more than one pairwise alignment has the maximum alignment score exists,
# the first alignment along the subject is returned.
pairwiseAlignment( setseq, Scerevisiae$chrM, type= "local" )
vmatchPattern(setseq, Scerevisiae$chrI)
find.package("GenomicAlignments")
library("GenomicAlignments") # Rank 26 / 2140
BiocManager::install("msa") # Multiple Sequence Alignment
library("msa") # Rank 161 / 2140
# Load amino acid sequences from one of the example files that are supplied
# with the msa package:
mySequenceFile <- system.file("examples", "exampleAA.fasta", package="msa")
mySequences <- readAAStringSet(mySequenceFile)
mySequences
?readAAStringSet() # it contains a filepath
myFirstAlignment <- msa(mySequences) # runs ClustalW with default parameters
print(myFirstAlignment, show="complete")
args(msa)
# Alignment of our 16 seq. :
msa(setseq) # (ClustalW) # woow, that's awesome !
msa(setseq, "ClustalOmega") # a newer version of CLUSTALW
msa(setseq, "Muscle") # MUltiple Sequence Comparison by Log- Expectation
# we've got 3 different consensus sequence.
# Consensus Sequence: In a Nucleotide or an amino acid sequence,
#each base pair (an amino acid or a nucleotide) may occur more
#frequently at a particular region in different sequences of nature.
seqcer <- Scerevisiae$chrI[1:23 ]
# hint:
DNAStringSet(seqcer)
# Finally I managed to align my sequences to cerevisiae genome:
msa(c(setseq, DNAStringSet(seqcer) ) )
msa(c(setseq, DNAStringSet(Scerevisiae$chrI) ) ) # It takes ages !
# Get back consensus sequences
msaConsensusSequence( msa(c(setseq, DNAStringSet(seqcer) ) ) )
consensusString( msa(c(setseq, DNAStringSet(seqcer) ) ) )
# The main purpose of consensus sequences is to get an impression of
#conservation at individual positionsof a multiple alignment.
msaConsensusSequence(msa(c(setseq, DNAStringSet(Scerevisiae$chrI) ) ))
msa(setseq)
msaConsensusSequence(msa(setseq) )
consensusString(msa(setseq) ) # returns some arcane letters.
###--------04 SHOWING UP : Biostrings + Alignment ------------------------------
# Saccharomyces cerevisiae
letterFrequency(Scerevisiae$chrI, "GC", as.prob= TRUE)*100
sum(seqlengths(Scerevisiae)) # 12.163 Mbp = genome size
# Note:
#The term "megabase" (Mb) is commonly used inter-changeably with megabase pair (Mbp),
#although strictly this would refer to a single-stranded nucleic acid.
seqnames(Scerevisiae) # 16 chromosomes
seqlengths( Scerevisiae ) /10^6
alphabetFrequency(Scerevisiae$chrI)/10^6
#Gettting the GC content across the entire genome :
param <- new("BSParams", X = Scerevisiae, FUN = letterFrequency)
bsapply(param, "GC")
unlist(bsapply(param, "GC")
sum(unlist(bsapply(param, "GC")) / sum(seqlengths(Scerevisiae)))
# ~ 0.38 = 38 %
#Checking the GC % for each single X :
unlist(bsapply(param, "GC"))
available.genomes()
#Chlamydomonas reinhardtii genome
BiocManager::install("BSgenome.Creinhardtii.JGI.v5.6")
library(BSgenome.Creinhardtii.JGI.v5.6 )
Creinhardtii
seqnames(Creinhardtii) # 17 chromosomes
seqlengths( Creinhardtii ) /10^6
sum(seqlengths( Creinhardtii )) /10^6 # 111.1007 Mbp = genome size
alphabetFrequency(Creinhardtii$chromosome_13)/10^6
letterFrequency(Creinhardtii$chromosome_13, "GC", as.prob= TRUE)*100
# 62 %
sort(dinucleotideFrequency(Creinhardtii$chromosome_13)/10^3)
# % of each dinucltd.
sort (dinucleotideFrequency(Creinhardtii$chromosome_13, as.prob= TRUE )*100)
# %A %T %C %G :
alphabetFrequency(Creinhardtii$chromosome_13, as.prob= TRUE)*100
# GC content across the entire genome
param <- new("BSParams", X = Creinhardtii, FUN = letterFrequency)
bsapply(param, "GC")
unlist(bsapply(param, "GC")
# % GC across the whole genome:
(sum(unlist(bsapply(param, "GC")) / sum(seqlengths(Creinhardtii)))) *100
# 62 %
#Checking the GC % for each single X
unlist(bsapply(param, "GC", as.prob = TRUE))
# Let's play for a while with these genomes :
seqalgae <- Creinhardtii$chromosome_1[1:16]
seqyeast <- Scerevisiae$chrI[49:64]
#Note : An optimal alignment is an alignment giving the highest score.
pairwiseAlignment(seqalgae, seqyeast)
pairwiseAlignment(seqalgae, seqyeast, type= "local") # That makes sense.