diff --git a/01-getting-started/getting-started.html b/01-getting-started/getting-started.html index 644caaac..1f423124 100644 --- a/01-getting-started/getting-started.html +++ b/01-getting-started/getting-started.html @@ -1,11 +1,10 @@ - + - @@ -1344,7 +1343,6 @@ } img { max-width:100%; - height: auto; } .tabbed-pane { padding-top: 12px; @@ -1493,6 +1491,7 @@ border: none; display: inline-block; border-radius: 4px; + background-color: transparent; } .tabset-dropdown > .nav-tabs.nav-tabs-open > li { @@ -1521,6 +1520,12 @@ } } +@media print { +.toc-content { + /* see https://github.com/w3c/csswg-drafts/issues/4434 */ + float: right; +} +} .toc-content { padding-left: 30px; @@ -1834,7 +1839,7 @@

References

theme: "bootstrap3", context: '.toc-content', hashGenerator: function (text) { - return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_').toLowerCase(); + return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_'); }, ignoreSelector: ".toc-ignore", scrollTo: 0 diff --git a/02-microarray/00-intro-to-microarray.html b/02-microarray/00-intro-to-microarray.html index 22c51e34..12946bbd 100644 --- a/02-microarray/00-intro-to-microarray.html +++ b/02-microarray/00-intro-to-microarray.html @@ -1,11 +1,10 @@ - + - @@ -1344,7 +1343,6 @@ } img { max-width:100%; - height: auto; } .tabbed-pane { padding-top: 12px; @@ -1493,6 +1491,7 @@ border: none; display: inline-block; border-radius: 4px; + background-color: transparent; } .tabset-dropdown > .nav-tabs.nav-tabs-open > li { @@ -1521,6 +1520,12 @@ } } +@media print { +.toc-content { + /* see https://github.com/w3c/csswg-drafts/issues/4434 */ + float: right; +} +} .toc-content { padding-left: 30px; @@ -1715,7 +1720,7 @@

CCDL for ALSF

theme: "bootstrap3", context: '.toc-content', hashGenerator: function (text) { - return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_').toLowerCase(); + return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_'); }, ignoreSelector: ".toc-ignore", scrollTo: 0 diff --git a/02-microarray/clustering_microarray_01_heatmap.html b/02-microarray/clustering_microarray_01_heatmap.html index 184e75dc..cd1b6c78 100644 --- a/02-microarray/clustering_microarray_01_heatmap.html +++ b/02-microarray/clustering_microarray_01_heatmap.html @@ -1,11 +1,10 @@ - + - @@ -1344,7 +1343,6 @@ } img { max-width:100%; - height: auto; } .tabbed-pane { padding-top: 12px; @@ -1493,6 +1491,7 @@ border: none; display: inline-block; border-radius: 4px; + background-color: transparent; } .tabset-dropdown > .nav-tabs.nav-tabs-open > li { @@ -1521,6 +1520,12 @@ } } +@media print { +.toc-content { + /* see https://github.com/w3c/csswg-drafts/issues/4434 */ + float: right; +} +} .toc-content { padding-left: 30px; @@ -1897,7 +1902,7 @@

4.4 Create a heatmap

))(25), scale = "row" # Scale values in the direction of genes (rows) ) -

+

We’ve created a heatmap but although our genes and samples are clustered, there is not much information that we can gather here because we did not provide the pheatmap() function with annotation labels for our samples.

First let’s save our clustered heatmap.

@@ -1914,8 +1919,8 @@

4.4.1 Save heatmap as a PNG

# Close the png file: dev.off() -
## quartz_off_screen 
-##                 2
+
## png 
+##   2

Now, let’s add some annotation bars to our heatmap.

@@ -1959,7 +1964,7 @@

4.5.1 Create annotated heatmap -

+

Now that we have annotation bars on our heatmap, we have a better idea of the cell line and treatment groups that appear to cluster together. More specifically, we can see that the samples seem to cluster by their cell lines of origin, but not necessarily as much by whether or not they received the PLX4302 treatment.

Let’s save our annotated heatmap.

@@ -1977,8 +1982,8 @@

4.5.2 Save annotated heatmap as a # Close the png file: dev.off() -
## quartz_off_screen 
-##                 2
+
## png 
+##   2
@@ -1994,35 +1999,40 @@

6 Print session info

At the end of every analysis, before saving your notebook, we recommend printing out your session info. This helps make your code more reproducible by recording what versions of softwares and packages you used to run this.

# Print session info
 sessionInfo()
-
## R version 3.6.1 (2019-07-05)
-## Platform: x86_64-apple-darwin15.6.0 (64-bit)
-## Running under: macOS Mojave 10.14.6
+
## R version 4.0.2 (2020-06-22)
+## Platform: x86_64-pc-linux-gnu (64-bit)
+## Running under: Ubuntu 20.04 LTS
 ## 
 ## Matrix products: default
-## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
-## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
+## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-openmp/libopenblasp-r0.3.8.so
 ## 
 ## locale:
-## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
+##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
+##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
+##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C             
+##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
+##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
+## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
 ## 
 ## attached base packages:
 ## [1] stats     graphics  grDevices utils     datasets  methods   base     
 ## 
 ## other attached packages:
-## [1] magrittr_1.5    pheatmap_1.0.12 optparse_1.6.2 
+## [1] magrittr_1.5    pheatmap_1.0.12 optparse_1.6.6 
 ## 
 ## loaded via a namespace (and not attached):
-##  [1] Rcpp_1.0.4.6       pillar_1.4.4       compiler_3.6.1     RColorBrewer_1.1-2
-##  [5] tools_3.6.1        digest_0.6.25      evaluate_0.14      lifecycle_0.2.0   
-##  [9] tibble_3.0.1       gtable_0.3.0       pkgconfig_2.0.3    rlang_0.4.6       
-## [13] cli_2.0.2          rstudioapi_0.11    yaml_2.2.1         xfun_0.14         
-## [17] dplyr_0.8.3        withr_2.2.0        styler_1.1.1       stringr_1.4.0     
-## [21] knitr_1.28         vctrs_0.3.1        hms_0.5.3          tidyselect_0.2.5  
-## [25] grid_3.6.1         getopt_1.20.3      glue_1.4.1         R6_2.4.1          
-## [29] fansi_0.4.1        rmarkdown_1.14     readr_1.3.1        purrr_0.3.4       
-## [33] rematch2_2.1.0     backports_1.1.7    scales_1.0.0       ellipsis_0.3.1    
-## [37] htmltools_0.3.6    assertthat_0.2.1   colorspace_1.4-1   utf8_1.1.4        
-## [41] stringi_1.4.6      munsell_0.5.0      crayon_1.3.4
+## [1] Rcpp_1.0.5 pillar_1.4.6 compiler_4.0.2 RColorBrewer_1.1-2 +## [5] R.methodsS3_1.8.0 R.utils_2.9.2 tools_4.0.2 digest_0.6.25 +## [9] gtable_0.3.0 evaluate_0.14 lifecycle_0.2.0 tibble_3.0.3 +## [13] R.cache_0.14.0 pkgconfig_2.0.3 rlang_0.4.7 cli_2.0.2 +## [17] rstudioapi_0.11 yaml_2.2.1 xfun_0.16 dplyr_1.0.0 +## [21] styler_1.3.2 stringr_1.4.0 knitr_1.29 generics_0.0.2 +## [25] vctrs_0.3.2 hms_0.5.3 tidyselect_1.1.0 grid_4.0.2 +## [29] getopt_1.20.3 glue_1.4.1 R6_2.4.1 fansi_0.4.1 +## [33] rmarkdown_2.3 farver_2.0.3 purrr_0.3.4 readr_1.3.1 +## [37] rematch2_2.1.2 scales_1.1.1 backports_1.1.9 ellipsis_0.3.1 +## [41] htmltools_0.5.0 assertthat_0.2.1 colorspace_1.4-1 utf8_1.1.4 +## [45] stringi_1.4.6 munsell_0.5.0 crayon_1.3.4 R.oo_1.23.0

Gu Z., R. Eils, and M. Schlesner, 2016 Complex heatmaps reveal patterns and correlations in multidimensional genomic data. Bioinformatics.

@@ -2086,7 +2096,7 @@

6 Print session info

theme: "bootstrap3", context: '.toc-content', hashGenerator: function (text) { - return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_').toLowerCase(); + return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_'); }, ignoreSelector: ".toc-ignore", scrollTo: 0 diff --git a/02-microarray/differential-expression_microarray_01.html b/02-microarray/differential-expression_microarray_01.html index 9b636094..9d10c04d 100644 --- a/02-microarray/differential-expression_microarray_01.html +++ b/02-microarray/differential-expression_microarray_01.html @@ -1619,6 +1619,7 @@
  • Differential Expression
  • Dimension Reduction - PCA
  • Dimension Reduction - UMAP
  • +
  • Ensembl Gene ID Annotation
  • @@ -1629,7 +1630,7 @@
    diff --git a/02-microarray/dimension-reduction_microarray_01_pca.html b/02-microarray/dimension-reduction_microarray_01_pca.html index 912e8f66..9badfad8 100644 --- a/02-microarray/dimension-reduction_microarray_01_pca.html +++ b/02-microarray/dimension-reduction_microarray_01_pca.html @@ -1,11 +1,10 @@ - + - @@ -1344,7 +1343,6 @@ } img { max-width:100%; - height: auto; } .tabbed-pane { padding-top: 12px; @@ -1493,6 +1491,7 @@ border: none; display: inline-block; border-radius: 4px; + background-color: transparent; } .tabset-dropdown > .nav-tabs.nav-tabs-open > li { @@ -1521,6 +1520,12 @@ } } +@media print { +.toc-content { + /* see https://github.com/w3c/csswg-drafts/issues/4434 */ + float: right; +} +} .toc-content { padding-left: 30px; @@ -2175,12 +2180,12 @@

    4.3 Perform Principal Components ## GSM917288 -0.3817511 -0.4095233 -0.6309443 0.5985548 -0.4305740 0.3042307 ## GSM917230 -1.0901163 -2.1937892 -1.9281503 -1.1493521 -0.4783177 0.6360478 ## PC284 PC285 -## GSM917111 9.52572473 -4.615319e-14 -## GSM917250 1.33152530 -6.445799e-15 -## GSM917281 -0.25177843 1.736458e-15 -## GSM917062 1.11478570 1.382228e-14 -## GSM917288 -0.07702027 -3.591745e-14 -## GSM917230 -1.67762146 -4.780139e-15 +## GSM917111 9.52572473 -2.012539e-14 +## GSM917250 1.33152530 -1.168683e-14 +## GSM917281 -0.25177843 -1.847134e-14 +## GSM917062 1.11478570 2.137179e-15 +## GSM917288 -0.07702027 -1.411024e-14 +## GSM917230 -1.67762146 -2.067790e-14

    We can see that we now have 285 principal component values for each of our samples.

    @@ -2232,7 +2237,7 @@

    4.6 Plot PCA Results

    # Print out plot here pca_plot -

    +

    Looks like Group 4 and SHH groups somewhat cluster with each other but Group 3 seems to be less distinct as there are some samples clustering with Group 4 as well.

    We can add another label to our plot to get more information about our dataset. Let’s also label the data points based on the histological subtype that each sample belongs to.

    # Make a scatterplot with ggplot2
    @@ -2250,7 +2255,7 @@ 

    4.6 Plot PCA Results

    # Print out plot here pca_plot
    -

    +

    Adding the histological subtype label to our plot made our plot more informative, but the diffuse Group 3 data doesn’t appear to be related to a histology subtype. We could test out other variables as annotation labels to get a further understanding of the cluster behavior of each subgroup.

    @@ -2281,35 +2286,40 @@

    6 Session info

    At the end of every analysis, before saving your notebook, we recommend printing out your session info. This helps make your code more reproducible by recording what versions of softwares and packages you used to run this.

    # Print session info
     sessionInfo()
    -
    ## R version 3.6.1 (2019-07-05)
    -## Platform: x86_64-apple-darwin15.6.0 (64-bit)
    -## Running under: macOS Mojave 10.14.6
    +
    ## R version 4.0.2 (2020-06-22)
    +## Platform: x86_64-pc-linux-gnu (64-bit)
    +## Running under: Ubuntu 20.04 LTS
     ## 
     ## Matrix products: default
    -## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
    -## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
    +## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-openmp/libopenblasp-r0.3.8.so
     ## 
     ## locale:
    -## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
    +##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
    +##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
    +##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C             
    +##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
    +##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
    +## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
     ## 
     ## attached base packages:
     ## [1] stats     graphics  grDevices utils     datasets  methods   base     
     ## 
     ## other attached packages:
    -## [1] magrittr_1.5   ggplot2_3.2.1  optparse_1.6.2
    +## [1] magrittr_1.5   ggplot2_3.3.2  optparse_1.6.6
     ## 
     ## loaded via a namespace (and not attached):
    -##  [1] Rcpp_1.0.4.6     pillar_1.4.4     compiler_3.6.1   tools_3.6.1     
    -##  [5] digest_0.6.25    evaluate_0.14    lifecycle_0.2.0  tibble_3.0.1    
    -##  [9] gtable_0.3.0     pkgconfig_2.0.3  rlang_0.4.6      cli_2.0.2       
    -## [13] rstudioapi_0.11  yaml_2.2.1       xfun_0.14        dplyr_0.8.3     
    -## [17] withr_2.2.0      styler_1.1.1     stringr_1.4.0    knitr_1.28      
    -## [21] vctrs_0.3.1      hms_0.5.3        tidyselect_0.2.5 grid_3.6.1      
    -## [25] getopt_1.20.3    glue_1.4.1       R6_2.4.1         fansi_0.4.1     
    -## [29] rmarkdown_1.14   readr_1.3.1      purrr_0.3.4      rematch2_2.1.0  
    -## [33] backports_1.1.7  scales_1.0.0     ellipsis_0.3.1   htmltools_0.3.6 
    -## [37] assertthat_0.2.1 colorspace_1.4-1 labeling_0.3     stringi_1.4.6   
    -## [41] lazyeval_0.2.2   munsell_0.5.0    crayon_1.3.4
    +## [1] Rcpp_1.0.5 pillar_1.4.6 compiler_4.0.2 R.methodsS3_1.8.0 +## [5] R.utils_2.9.2 tools_4.0.2 digest_0.6.25 evaluate_0.14 +## [9] lifecycle_0.2.0 tibble_3.0.3 gtable_0.3.0 R.cache_0.14.0 +## [13] pkgconfig_2.0.3 rlang_0.4.7 cli_2.0.2 rstudioapi_0.11 +## [17] yaml_2.2.1 xfun_0.16 withr_2.2.0 dplyr_1.0.0 +## [21] styler_1.3.2 stringr_1.4.0 knitr_1.29 generics_0.0.2 +## [25] vctrs_0.3.2 hms_0.5.3 tidyselect_1.1.0 grid_4.0.2 +## [29] getopt_1.20.3 glue_1.4.1 R6_2.4.1 fansi_0.4.1 +## [33] rmarkdown_2.3 farver_2.0.3 purrr_0.3.4 readr_1.3.1 +## [37] rematch2_2.1.2 scales_1.1.1 backports_1.1.9 ellipsis_0.3.1 +## [41] htmltools_0.5.0 assertthat_0.2.1 colorspace_1.4-1 labeling_0.3 +## [45] stringi_1.4.6 munsell_0.5.0 crayon_1.3.4 R.oo_1.23.0

    Brems M., 2017 A one-stop shop for principal component analysis

    @@ -2382,7 +2392,7 @@

    6 Session info

    theme: "bootstrap3", context: '.toc-content', hashGenerator: function (text) { - return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_').toLowerCase(); + return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_'); }, ignoreSelector: ".toc-ignore", scrollTo: 0 diff --git a/02-microarray/dimension-reduction_microarray_02_umap.html b/02-microarray/dimension-reduction_microarray_02_umap.html index ceae7f5f..0787b7e7 100644 --- a/02-microarray/dimension-reduction_microarray_02_umap.html +++ b/02-microarray/dimension-reduction_microarray_02_umap.html @@ -1,11 +1,10 @@ - + - @@ -1344,7 +1343,6 @@ } img { max-width:100%; - height: auto; } .tabbed-pane { padding-top: 12px; @@ -1493,6 +1491,7 @@ border: none; display: inline-block; border-radius: 4px; + background-color: transparent; } .tabset-dropdown > .nav-tabs.nav-tabs-open > li { @@ -1521,6 +1520,12 @@ } } +@media print { +.toc-content { + /* see https://github.com/w3c/csswg-drafts/issues/4434 */ + float: right; +} +} .toc-content { padding-left: 30px; @@ -1864,13 +1869,13 @@

    4.3 Perform Uniform Manifold Appr )

    Let’s take a preview at the data frame we created in the chunk above.

    head(umap_df)
    -
    ##   refinebio_accession_code           X1       X2       histology subgroup
    -## 1                GSM917111  0.980478303 3.498827         Classic  Group 4
    -## 2                GSM917250  0.007078067 0.164898         Classic  Group 4
    -## 3                GSM917281  0.871498571 1.661061         Classic  Group 4
    -## 4                GSM917062 -2.389040189 2.977578         Classic  Group 3
    -## 5                GSM917288  2.706804399 4.452762         Classic  Group 4
    -## 6                GSM917230 -0.750989064 2.847606 Medulloblastoma  Group 4
    +
    ##   refinebio_accession_code         X1         X2       histology subgroup
    +## 1                GSM917111 3.34604756  0.7845626         Classic  Group 4
    +## 2                GSM917250 2.33114827  4.0503256         Classic  Group 4
    +## 3                GSM917281 2.52365294  2.4093729         Classic  Group 4
    +## 4                GSM917062 0.02614889  1.3860845         Classic  Group 3
    +## 5                GSM917288 4.15696395 -1.0232268         Classic  Group 4
    +## 6                GSM917230 1.20599505  1.2974269 Medulloblastoma  Group 4

    Now, let’s plot the UMAP scores.

    # Make a scatterplot using `ggplot2` functionality
     umap_plot <- ggplot(
    @@ -1885,7 +1890,7 @@ 

    4.3 Perform Uniform Manifold Appr # Print out plot here umap_plot

    -

    +

    It’s hard to interpret our UMAP results without some metadata labels on our plot.

    Let’s label the data points based on their genotype subgroup since this is central to the subgroup specific based hypothesis in the original paper (Northcott et al. 2012).

    # Make a scatterplot with ggplot2
    @@ -1902,7 +1907,7 @@ 

    4.3 Perform Uniform Manifold Appr # Print out plot here umap_plot

    -

    +

    It looks like Group 4 and SHH groups somewhat cluster with each other but Group 3 seems to also be clustering with Group 4.

    We can add another label to our plot to potentially gain insight on the clustering behavior of our data. Let’s also label the data points based on the histological subtype that each sample belongs to.

    # Make a scatterplot with ggplot2
    @@ -1920,7 +1925,7 @@ 

    4.3 Perform Uniform Manifold Appr # Print out plot here umap_plot

    -

    +

    Our histological subtype groups don’t appear to be clustering in a discernible pattern. We could test out other variables as annotation labels to get a further understanding of the cluster behavior of each subgroup.

    @@ -1949,37 +1954,42 @@

    6 Session info

    At the end of every analysis, before saving your notebook, we recommend printing out your session info. This helps make your code more reproducible by recording what versions of softwares and packages you used to run this.

    # Print session info
     sessionInfo()
    -
    ## R version 3.6.1 (2019-07-05)
    -## Platform: x86_64-apple-darwin15.6.0 (64-bit)
    -## Running under: macOS Mojave 10.14.6
    +
    ## R version 4.0.2 (2020-06-22)
    +## Platform: x86_64-pc-linux-gnu (64-bit)
    +## Running under: Ubuntu 20.04 LTS
     ## 
     ## Matrix products: default
    -## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
    -## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
    +## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-openmp/libopenblasp-r0.3.8.so
     ## 
     ## locale:
    -## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
    +##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
    +##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
    +##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C             
    +##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
    +##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
    +## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
     ## 
     ## attached base packages:
     ## [1] stats     graphics  grDevices utils     datasets  methods   base     
     ## 
     ## other attached packages:
    -## [1] magrittr_1.5   ggplot2_3.2.1  umap_0.2.3     optparse_1.6.2
    +## [1] magrittr_1.5   ggplot2_3.3.2  umap_0.2.6.0   optparse_1.6.6
     ## 
     ## loaded via a namespace (and not attached):
    -##  [1] Rcpp_1.0.4.6     RSpectra_0.15-0  pillar_1.4.4     compiler_3.6.1  
    -##  [5] tools_3.6.1      digest_0.6.25    gtable_0.3.0     jsonlite_1.7.0  
    -##  [9] lattice_0.20-38  evaluate_0.14    lifecycle_0.2.0  tibble_3.0.1    
    -## [13] pkgconfig_2.0.3  rlang_0.4.6      Matrix_1.2-17    cli_2.0.2       
    -## [17] rstudioapi_0.11  yaml_2.2.1       xfun_0.14        dplyr_0.8.3     
    -## [21] withr_2.2.0      styler_1.1.1     stringr_1.4.0    knitr_1.28      
    -## [25] vctrs_0.3.1      askpass_1.1      hms_0.5.3        tidyselect_0.2.5
    -## [29] grid_3.6.1       getopt_1.20.3    reticulate_1.13  glue_1.4.1      
    -## [33] R6_2.4.1         fansi_0.4.1      rmarkdown_1.14   readr_1.3.1     
    -## [37] purrr_0.3.4      rematch2_2.1.0   scales_1.0.0     backports_1.1.7 
    -## [41] ellipsis_0.3.1   htmltools_0.3.6  assertthat_0.2.1 colorspace_1.4-1
    -## [45] labeling_0.3     stringi_1.4.6    lazyeval_0.2.2   munsell_0.5.0   
    -## [49] openssl_1.4.1    crayon_1.3.4
    +## [1] reticulate_1.16 styler_1.3.2 tidyselect_1.1.0 xfun_0.16 +## [5] rematch2_2.1.2 purrr_0.3.4 lattice_0.20-41 colorspace_1.4-1 +## [9] vctrs_0.3.2 generics_0.0.2 htmltools_0.5.0 getopt_1.20.3 +## [13] yaml_2.2.1 rlang_0.4.7 R.oo_1.23.0 pillar_1.4.6 +## [17] glue_1.4.1 withr_2.2.0 R.utils_2.9.2 R.cache_0.14.0 +## [21] lifecycle_0.2.0 stringr_1.4.0 munsell_0.5.0 gtable_0.3.0 +## [25] R.methodsS3_1.8.0 evaluate_0.14 labeling_0.3 knitr_1.29 +## [29] fansi_0.4.1 Rcpp_1.0.5 readr_1.3.1 openssl_1.4.2 +## [33] backports_1.1.9 scales_1.1.1 jsonlite_1.7.0 farver_2.0.3 +## [37] RSpectra_0.16-0 hms_0.5.3 askpass_1.1 digest_0.6.25 +## [41] stringi_1.4.6 dplyr_1.0.0 grid_4.0.2 cli_2.0.2 +## [45] tools_4.0.2 tibble_3.0.3 crayon_1.3.4 pkgconfig_2.0.3 +## [49] ellipsis_0.3.1 Matrix_1.2-18 assertthat_0.2.1 rmarkdown_2.3 +## [53] rstudioapi_0.11 R6_2.4.1 compiler_4.0.2

    Konopka T., 2020 Uniform manifold approximation and projection.

    @@ -2052,7 +2062,7 @@

    6 Session info

    theme: "bootstrap3", context: '.toc-content', hashGenerator: function (text) { - return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_').toLowerCase(); + return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_'); }, ignoreSelector: ".toc-ignore", scrollTo: 0 diff --git a/02-microarray/gene-id-annotation_microarray_01_ensembl.html b/02-microarray/gene-id-annotation_microarray_01_ensembl.html index ecc1dfbe..c00b3c94 100644 --- a/02-microarray/gene-id-annotation_microarray_01_ensembl.html +++ b/02-microarray/gene-id-annotation_microarray_01_ensembl.html @@ -1,11 +1,10 @@ - + - @@ -1344,7 +1343,6 @@ } img { max-width:100%; - height: auto; } .tabbed-pane { padding-top: 12px; @@ -1493,6 +1491,7 @@ border: none; display: inline-block; border-radius: 4px; + background-color: transparent; } .tabset-dropdown > .nav-tabs.nav-tabs-open > li { @@ -1521,6 +1520,12 @@ } } +@media print { +.toc-content { + /* see https://github.com/w3c/csswg-drafts/issues/4434 */ + float: right; +} +} .toc-content { padding-left: 30px; @@ -1783,6 +1788,8 @@

    4.1 Install libraries

    # Install this package if it isn't installed yet BiocManager::install("org.Mm.eg.db", update = FALSE) } +
    ## Bioconductor version 3.11 (BiocManager 1.30.10), R 4.0.2 (2020-06-22)
    +
    ## Installing package(s) 'org.Mm.eg.db'

    Attach the packages we need for this analysis.

    # Attach the library
     library(org.Mm.eg.db)
    @@ -1884,9 +1891,10 @@

    4.2 Import and set up data

    Let’s ensure that the metadata and data are in the same sample order.

    # Make the data in the order of the metadata
     df <- df %>%
    -  dplyr::select(metadata$geo_accession)
    -
    -# Check if this is in the same order
    +  dplyr::select(metadata$geo_accession)
    +
    ## Warning: replacing previous import 'vctrs::data_frame' by 'tibble::data_frame'
    +## when loading 'dplyr'
    +
    # Check if this is in the same order
     all.equal(colnames(df), metadata$geo_accession)
    ## [1] TRUE
    # Bring back the "Gene" column in preparation for mapping
    @@ -1946,32 +1954,8 @@ 

    4.4 Explore gene ID conversionWe can see that our data frame has a new column Symbol. Let’s get a summary of the gene symbols returned in the Symbol column of our mapped data frame.

    # We can use the `summary()` function to get a better idea of the distribution of symbols in the `Symbol` column
     summary(mapped_df$Symbol)
    -
    ##  Gm13023   Nudt10     Pms2    Gnai3     Pbsn    Cdc45      H19    Scml2 
    -##        2        2        2        1        1        1        1        1 
    -##     Apoh     Narf     Cav2     Klf6    Scmh1    Cox5a     Tbx2     Tbx4 
    -##        1        1        1        1        1        1        1        1 
    -##     Zfy2     Ngfr     Wnt3    Wnt9a      Fer     Xpo6     Tfe3    Axin2 
    -##        1        1        1        1        1        1        1        1 
    -##    Brat1    Gna12 Slc22a18   Itgb2l    Igsf5   Pih1d2     Dlat     Sdhd 
    -##        1        1        1        1        1        1        1        1 
    -##    Fgf23     Fgf6    Ccnd2   Gpr107    Nalcn   Btbd17    Slfn4       Th 
    -##        1        1        1        1        1        1        1        1 
    -##     Ins2   Scnn1g     Drp2  Tspan32     Lhx2   Clec2g     Gmpr    Glra1 
    -##        1        1        1        1        1        1        1        1 
    -##     Mid2   Trim25     Dgke   Scpep1      Mnt    Itgb2    Hddc2  Tpd52l1 
    -##        1        1        1        1        1        1        1        1 
    -##     Pemt     Cdh1     Cdh4    Ckmt1    Bcl6b  Clec10a   Alox12    Arvcf 
    -##        1        1        1        1        1        1        1        1 
    -##     Comt     Rtca      Dbt   Dazap2    Mcts1     Rem1    Rnf17 Trappc10 
    -##        1        1        1        1        1        1        1        1 
    -##     Ccm2      Wap    Tbrg4  Tmprss2      Mx1      Fap      Gcg   Ndufa9 
    -##        1        1        1        1        1        1        1        1 
    -##    Egfl6      Lck    Tssk3  Cttnbp2   Galnt1     Myf5    Mkrn2    Pparg 
    -##        1        1        1        1        1        1        1        1 
    -##     Raf1   Gm4532    Sept1    Pdgfb   Acvrl1    Grasp   Acvr1b   Tom1l2 
    -##        1        1        1        1        1        1        1        1 
    -##    Gpa33  Zfp385a  (Other)     NA's 
    -##        1        1    16885      998
    +
    ##    Length     Class      Mode 
    +##     17977 character character

    There are 998 NA’s in our data frame, which means that 998 out of the 17918 Ensembl IDs did not map to gene symbols. 998 out of 17918 is not too bad a rate, in our opinion, but note that different gene identifier types will have different mapping rates and that is to be expected. Regardless, it is always good to be aware of how many genes you are potentially “losing” if you rely on this new gene identifier you’ve mapped to for downstream analyses.

    However, if you have almost all NA’s it is possible that the function was executed incorrectly or you may want to consider using a different gene identifier, if possible.

    Now let’s check to see if we have any genes that were mapped to multiple symbols.

    @@ -1990,7 +1974,7 @@

    4.4 Explore gene ID conversion## # A tibble: 6 x 3 ## # Groups: Ensembl [2] ## Symbol Ensembl gene_symbol_count -## <fct> <chr> <int> +## <chr> <chr> <int> ## 1 Rpl23 ENSMUSG00000071415 3 ## 2 LOC100044627 ENSMUSG00000071415 3 ## 3 LOC100862455 ENSMUSG00000071415 3 @@ -2040,39 +2024,48 @@

    6 Session info

    At the end of every analysis, before saving your notebook, we recommend printing out your session info. This helps make your code more reproducible by recording what versions of softwares and packages you used to run this.

    # Print session info
     sessionInfo()
    -
    ## R version 3.6.1 (2019-07-05)
    -## Platform: x86_64-apple-darwin15.6.0 (64-bit)
    -## Running under: macOS Mojave 10.14.6
    +
    ## R version 4.0.2 (2020-06-22)
    +## Platform: x86_64-pc-linux-gnu (64-bit)
    +## Running under: Ubuntu 20.04 LTS
     ## 
     ## Matrix products: default
    -## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
    -## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
    +## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-openmp/libopenblasp-r0.3.8.so
     ## 
     ## locale:
    -## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
    +##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
    +##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
    +##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C             
    +##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
    +##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
    +## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
     ## 
     ## attached base packages:
     ## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
     ## [8] methods   base     
     ## 
     ## other attached packages:
    -## [1] magrittr_1.5         org.Mm.eg.db_3.8.2   AnnotationDbi_1.47.0
    -## [4] IRanges_2.19.10      S4Vectors_0.23.23    Biobase_2.45.0      
    -## [7] BiocGenerics_0.31.5  optparse_1.6.2      
    +## [1] magrittr_1.5         org.Mm.eg.db_3.11.4  AnnotationDbi_1.50.3
    +## [4] IRanges_2.22.2       S4Vectors_0.26.1     Biobase_2.48.0      
    +## [7] BiocGenerics_0.34.0  optparse_1.6.6      
     ## 
     ## loaded via a namespace (and not attached):
    -##  [1] Rcpp_1.0.4.6     plyr_1.8.4       pillar_1.4.4     compiler_3.6.1  
    -##  [5] tools_3.6.1      bit_1.1-14       digest_0.6.25    memoise_1.1.0   
    -##  [9] RSQLite_2.1.2    evaluate_0.14    lifecycle_0.2.0  tibble_3.0.1    
    -## [13] pkgconfig_2.0.3  rlang_0.4.6      DBI_1.0.0        cli_2.0.2       
    -## [17] rstudioapi_0.11  yaml_2.2.1       xfun_0.14        dplyr_0.8.3     
    -## [21] withr_2.2.0      styler_1.1.1     stringr_1.4.0    knitr_1.28      
    -## [25] vctrs_0.3.1      hms_0.5.3        tidyselect_0.2.5 bit64_0.9-7     
    -## [29] getopt_1.20.3    glue_1.4.1       R6_2.4.1         fansi_0.4.1     
    -## [33] rmarkdown_1.14   reshape2_1.4.3   blob_1.2.0       readr_1.3.1     
    -## [37] purrr_0.3.4      rematch2_2.1.0   backports_1.1.7  ellipsis_0.3.1  
    -## [41] htmltools_0.3.6  assertthat_0.2.1 utf8_1.1.4       stringi_1.4.6   
    -## [45] crayon_1.3.4
    +## [1] Rcpp_1.0.5 plyr_1.8.6 pillar_1.4.6 +## [4] compiler_4.0.2 BiocManager_1.30.10 R.methodsS3_1.8.1 +## [7] R.utils_2.10.1 tools_4.0.2 bit_1.1-15.2 +## [10] digest_0.6.25 memoise_1.1.0 RSQLite_2.2.0 +## [13] evaluate_0.14 lifecycle_0.2.0 tibble_3.0.3 +## [16] R.cache_0.14.0 pkgconfig_2.0.3 rlang_0.4.7 +## [19] DBI_1.1.0 cli_2.0.2 rstudioapi_0.11 +## [22] yaml_2.2.1 xfun_0.17 dplyr_1.0.0 +## [25] styler_1.3.2 stringr_1.4.0 knitr_1.29 +## [28] generics_0.0.2 vctrs_0.3.4 hms_0.5.3 +## [31] tidyselect_1.1.0 bit64_0.9-7.1 getopt_1.20.3 +## [34] glue_1.4.2 R6_2.4.1 fansi_0.4.1 +## [37] rmarkdown_2.3 reshape2_1.4.4 blob_1.2.1 +## [40] purrr_0.3.4 readr_1.3.1 rematch2_2.1.2 +## [43] backports_1.1.9 ellipsis_0.3.1 htmltools_0.5.0 +## [46] assertthat_0.2.1 utf8_1.1.4 stringi_1.5.3 +## [49] crayon_1.3.4 R.oo_1.24.0

    Carlson M., 2019 Genome wide annotation for mouse

    @@ -2142,7 +2135,7 @@

    6 Session info

    theme: "bootstrap3", context: '.toc-content', hashGenerator: function (text) { - return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_').toLowerCase(); + return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_'); }, ignoreSelector: ".toc-ignore", scrollTo: 0 diff --git a/03-rnaseq/00-intro-to-rnaseq.Rmd b/03-rnaseq/00-intro-to-rnaseq.Rmd index e8d1229b..d4acc82f 100644 --- a/03-rnaseq/00-intro-to-rnaseq.Rmd +++ b/03-rnaseq/00-intro-to-rnaseq.Rmd @@ -6,3 +6,131 @@ output: toc: true toc_float: true --- + + + +**Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)* + +- [Introduction to RNA-seq technology](#introduction-to-rna-seq-technology) + - [RNA-seq data **strengths**](#rna-seq-data-strengths) + - [RNA-seq data **limitations/biases**](#rna-seq-data-limitationsbiases) + - [About quantile normalization](#about-quantile-normalization) + - [More resources on RNA-seq technology](#more-resources-on-rna-seq-technology) +- [About DESeq2](#about-deseq2) + - [DESeq2 objects](#deseq2-objects) + - [DESeq2 transformation methods](#deseq2-transformation-methods) + - [Further resources for DESeq2](#further-resources-for-deseq2) + - [Why isn't the gene I care about in a refine.bio dataset?](#why-isnt-the-gene-i-care-about-in-a-refinebio-dataset) + - [What about edgeR?](#what-about-edger) + - [What if I care about isoforms?](#what-if-i-care-about-isoforms) +- [References](#references) + + + +## Introduction to RNA-seq technology + +Data analyses are generally not "one size fits all"; this is particularly true between RNA-seq vs microarray data. +This tutorial has example analyses [organized by technology](../01-getting-started/getting-started.html#about-how-this-tutorial-book-is-structured) so you can follow examples that are more closely tailored to the nature of the data at hand. + +As with all experimental methods, RNA-seq has strengths and limitations that you should consider in regards to your scientific questions. + +### RNA-seq data **strengths** + +- RNA-seq can assay unknown transcripts, as it is not bound to a pre-determined set of probes like microarrays [@Zhong2009]. +- Its values are considered more dynamic than microarray values which are constrained to a smaller range based on background signal and probesets being saturated [@Zhong2009]. + +### RNA-seq data **limitations/biases** + +The nature of sequencing introduces several different kinds of biases: + +- **GC bias**: higher GC content sequences are less likely to be observed. +- **3' bias (positional bias)**: for most sequencing methods, the 3 prime end of transcripts are more likely to be observed. +- **Complexity bias**: some sequences are easier to be bound and amplified than others. +- **Library size or sequencing depth**: the total number of reads is not always equivalent between samples. +- **Gene length**: longer genes are more likely to be observed. + +@bias-blog discusses these biases in this [blog post](https://mikelove.wordpress.com/2016/09/26/rna-seq-fragment-sequence-bias/) which includes this handy figure. + + + +Most normalization methods, including [refine.bio's processing methods](http://docs.refine.bio/en/latest/main_text.html#rna-seq-pipelines), attempt to mitigate these biases, but these biases can never be fully negated. +Some of these biases have been addressed to the extent that they can by our refine.bio processing methods so you don't have to worry too much about them. +In brief, refine.bio data is quantified by Salmon using their correction algorithms: [`--seqbias`](https://salmon.readthedocs.io/en/latest/salmon.html#seqbias) and [`--gcbias`](https://salmon.readthedocs.io/en/latest/salmon.html#gcbias). + +### About quantile normalization + +refine.bio data is available for you [quantile normalized](https://en.wikipedia.org/wiki/Quantile_normalization), which can address some library size biases. +But more often than not, our example modules will recommend using the option for downloading non-quantile normalized data (note that this is RNA-seq specific, and microarray data does not have this download option). + + + +See here for more about the [quantile normalization process in refine.bio](http://docs.refine.bio/en/latest/main_text.html#quantile-normalization). + +### More resources on RNA-seq technology + +- [StatsQuest: A gentle introduction to RNA-seq](https://www.youtube.com/watch?v=tlf6wYJrwKY) [@Starmer2017-rnaseq]. +- [A general background on the wet lab methods of RNA-seq](https://bitesizebio.com/13542/what-everyone-should-know-about-rna-seq/) [@Hadfield2016]. +- [Modeling of RNA-seq fragment sequence bias reduces systematic errors in transcript abundance estimation](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5143225/) [@Love2016]. +- [Mike Love blog post about sequencing biases]( https://mikelove.wordpress.com/2016/09/26/rna-seq-fragment-sequence-bias/) [@bias-blog] +- [Biases in Illumina transcriptome sequencing caused by random hexamer priming](https://pdfs.semanticscholar.org/9d16/997f5de72d6c606fef3d673db70e5d1d8e1e.pdf?_ga=2.131436679.965169313.1600175795-124991789.1600175795) [@Hansen2010]. +- [Computation for RNA-seq and ChIP-seq studies](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4121056/) [@Pepke2009]. + +## About DESeq2 + +DESeq2 is an R package that is built for differential expression analysis but also has other useful functions for prepping RNA-seq counts data [@Love2014]. +Our refine.bio data is [summarized to the gene-level with tximport](http://docs.refine.bio/en/latest/main_text.html#processing-information) before you download it [@Soneson2015] which is also from the same creators as DESeq2, so they are made to play well together. +We generally like DESeq2 because it has [great documentation and helpful tutorials](#further-resources-for-deseq2). + +### DESeq2 objects + +Many R Bioconductor packages have specialized object types they want your data to be formatted as. +For DESeq2, before we can use a lot the special functions, we need to get our data into a [`DESeqDataSet` object](https://www.rdocumentation.org/packages/DESeq2/versions/1.12.3/topics/DESeqDataSet-class). +`DESeqDataSet` objects not only store your data, but additional transformations of your data, model information, etc. + +From our refine.bio datasets, we will use a function `DESeqDataSetFromMatrix()` to create our [`DESeqDataSet` objects](https://www.rdocumentation.org/packages/DESeq2/versions/1.12.3/topics/DESeqDataSet-class). +This DESeq2 function requires you provide counts and *not* a normalized or corrected value like [TPMs](https://www.youtube.com/watch?v=TTUrtCY2k-w). +Which is why our examples advise downloading [non-quantile normalized](#about-quantile-normalization) from refine.bio. + +### DESeq2 transformation methods + +Our examples recommend using DESeq2 for normalizing your RNA-seq data. +You may have heard about or worked with FPKM, TPM, RPKMs; how does DESeq2's normalization compare? +This [handy table from an online Harvard Bioinformatics Core course nicely summarizes and compares these different methods](https://hbctraining.github.io/DGE_workshop_salmon/lessons/02_DGE_count_normalization.html#common-normalization-methods) [@dge-workshop-deseq2]. +For more about the steps behind DESeq2 normalization, we highly recommend this [StatsQuest video](https://www.youtube.com/watch?v=UFB993xufUU) which explains it quite nicely [@Starmer2017-deseq2]. + +To normalize and transform our data with DESeq2, we generally use `vst()` (variance stabilizing transformation) or `rlog()` (regularized logarithm transformation). +[Both methods are very similar](http://master.bioconductor.org/packages/release/workflows/vignettes/rnaseqGene/inst/doc/rnaseqGene.html#the-variance-stabilizing-transformation-and-the-rlog). +Both _normalize_ your data by correcting for library size differences but they also _transform_ your data [removing the dependence of the variance on the mean](https://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#count-data-transformations), meaning that low mean genes won't have inflated variance from just one or a few samples having higher values than the rest [@Love2020]. +Of the two methods, `rlog()` takes a bit longer to run [@Love2019]. +If you end up using a larger dataset and `rlog()` transformation takes a bit too long, you can switch to using `vst()` with confidence since they yield similar results given the dataset is large enough [@Love2019]. + +### Further resources for DESeq2 + +- [StatsQuest: DESeq2, part 1, Library Normalization](https://www.youtube.com/watch?v=UFB993xufUU) [@Starmer2017-deseq2]. +- [DESeq2 vignette: Analyzing RNA-seq data with DESeq2](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html) [@Love2014]. +- [Beginner's guide to DESeq2](Bhttps://bioc.ism.ac.jp/packages/2.14/bioc/vignettes/DESeq2/inst/doc/beginner.pdf) [@Love2014-guide]. +- [Introduction to DGE - Count Normalization](https://hbctraining.github.io/DGE_workshop_salmon/lessons/02_DGE_count_normalization.html) [@dge-workshop-count-normalization] +- [Introduction to DGE - Using DESeq2](https://hbctraining.github.io/DGE_workshop/lessons/04_DGE_DESeq2_analysis.html) [@dge-workshop-deseq2]. +- [RNA-seq workflow with DESeq2](http://master.bioconductor.org/packages/release/workflows/vignettes/rnaseqGene/inst/doc/rnaseqGene.html#the-variance-stabilizing-transformation-and-the-rlog) + +#### Why isn't the gene I care about in a refine.bio dataset? + +If a gene is not detected in any of the samples in a set through our processing, it is still kept in the `Gene` column, but it will be reported as `0`. +But if the gene you are interested in does not have an Ensembl ID according to the [version of the annotation](TODO: Put link to refine.bio docs FAQ when https://github.com/AlexsLemonade/refinebio-docs/issues/137 is addressed) we used, it will not be reported in any refine.bio download. + +#### What about edgeR? + +In short, both edgeR and DESeq2 are good options and we at the CCDL just went with one of our preferences! [See this blog that summarizes these – by one of the creators of DESeq2](https://mikelove.wordpress.com/2016/09/28/deseq2-or-edger/) – he agrees edgeR is also great. + +If you have strong preferences for edgeR, you can definitely use your refine.bio data with it, but we currently do not have examples of that. +In this case, we'd refer you to [edgeR's section of this example analysis](https://kasperdanielhansen.github.io/genbioconductor/html/Count_Based_RNAseq.html) and wish you the best of luck on your data adventures [@count-based]! + +#### What if I care about isoforms? + +Unfortunately at this time, all download-ready refine.bio data is summarized to the gene level, and there's no great way to examine isoforms with this data. +If your research needs to know transcript isoform information, you may need to look elsewhere. +This [paper discusses some tools for these kinds of questions](https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-017-4002-1) [@Zhang2017]. + + + +## References diff --git a/03-rnaseq/00-intro-to-rnaseq.html b/03-rnaseq/00-intro-to-rnaseq.html index 6e947fbd..028d3d33 100644 --- a/03-rnaseq/00-intro-to-rnaseq.html +++ b/03-rnaseq/00-intro-to-rnaseq.html @@ -1,11 +1,10 @@ - + - @@ -1344,7 +1343,6 @@ } img { max-width:100%; - height: auto; } .tabbed-pane { padding-top: 12px; @@ -1493,6 +1491,7 @@ border: none; display: inline-block; border-radius: 4px; + background-color: transparent; } .tabset-dropdown > .nav-tabs.nav-tabs-open > li { @@ -1521,6 +1520,12 @@ } } +@media print { +.toc-content { + /* see https://github.com/w3c/csswg-drafts/issues/4434 */ + float: right; +} +} .toc-content { padding-left: 30px; @@ -1663,7 +1668,173 @@

    CCDL for ALSF

    - + + +

    Table of Contents generated with DocToc

    + + +
    +

    0.1 Introduction to RNA-seq technology

    +

    Data analyses are generally not “one size fits all”; this is particularly true between RNA-seq vs microarray data. This tutorial has example analyses organized by technology so you can follow examples that are more closely tailored to the nature of the data at hand.

    +

    As with all experimental methods, RNA-seq has strengths and limitations that you should consider in regards to your scientific questions.

    +
    +

    0.1.1 RNA-seq data strengths

    +
      +
    • RNA-seq can assay unknown transcripts, as it is not bound to a pre-determined set of probes like microarrays (Wang et al.).
    • +
    • Its values are considered more dynamic than microarray values which are constrained to a smaller range based on background signal and probesets being saturated (Wang et al.).
    • +
    +
    +
    +

    0.1.2 RNA-seq data limitations/biases

    +

    The nature of sequencing introduces several different kinds of biases:

    +
      +
    • GC bias: higher GC content sequences are less likely to be observed.
      +
    • +
    • 3’ bias (positional bias): for most sequencing methods, the 3 prime end of transcripts are more likely to be observed.
      +
    • +
    • Complexity bias: some sequences are easier to be bound and amplified than others.
      +
    • +
    • Library size or sequencing depth: the total number of reads is not always equivalent between samples.
      +
    • +
    • Gene length: longer genes are more likely to be observed.
    • +
    +

    Love (2016) discusses these biases in this blog post which includes this handy figure.

    +

    +

    Most normalization methods, including refine.bio’s processing methods, attempt to mitigate these biases, but these biases can never be fully negated. Some of these biases have been addressed to the extent that they can by our refine.bio processing methods so you don’t have to worry too much about them. In brief, refine.bio data is quantified by Salmon using their correction algorithms: --seqbias and --gcbias.

    +
    +
    +

    0.1.3 About quantile normalization

    +

    refine.bio data is available for you quantile normalized, which can address some library size biases. But more often than not, our example modules will recommend using the option for downloading non-quantile normalized data (note that this is RNA-seq specific, and microarray data does not have this download option).

    +

    +

    See here for more about the quantile normalization process in refine.bio.

    +
    + +
    +
    +

    0.2 About DESeq2

    +

    DESeq2 is an R package that is built for differential expression analysis but also has other useful functions for prepping RNA-seq counts data (Love et al. 2014). Our refine.bio data is summarized to the gene-level with tximport before you download it (Soneson et al. 2015) which is also from the same creators as DESeq2, so they are made to play well together. We generally like DESeq2 because it has great documentation and helpful tutorials.

    +
    +

    0.2.1 DESeq2 objects

    +

    Many R Bioconductor packages have specialized object types they want your data to be formatted as. For DESeq2, before we can use a lot the special functions, we need to get our data into a DESeqDataSet object. DESeqDataSet objects not only store your data, but additional transformations of your data, model information, etc.

    +

    From our refine.bio datasets, we will use a function DESeqDataSetFromMatrix() to create our DESeqDataSet objects. This DESeq2 function requires you provide counts and not a normalized or corrected value like TPMs. Which is why our examples advise downloading non-quantile normalized from refine.bio.

    +
    +
    +

    0.2.2 DESeq2 transformation methods

    +

    Our examples recommend using DESeq2 for normalizing your RNA-seq data. You may have heard about or worked with FPKM, TPM, RPKMs; how does DESeq2’s normalization compare? This handy table from an online Harvard Bioinformatics Core course nicely summarizes and compares these different methods (Harvard Chan Bioinformatics Core (HBC)). For more about the steps behind DESeq2 normalization, we highly recommend this StatsQuest video which explains it quite nicely (Josh Starmer 2017b).

    +

    To normalize and transform our data with DESeq2, we generally use vst() (variance stabilizing transformation) or rlog() (regularized logarithm transformation). Both methods are very similar. Both normalize your data by correcting for library size differences but they also transform your data removing the dependence of the variance on the mean, meaning that low mean genes won’t have inflated variance from just one or a few samples having higher values than the rest (Michael I. Love, Simon Anders, and Wolfgang Huber 2020). Of the two methods, rlog() takes a bit longer to run (Michael I. Love and Huber 2019). If you end up using a larger dataset and rlog() transformation takes a bit too long, you can switch to using vst() with confidence since they yield similar results given the dataset is large enough (Michael I. Love and Huber 2019).

    +
    +
    +

    0.2.3 Further resources for DESeq2

    + +
    +

    0.2.3.1 Why isn’t the gene I care about in a refine.bio dataset?

    +

    If a gene is not detected in any of the samples in a set through our processing, it is still kept in the Gene column, but it will be reported as 0. But if the gene you are interested in does not have an Ensembl ID according to the version of the annotation we used, it will not be reported in any refine.bio download.

    +
    +
    +

    0.2.3.2 What about edgeR?

    +

    In short, both edgeR and DESeq2 are good options and we at the CCDL just went with one of our preferences! See this blog that summarizes these – by one of the creators of DESeq2 – he agrees edgeR is also great.

    +

    If you have strong preferences for edgeR, you can definitely use your refine.bio data with it, but we currently do not have examples of that. In this case, we’d refer you to edgeR’s section of this example analysis and wish you the best of luck on your data adventures (Kasper D. Hansen)!

    +
    +
    +

    0.2.3.3 What if I care about isoforms?

    +

    Unfortunately at this time, all download-ready refine.bio data is summarized to the gene level, and there’s no great way to examine isoforms with this data. If your research needs to know transcript isoform information, you may need to look elsewhere. This paper discusses some tools for these kinds of questions (Zhang et al. 2017).

    + +
    +
    +
    +
    +

    References

    +
    +
    +

    Hansen K. D., S. E. Brenner, and S. Dudoit, 2010 Biases in Illumina transcriptome sequencing caused by random hexamer priming. Nucleic Acids Res. 38: e131.

    +
    +
    +

    Harvard Chan Bioinformatics Core (HBC), Introduction to dge - deseq2 analysis

    +
    +
    +

    Harvard Chan Bioinformatics Core (HBC), Introduction to dge - count normalization

    +
    +
    +

    James Hadfield, 2016 An introduction to rna-seq

    +
    +
    +

    Josh Starmer, 2017a StatQuest: A gentle introduction to rna-seq

    +
    +
    +

    Josh Starmer, 2017b StatQuest: DESeq2, part 1, library normalization

    +
    +
    +

    Kasper D. Hansen, Count based rna-seq analysis

    +
    +
    +

    Love M. I., W. Huber, and S. Anders, 2014 Moderated estimation of fold change and dispersion for rna-seq data with deseq2. Genome Biology 15. https://doi.org/10.1186/s13059-014-0550-8

    +
    +
    +

    Love M. I., 2016 RNA-seq fragment sequence bias

    +
    +
    +

    Love M. I., J. B. Hogenesch, and R. A. Irizarry, 2016 Modeling of RNA-seq fragment sequence bias reduces systematic errors in transcript abundance estimation. Nat. Biotechnol. 34: 1287–1291.

    +
    +
    +

    Michael I. Love Simon Anders, and W. Huber, 2014 Beginner’s guide to using the deseq2 package

    +
    +
    +

    Michael I. Love V. K. Simon Anders, and W. Huber, 2019 RNA-seq workflow: Gene-level exploratory analysis and differential expression

    +
    +
    +

    Michael I. Love, Simon Anders, and Wolfgang Huber, 2020 Analyzing rna-seq data with deseq2

    +
    +
    +

    Pepke S., B. Wold, and A. Mortazavi, 2009 Computation for ChIP-seq and RNA-seq studies. Nat. Methods 6: 22–32.

    +
    +
    +

    Soneson C., M. I. Love, and M. D. Robinson, 2015 Differential analyses for rna-seq: Transcript-level estimates improve gene-level inferences. F1000Research 4. https://doi.org/10.12688/f1000research.7563.1

    +
    +
    +

    Wang Z., M. Gerstein, and M. Snyder, RNA-seq: A revolutionary tool for transcriptomics. Nature reviews. Genetics 10: 57–63. https://doi.org/10.1038/nrg2484

    +
    +
    +

    Zhang C., B. Zhang, L. L. Lin, and S. Zhao, 2017 Evaluation and comparison of computational tools for RNA-seq isoform quantification. BMC Genomics 18: 583.

    +
    +
    +
    @@ -1715,7 +1886,7 @@

    CCDL for ALSF

    theme: "bootstrap3", context: '.toc-content', hashGenerator: function (text) { - return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_').toLowerCase(); + return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_'); }, ignoreSelector: ".toc-ignore", scrollTo: 0 diff --git a/03-rnaseq/clustering_rnaseq_01_heatmap.Rmd b/03-rnaseq/clustering_rnaseq_01_heatmap.Rmd index 73eb8556..eda37a11 100644 --- a/03-rnaseq/clustering_rnaseq_01_heatmap.Rmd +++ b/03-rnaseq/clustering_rnaseq_01_heatmap.Rmd @@ -228,7 +228,7 @@ Now we are going to use a combination of functions from the `DESeq2` and `pheatm ## Create a DESeqDataset We will be using the `DESeq2` package for normalization, which requires us to format our data into a `DESeqDataSet` object. -We turn the data frame (or matrix) into a [`DESeqDataSet` object](TODO: Add link to header section) and specify which variable labels our experimental groups using the [`design` argument](http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#multi-factor-designs) [@Love2014]. +We turn the data frame (or matrix) into a [`DESeqDataSet` object](https://alexslemonade.github.io/refinebio-examples/03-rnaseq/00-intro-to-rnaseq.html#02_About_DESeq2). ) and specify which variable labels our experimental groups using the [`design` argument](http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#multi-factor-designs) [@Love2014]. In this chunk of code, we will not provide a specific model to the `design` argument because we are not performing a differential expression analysis. ```{r} @@ -261,7 +261,7 @@ dds <- dds[genes_to_keep, ] ## Perform DESeq2 normalization We are going to use the `rlog()` function from the `DESeq2` package to normalize the data. -TODO: Add a note/link to section in documentation on why we use `rlog` vs `vst` and `normTransform`. +For more information about these transformation methods, [see here](https://alexslemonade.github.io/refinebio-examples/03-rnaseq/00-intro-to-rnaseq.html#deseq2-transformation-methods). ```{r} # Normalize the data in the `DESeqDataSet` object using the `rlog()` function from the `DESEq2` R package diff --git a/03-rnaseq/clustering_rnaseq_01_heatmap.html b/03-rnaseq/clustering_rnaseq_01_heatmap.html index 20def9d8..f07f8733 100644 --- a/03-rnaseq/clustering_rnaseq_01_heatmap.html +++ b/03-rnaseq/clustering_rnaseq_01_heatmap.html @@ -1,11 +1,10 @@ - + - @@ -1344,7 +1343,6 @@ } img { max-width:100%; - height: auto; } .tabbed-pane { padding-top: 12px; @@ -1493,6 +1491,7 @@ border: none; display: inline-block; border-radius: 4px; + background-color: transparent; } .tabset-dropdown > .nav-tabs.nav-tabs-open > li { @@ -1521,6 +1520,12 @@ } } +@media print { +.toc-content { + /* see https://github.com/w3c/csswg-drafts/issues/4434 */ + float: right; +} +} .toc-content { padding-left: 30px; @@ -1835,7 +1840,6 @@

    4.1 Install libraries

    ## The following objects are masked from 'package:Biobase':
     ## 
     ##     anyMissing, rowMedians
    -
    ## Loading required package: BiocParallel
    ## 
     ## Attaching package: 'DelayedArray'
    ## The following objects are masked from 'package:matrixStats':
    @@ -1890,12 +1894,12 @@ 

    4.2 Import and set up data

    ## # A tibble: 6 x 20
     ##   refinebio_acces… experiment_acce… refinebio_age refinebio_cell_…
     ##   <chr>            <chr>            <lgl>         <lgl>           
    -## 1 SRR3189684       SRP070849        NA            NA              
    -## 2 SRR3189690       SRP070849        NA            NA              
    -## 3 SRR3189691       SRP070849        NA            NA              
    -## 4 SRR3189683       SRP070849        NA            NA              
    -## 5 SRR3189693       SRP070849        NA            NA              
    -## 6 SRR3189692       SRP070849        NA            NA              
    +## 1 SRR3189694       SRP070849        NA            NA              
    +## 2 SRR3189680       SRP070849        NA            NA              
    +## 3 SRR3189690       SRP070849        NA            NA              
    +## 4 SRR3189684       SRP070849        NA            NA              
    +## 5 SRR3189683       SRP070849        NA            NA              
    +## 6 SRR3189691       SRP070849        NA            NA              
     ## # … with 16 more variables: refinebio_compound <lgl>, refinebio_disease <lgl>,
     ## #   refinebio_disease_stage <lgl>, refinebio_genetic_information <lgl>,
     ## #   refinebio_organism <chr>, refinebio_platform <chr>,
    @@ -1914,7 +1918,7 @@ 

    4.2 Import and set up data

    4.3 Create a DESeqDataset

    -

    We will be using the DESeq2 package for normalization, which requires us to format our data into a DESeqDataSet object. We turn the data frame (or matrix) into a DESeqDataSet object and specify which variable labels our experimental groups using the design argument (Love et al. 2014). In this chunk of code, we will not provide a specific model to the design argument because we are not performing a differential expression analysis.

    +

    We will be using the DESeq2 package for normalization, which requires us to format our data into a DESeqDataSet object. We turn the data frame (or matrix) into a DESeqDataSet object. ) and specify which variable labels our experimental groups using the design argument (Love et al. 2014). In this chunk of code, we will not provide a specific model to the design argument because we are not performing a differential expression analysis.

    # The `DESeqDataSetFromMatrix()` function needs the values to be converted to integers
     df <- df %>%
       # Mutate numeric variables to be integers
    @@ -1938,7 +1942,7 @@ 

    4.4 Define a minimum counts cutof

    4.5 Perform DESeq2 normalization

    -

    We are going to use the rlog() function from the DESeq2 package to normalize the data. TODO: Add a note/link to section in documentation on why we use rlog vs vst and normTransform.

    +

    We are going to use the rlog() function from the DESeq2 package to normalize the data. For more information about these transformation methods, see here.

    # Normalize the data in the `DESeqDataSet` object using the `rlog()` function from the `DESEq2` R package
     dds_norm <- rlog(dds)
    @@ -1974,7 +1978,7 @@

    4.7 Create a heatmap

    ), scale = "row" # Scale values in the direction of genes (rows) )

    -

    +

    We’ve created a heatmap but although our genes and samples are clustered, there is not much information that we can gather here because we did not provide the pheatmap() function with annotation labels for our samples.

    First let’s save our clustered heatmap.

    @@ -1991,8 +1995,8 @@

    4.7.1 Save heatmap as a png

    # Close the png file: dev.off()
    -
    ## quartz_off_screen 
    -##                 2
    +
    ## png 
    +##   2

    Now, let’s add some annotation bars to our heatmap.

    @@ -2038,7 +2042,7 @@

    4.8.1 Create annotated heatmap -

    +

    Now that we have annotation bars on our heatmap, we have a better idea of the sample variable groups that appear to cluster together.

    Let’s save our annotated heatmap.

    @@ -2056,8 +2060,8 @@

    4.8.2 Save annotated heatmap as a # Close the png file: dev.off() -
    ## quartz_off_screen 
    -##                 2
    +
    ## png 
    +##   2

    @@ -2073,58 +2077,59 @@

    6 Session info

    At the end of every analysis, before saving your notebook, we recommend printing out your session info. This helps make your code more reproducible by recording what versions of softwares and packages you used to run this.

    # Print session info
     sessionInfo()
    -
    ## R version 3.6.1 (2019-07-05)
    -## Platform: x86_64-apple-darwin15.6.0 (64-bit)
    -## Running under: macOS Mojave 10.14.6
    +
    ## R version 4.0.2 (2020-06-22)
    +## Platform: x86_64-pc-linux-gnu (64-bit)
    +## Running under: Ubuntu 20.04 LTS
     ## 
     ## Matrix products: default
    -## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
    -## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
    +## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-openmp/libopenblasp-r0.3.8.so
     ## 
     ## locale:
    -## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
    +##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
    +##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
    +##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C             
    +##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
    +##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
    +## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
     ## 
     ## attached base packages:
     ## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
     ## [8] methods   base     
     ## 
     ## other attached packages:
    -##  [1] magrittr_1.5                DESeq2_1.26.0              
    -##  [3] SummarizedExperiment_1.15.6 DelayedArray_0.11.4        
    -##  [5] BiocParallel_1.19.2         matrixStats_0.54.0         
    -##  [7] Biobase_2.45.0              GenomicRanges_1.37.14      
    -##  [9] GenomeInfoDb_1.21.1         IRanges_2.19.10            
    -## [11] S4Vectors_0.23.23           BiocGenerics_0.31.5        
    -## [13] pheatmap_1.0.12             optparse_1.6.2             
    +##  [1] magrittr_1.5                DESeq2_1.28.1              
    +##  [3] SummarizedExperiment_1.18.2 DelayedArray_0.14.1        
    +##  [5] matrixStats_0.56.0          Biobase_2.48.0             
    +##  [7] GenomicRanges_1.40.0        GenomeInfoDb_1.24.2        
    +##  [9] IRanges_2.22.2              S4Vectors_0.26.1           
    +## [11] BiocGenerics_0.34.0         pheatmap_1.0.12            
    +## [13] optparse_1.6.6             
     ## 
     ## loaded via a namespace (and not attached):
    -##  [1] bitops_1.0-6           bit64_0.9-7            RColorBrewer_1.1-2    
    -##  [4] tools_3.6.1            backports_1.1.7        utf8_1.1.4            
    -##  [7] R6_2.4.1               rpart_4.1-15           Hmisc_4.2-0           
    -## [10] DBI_1.0.0              lazyeval_0.2.2         colorspace_1.4-1      
    -## [13] nnet_7.3-12            withr_2.2.0            tidyselect_0.2.5      
    -## [16] gridExtra_2.3          bit_1.1-14             compiler_3.6.1        
    -## [19] cli_2.0.2              htmlTable_1.13.1       scales_1.0.0          
    -## [22] checkmate_1.9.4        readr_1.3.1            genefilter_1.67.1     
    -## [25] stringr_1.4.0          digest_0.6.25          foreign_0.8-72        
    -## [28] rmarkdown_1.14         XVector_0.25.0         base64enc_0.1-3       
    -## [31] pkgconfig_2.0.3        htmltools_0.3.6        styler_1.1.1          
    -## [34] htmlwidgets_1.3        rlang_0.4.6            rstudioapi_0.11       
    -## [37] RSQLite_2.1.2          acepack_1.4.1          dplyr_0.8.3           
    -## [40] RCurl_1.95-4.12        GenomeInfoDbData_1.2.1 Formula_1.2-3         
    -## [43] Matrix_1.2-17          Rcpp_1.0.4.6           munsell_0.5.0         
    -## [46] fansi_0.4.1            lifecycle_0.2.0        stringi_1.4.6         
    -## [49] yaml_2.2.1             zlibbioc_1.31.0        grid_3.6.1            
    -## [52] blob_1.2.0             crayon_1.3.4           lattice_0.20-38       
    -## [55] splines_3.6.1          annotate_1.63.0        hms_0.5.3             
    -## [58] locfit_1.5-9.1         knitr_1.28             pillar_1.4.4          
    -## [61] geneplotter_1.63.0     XML_3.98-1.20          glue_1.4.1            
    -## [64] evaluate_0.14          latticeExtra_0.6-28    data.table_1.12.2     
    -## [67] vctrs_0.3.1            gtable_0.3.0           getopt_1.20.3         
    -## [70] purrr_0.3.4            rematch2_2.1.0         assertthat_0.2.1      
    -## [73] ggplot2_3.2.1          xfun_0.14              xtable_1.8-4          
    -## [76] survival_2.44-1.1      tibble_3.0.1           memoise_1.1.0         
    -## [79] AnnotationDbi_1.47.0   cluster_2.1.0          ellipsis_0.3.1
    +## [1] bit64_0.9-7.1 splines_4.0.2 R.utils_2.9.2 +## [4] assertthat_0.2.1 blob_1.2.1 GenomeInfoDbData_1.2.3 +## [7] yaml_2.2.1 pillar_1.4.6 RSQLite_2.2.0 +## [10] backports_1.1.9 lattice_0.20-41 glue_1.4.1 +## [13] digest_0.6.25 RColorBrewer_1.1-2 XVector_0.28.0 +## [16] colorspace_1.4-1 htmltools_0.5.0 Matrix_1.2-18 +## [19] R.oo_1.23.0 XML_3.99-0.5 pkgconfig_2.0.3 +## [22] genefilter_1.70.0 zlibbioc_1.34.0 purrr_0.3.4 +## [25] xtable_1.8-4 scales_1.1.1 getopt_1.20.3 +## [28] BiocParallel_1.22.0 tibble_3.0.3 annotate_1.66.0 +## [31] styler_1.3.2 farver_2.0.3 generics_0.0.2 +## [34] ggplot2_3.3.2 ellipsis_0.3.1 cli_2.0.2 +## [37] survival_3.1-12 crayon_1.3.4 memoise_1.1.0 +## [40] evaluate_0.14 R.methodsS3_1.8.0 fansi_0.4.1 +## [43] R.cache_0.14.0 tools_4.0.2 hms_0.5.3 +## [46] lifecycle_0.2.0 stringr_1.4.0 locfit_1.5-9.4 +## [49] munsell_0.5.0 AnnotationDbi_1.50.3 compiler_4.0.2 +## [52] rlang_0.4.7 grid_4.0.2 RCurl_1.98-1.2 +## [55] rstudioapi_0.11 bitops_1.0-6 rmarkdown_2.3 +## [58] gtable_0.3.0 DBI_1.1.0 rematch2_2.1.2 +## [61] R6_2.4.1 knitr_1.29 dplyr_1.0.0 +## [64] utf8_1.1.4 bit_1.1-15.2 readr_1.3.1 +## [67] stringi_1.4.6 Rcpp_1.0.5 vctrs_0.3.2 +## [70] geneplotter_1.66.0 tidyselect_1.1.0 xfun_0.16

    Gu Z., R. Eils, and M. Schlesner, 2016 Complex heatmaps reveal patterns and correlations in multidimensional genomic data. Bioinformatics.

    @@ -2191,7 +2196,7 @@

    6 Session info

    theme: "bootstrap3", context: '.toc-content', hashGenerator: function (text) { - return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_').toLowerCase(); + return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_'); }, ignoreSelector: ".toc-ignore", scrollTo: 0 diff --git a/03-rnaseq/differential-expression_rnaseq_01.html b/03-rnaseq/differential-expression_rnaseq_01.html index 6509daeb..87faf59e 100644 --- a/03-rnaseq/differential-expression_rnaseq_01.html +++ b/03-rnaseq/differential-expression_rnaseq_01.html @@ -1,11 +1,10 @@ - + - @@ -1344,7 +1343,6 @@ } img { max-width:100%; - height: auto; } .tabbed-pane { padding-top: 12px; @@ -1493,6 +1491,7 @@ border: none; display: inline-block; border-radius: 4px; + background-color: transparent; } .tabset-dropdown > .nav-tabs.nav-tabs-open > li { @@ -1521,6 +1520,12 @@ } } +@media print { +.toc-content { + /* see https://github.com/w3c/csswg-drafts/issues/4434 */ + float: right; +} +} .toc-content { padding-left: 30px; @@ -1835,7 +1840,6 @@

    4.1 Install libraries

    ## The following objects are masked from 'package:Biobase':
     ## 
     ##     anyMissing, rowMedians
    -
    ## Loading required package: BiocParallel
    ## 
     ## Attaching package: 'DelayedArray'
    ## The following objects are masked from 'package:matrixStats':
    @@ -1907,8 +1911,8 @@ 

    4.2 Import data and metadata

    ## [1] TRUE

    The information we need to make the comparison is in the refinebio_title column of the metadata data.frame.

    head(metadata$refinebio_title)
    -
    ## [1] "CBF369-Blood-ASXL2"   "41267-BM-ASXL2"       "CBF234-BM-ASXLwt"    
    -## [4] "40810-BM-ASXLwt"      "49060-2010-BM-ASXLwt" "CBF77-Blood-ASXL2"
    +
    ## [1] "CBF54-BM-ASXLwt"      "49060-2010-BM-ASXLwt" "CBF234-BM-ASXLwt"    
    +## [4] "CBF124-BM-ASXLwt"     "41267-BM-ASXL2"       "45565-BM-ASXL1"

    4.3 Set up metadata

    @@ -1932,16 +1936,16 @@

    4.3 Set up metadata

    ## # A tibble: 6 x 2
     ##   refinebio_title      asxl_mutation_status
     ##   <chr>                <chr>               
    -## 1 CBF369-Blood-ASXL2   asxl_mutation       
    -## 2 41267-BM-ASXL2       asxl_mutation       
    +## 1 CBF54-BM-ASXLwt      no_mutation         
    +## 2 49060-2010-BM-ASXLwt no_mutation         
     ## 3 CBF234-BM-ASXLwt     no_mutation         
    -## 4 40810-BM-ASXLwt      no_mutation         
    -## 5 49060-2010-BM-ASXLwt no_mutation         
    -## 6 CBF77-Blood-ASXL2    asxl_mutation
    +## 4 CBF124-BM-ASXLwt no_mutation +## 5 41267-BM-ASXL2 asxl_mutation +## 6 45565-BM-ASXL1 asxl_mutation

    Before we set up our model in the next step, we want to check if our modeling variable is set correctly. We want our “control” to to be set as the first level in the variable we provide as our experimental variable. Here we will use the str() function to print out a preview of the structure of our variable

    # Print out a preview of `asxl_mutation_status`
     str(metadata$asxl_mutation_status)
    -
    ##  chr [1:16] "asxl_mutation" "asxl_mutation" "no_mutation" "no_mutation" ...
    +
    ##  chr [1:16] "no_mutation" "no_mutation" "no_mutation" "no_mutation" ...

    Currently, asxl_mutation_status is a character. To make sure it is set how we want for the DESeq object and subsequent testing, let’s mutate it to a factor so we can explicitly set the levels.

    # Make asxl_mutation_status a factor and set the levels appropriately
     metadata <- metadata %>%
    @@ -1997,32 +2001,23 @@ 

    4.6 Run differential expression a coef = 2, # This is based on what log fold change coefficient was used in DESeq(), the default is 2. res = deseq_results # This needs to be the DESeq results table )

    -
    ## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
    -## 
    -## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
    -## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
    -## Reference: https://doi.org/10.1093/bioinformatics/bty895
    +
    ## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    +##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    +##     sequence count data: removing the noise and preserving large differences.
    +##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

    Now let’s take a peek at what our results table looks like.

    head(deseq_results)
    ## log2 fold change (MAP): asxl mutation status asxl mutation vs no mutation 
     ## Wald test p-value: asxl mutation status asxl mutation vs no mutation 
    -## DataFrame with 6 rows and 6 columns
    -##                          baseMean      log2FoldChange             lfcSE
    -##                         <numeric>           <numeric>         <numeric>
    -## ENSG00000000003  52.8520587655854    0.12593859821352 0.292427834964723
    -## ENSG00000000005 0.260055561421088  0.0412286839932318 0.143323523204828
    -## ENSG00000000419  406.161355152407 -0.0872933065549999 0.232613912574037
    -## ENSG00000000457  564.784020901503  -0.239724224515893  0.26943713590588
    -## ENSG00000000460  401.130684469072   0.373394326554735 0.267898911424576
    -## ENSG00000000938  1500.52744835278   0.623523965932518 0.304354167130984
    -##                               stat             pvalue              padj
    -##                          <numeric>          <numeric>         <numeric>
    -## ENSG00000000003  0.439280729657335  0.660458135481068 0.998525105718721
    -## ENSG00000000005 -0.554658424129523  0.579128318966208                NA
    -## ENSG00000000419 -0.375033246094286  0.707635741349663 0.998525105718721
    -## ENSG00000000457 -0.892738568914621  0.371997190749564 0.998525105718721
    -## ENSG00000000460   1.39583439943176  0.162764368904366 0.998525105718721
    -## ENSG00000000938   2.10307054824782 0.0354596030191536 0.720600898549108
    +## DataFrame with 6 rows and 5 columns +## baseMean log2FoldChange lfcSE pvalue padj +## <numeric> <numeric> <numeric> <numeric> <numeric> +## ENSG00000000003 52.852059 9.64776e-07 0.00144269 0.6604581 0.998525 +## ENSG00000000005 0.260056 2.03308e-07 0.00144269 0.5791283 NA +## ENSG00000000419 406.161355 -1.38160e-06 0.00144268 0.7076357 0.998525 +## ENSG00000000457 564.784021 -2.36497e-06 0.00144268 0.3719972 0.998525 +## ENSG00000000460 401.130684 3.87280e-06 0.00144269 0.1627644 0.998525 +## ENSG00000000938 1500.527448 3.07435e-06 0.00144269 0.0354596 0.720601

    Note it is not filtered or sorted, so we will use tidyverse to do this before saving our results to a file. Sort and filter the results.

    # this is of class DESeqResults -- we want a data frame
     deseq_df <- deseq_results %>%
    @@ -2037,25 +2032,25 @@ 

    4.6 Run differential expression a dplyr::arrange(dplyr::desc(log2FoldChange))

    Let’s print out what the top results are.

    head(deseq_df)
    -
    ##              Gene    baseMean log2FoldChange     lfcSE       stat       pvalue
    -## 1 ENSG00000115590  152.279126       1.873505 0.3037832  3.3030575 0.0009563678
    -## 2 ENSG00000273079   26.113327       1.687493 0.3045453  2.3191410 0.0203873911
    -## 3 ENSG00000154589   51.259590       1.686259 0.3031344  1.9123102 0.0558364219
    -## 4 ENSG00000164588    9.276832       1.659394 0.2705837 -0.3576455 0.7206086031
    -## 5 ENSG00000103569 1218.109594       1.636066 0.3035410  3.3306457 0.0008664481
    -## 6 ENSG00000244115   92.694075       1.571892 0.2753955  1.3663784 0.1718202238
    -##        padj threshold
    -## 1 0.1782265     FALSE
    -## 2 0.6263925     FALSE
    -## 3 0.8208543     FALSE
    -## 4        NA        NA
    -## 5 0.1690009     FALSE
    -## 6 0.9985251     FALSE
    +
    ##              Gene   baseMean log2FoldChange     lfcSE       pvalue      padj
    +## 1 ENSG00000049247  19.204037       6.174579 2.2940036 0.0167489785 0.5761216
    +## 2 ENSG00000162551 179.261210       5.824919 1.2427084 0.0008553285 0.1690009
    +## 3 ENSG00000273079  26.113327       4.172007 0.7078167 0.0203873911 0.6263925
    +## 4 ENSG00000148926  96.538019       3.952953 0.8327353 0.0284780003 0.6768306
    +## 5 ENSG00000251139   5.018181       3.867049 0.8624662 0.0270431735        NA
    +## 6 ENSG00000108244  17.988423       3.731020 0.9987549 0.0012635570 0.2025067
    +##   threshold
    +## 1     FALSE
    +## 2     FALSE
    +## 3     FALSE
    +## 4     FALSE
    +## 5        NA
    +## 6     FALSE

    4.6.1 Check results by plotting one gene

    To double check what a differentially expressed gene looks like, we can plot one with DESeq2::plotCounts() function.

    plotCounts(ddset, gene = "ENSG00000196074", intgroup = "asxl_mutation_status")
    -

    +

    The mutation group samples have higher expression of this gene than the control group, which helps assure us that the results are showing us what we are looking for.

    @@ -2079,7 +2074,7 @@

    4.8 Create a volcano plot

    x = "log2FoldChange", # The variable in `deseq_df` you want to be plotted on the x axis y = "padj" # The variable in `deseq_df` you want to be plotted on the y axis ) -

    +

    Here the red point is the gene that meets both the default p value and log2 fold change cutoff (which are 10e-6 and 1 respectively).

    We used the adjusted p values for our plot above, so you may want to loosen this cutoff with the pCutoff argument (Take a look at all the options for tailoring this plot using ?EnhancedVolcano).

    Let’s make the same plot again, but adjust the pCutoff since we are using multiple-testing corrected p values and this time we will assign the plot to our environment as volcano_plot.

    @@ -2094,7 +2089,7 @@

    4.8 Create a volcano plot

    # Print out plot here volcano_plot -

    +

    This looks pretty good. Let’s save it to a PNG.

    ggsave(
       plot = volcano_plot,
    @@ -2118,59 +2113,63 @@ 

    6 Session info

    At the end of every analysis, before saving your notebook, we recommend printing out your session info. This helps make your code more reproducible by recording what versions of softwares and packages you used to run this.

    # Print session info
     sessionInfo()
    -
    ## R version 3.6.1 (2019-07-05)
    -## Platform: x86_64-apple-darwin15.6.0 (64-bit)
    -## Running under: macOS Mojave 10.14.6
    +
    ## R version 4.0.2 (2020-06-22)
    +## Platform: x86_64-pc-linux-gnu (64-bit)
    +## Running under: Ubuntu 20.04 LTS
     ## 
     ## Matrix products: default
    -## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
    -## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
    +## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-openmp/libopenblasp-r0.3.8.so
     ## 
     ## locale:
    -## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
    +##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
    +##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
    +##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C             
    +##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
    +##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
    +## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
     ## 
     ## attached base packages:
     ## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
     ## [8] methods   base     
     ## 
     ## other attached packages:
    -##  [1] magrittr_1.5                ggplot2_3.2.1              
    -##  [3] DESeq2_1.26.0               SummarizedExperiment_1.15.6
    -##  [5] DelayedArray_0.11.4         BiocParallel_1.19.2        
    -##  [7] matrixStats_0.54.0          Biobase_2.45.0             
    -##  [9] GenomicRanges_1.37.14       GenomeInfoDb_1.21.1        
    -## [11] IRanges_2.19.10             S4Vectors_0.23.23          
    -## [13] BiocGenerics_0.31.5         optparse_1.6.2             
    +##  [1] magrittr_1.5                ggplot2_3.3.2              
    +##  [3] DESeq2_1.28.1               SummarizedExperiment_1.18.2
    +##  [5] DelayedArray_0.14.1         matrixStats_0.56.0         
    +##  [7] Biobase_2.48.0              GenomicRanges_1.40.0       
    +##  [9] GenomeInfoDb_1.24.2         IRanges_2.22.2             
    +## [11] S4Vectors_0.26.1            BiocGenerics_0.34.0        
    +## [13] optparse_1.6.6             
     ## 
     ## loaded via a namespace (and not attached):
    -##  [1] bitops_1.0-6           bit64_0.9-7            RColorBrewer_1.1-2    
    -##  [4] tools_3.6.1            backports_1.1.7        utf8_1.1.4            
    -##  [7] R6_2.4.1               rpart_4.1-15           Hmisc_4.2-0           
    -## [10] DBI_1.0.0              lazyeval_0.2.2         colorspace_1.4-1      
    -## [13] nnet_7.3-12            withr_2.2.0            tidyselect_0.2.5      
    -## [16] gridExtra_2.3          bit_1.1-14             compiler_3.6.1        
    -## [19] cli_2.0.2              htmlTable_1.13.1       labeling_0.3          
    -## [22] scales_1.0.0           checkmate_1.9.4        readr_1.3.1           
    -## [25] genefilter_1.67.1      stringr_1.4.0          digest_0.6.25         
    -## [28] foreign_0.8-72         rmarkdown_1.14         XVector_0.25.0        
    -## [31] base64enc_0.1-3        pkgconfig_2.0.3        htmltools_0.3.6       
    -## [34] styler_1.1.1           htmlwidgets_1.3        rlang_0.4.6           
    -## [37] rstudioapi_0.11        RSQLite_2.1.2          acepack_1.4.1         
    -## [40] dplyr_0.8.3            RCurl_1.95-4.12        GenomeInfoDbData_1.2.1
    -## [43] Formula_1.2-3          Matrix_1.2-17          Rcpp_1.0.4.6          
    -## [46] munsell_0.5.0          fansi_0.4.1            lifecycle_0.2.0       
    -## [49] stringi_1.4.6          yaml_2.2.1             zlibbioc_1.31.0       
    -## [52] grid_3.6.1             blob_1.2.0             ggrepel_0.8.1         
    -## [55] crayon_1.3.4           lattice_0.20-38        splines_3.6.1         
    -## [58] annotate_1.63.0        hms_0.5.3              locfit_1.5-9.1        
    -## [61] knitr_1.28             pillar_1.4.4           geneplotter_1.63.0    
    -## [64] XML_3.98-1.20          glue_1.4.1             evaluate_0.14         
    -## [67] latticeExtra_0.6-28    data.table_1.12.2      EnhancedVolcano_1.4.0 
    -## [70] vctrs_0.3.1            gtable_0.3.0           getopt_1.20.3         
    -## [73] purrr_0.3.4            rematch2_2.1.0         assertthat_0.2.1      
    -## [76] xfun_0.14              xtable_1.8-4           survival_2.44-1.1     
    -## [79] tibble_3.0.1           AnnotationDbi_1.47.0   memoise_1.1.0         
    -## [82] cluster_2.1.0          ellipsis_0.3.1
    +## [1] bitops_1.0-6 bit64_0.9-7.1 RColorBrewer_1.1-2 +## [4] numDeriv_2016.8-1.1 R.cache_0.14.0 tools_4.0.2 +## [7] backports_1.1.9 utf8_1.1.4 R6_2.4.1 +## [10] DBI_1.1.0 colorspace_1.4-1 apeglm_1.10.0 +## [13] withr_2.2.0 tidyselect_1.1.0 bit_1.1-15.2 +## [16] compiler_4.0.2 cli_2.0.2 labeling_0.3 +## [19] scales_1.1.1 mvtnorm_1.1-1 readr_1.3.1 +## [22] genefilter_1.70.0 stringr_1.4.0 digest_0.6.25 +## [25] rmarkdown_2.3 R.utils_2.9.2 XVector_0.28.0 +## [28] pkgconfig_2.0.3 htmltools_0.5.0 styler_1.3.2 +## [31] bbmle_1.0.23.1 rlang_0.4.7 rstudioapi_0.11 +## [34] RSQLite_2.2.0 farver_2.0.3 generics_0.0.2 +## [37] BiocParallel_1.22.0 dplyr_1.0.0 R.oo_1.23.0 +## [40] RCurl_1.98-1.2 GenomeInfoDbData_1.2.3 Matrix_1.2-18 +## [43] Rcpp_1.0.5 munsell_0.5.0 fansi_0.4.1 +## [46] lifecycle_0.2.0 R.methodsS3_1.8.0 stringi_1.4.6 +## [49] yaml_2.2.1 MASS_7.3-51.6 zlibbioc_1.34.0 +## [52] plyr_1.8.6 grid_4.0.2 blob_1.2.1 +## [55] ggrepel_0.8.2 bdsmatrix_1.3-4 crayon_1.3.4 +## [58] lattice_0.20-41 splines_4.0.2 annotate_1.66.0 +## [61] hms_0.5.3 locfit_1.5-9.4 knitr_1.29 +## [64] pillar_1.4.6 geneplotter_1.66.0 XML_3.99-0.5 +## [67] glue_1.4.1 evaluate_0.14 EnhancedVolcano_1.6.0 +## [70] vctrs_0.3.2 gtable_0.3.0 getopt_1.20.3 +## [73] purrr_0.3.4 rematch2_2.1.2 assertthat_0.2.1 +## [76] emdbook_1.3.12 xfun_0.16 xtable_1.8-4 +## [79] coda_0.19-3 survival_3.1-12 tibble_3.0.3 +## [82] AnnotationDbi_1.50.3 memoise_1.1.0 ellipsis_0.3.1

    Blighe K., S. Rana, and M. Lewis, 2020 EnhancedVolcano: Publication-ready volcano plots with enhanced colouring and labeling.

    @@ -2237,7 +2236,7 @@

    6 Session info

    theme: "bootstrap3", context: '.toc-content', hashGenerator: function (text) { - return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_').toLowerCase(); + return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_'); }, ignoreSelector: ".toc-ignore", scrollTo: 0 diff --git a/03-rnaseq/dimension-reduction_rnaseq_01_pca.Rmd b/03-rnaseq/dimension-reduction_rnaseq_01_pca.Rmd index 3f97df3b..3babf6de 100644 --- a/03-rnaseq/dimension-reduction_rnaseq_01_pca.Rmd +++ b/03-rnaseq/dimension-reduction_rnaseq_01_pca.Rmd @@ -11,30 +11,30 @@ output: # Purpose of the analysis -This notebook illustrates one way that you can use RNA-seq data from refine.bio to perform Principal Component Analysis (PCA) and plot the scores using `ggplot2`. +This notebook illustrates one way that you can use RNA-seq data from refine.bio to perform Principal Component Analysis (PCA) and plot the scores using `ggplot2`. ⬇️ [**Jump to the analysis code**](#analysis) ⬇️ # How to run this example For general information about our tutorials and the basic software packages you will need, please see our ['Getting Started' section](https://alexslemonade.github.io/refinebio-examples/01-getting-started/getting-started.html#how-this-tutorial-is-structured). -We recommend taking a look at our [Resources for Learning R](https://alexslemonade.github.io/refinebio-examples/01-getting-started/getting-started.html#resources-for-learning-r) if you have not written code in R before. +We recommend taking a look at our [Resources for Learning R](https://alexslemonade.github.io/refinebio-examples/01-getting-started/getting-started.html#resources-for-learning-r) if you have not written code in R before. ## Obtain the `.Rmd` file of this analysis To run this example yourself, [download the `.Rmd` for this analysis by clicking this link](https://alexslemonade.github.io/refinebio-examples/03-rnaseq/dimension-reduction_rnaseq_01_pca.Rmd). You can open this `.Rmd` file in RStudio and follow the rest of these steps from there. (See our [section about getting started with R notebooks](https://alexslemonade.github.io/refinebio-examples/01-getting-started/getting-started.html#how-to-get-and-use-rmds) if you are unfamiliar with `.Rmd` files.) -Clicking this link will most likely send this to your downloads folder on your computer. +Clicking this link will most likely send this to your downloads folder on your computer. Move this `.Rmd` file to where you would like this example and its files to be stored. ## Set up your analysis folders Good file organization is helpful for keeping your data analysis project on track! -We have set up some code that will automatically set up a folder structure for you. -Run this next chunk to set up your folders! +We have set up some code that will automatically set up a folder structure for you. +Run this next chunk to set up your folders! -If you have trouble running this chunk, see our [introduction to using `.Rmd`s](https://alexslemonade.github.io/refinebio-examples/01-getting-started/getting-started.html#how-to-get-and-use-rmds) for more resources and explanations. +If you have trouble running this chunk, see our [introduction to using `.Rmd`s](https://alexslemonade.github.io/refinebio-examples/01-getting-started/getting-started.html#how-to-get-and-use-rmds) for more resources and explanations. ```{r} # Define the file path to the data directory @@ -85,35 +85,35 @@ Note that this option will only be available for RNA-seq datasets. It may take a few minutes for the dataset to process. -You will get an email when it is ready. +You will get an email when it is ready. ## About the dataset we are using for this example For this example analysis, we will use this [prostate cancer dataset](https://www.refine.bio/experiments/SRP133573). - + The data that we downloaded from refine.bio for this analysis has 175 RNA-seq samples obtained from 20 patients with prostate cancer. Patients underwent androgen deprivation therapy (ADT) and RNA-seq samples include pre-ADT biopsies and post-ADT prostatectomy specimens. ## Place the dataset in your new `data/` folder -Refine.bio will send you a download button in the email when it is ready. -Follow the prompt to download a zip file that has a name with a series of letters and numbers and ends in `.zip`. +Refine.bio will send you a download button in the email when it is ready. +Follow the prompt to download a zip file that has a name with a series of letters and numbers and ends in `.zip`. Double clicking should unzip this for you and create a folder of the same name. - + For more details on the contents of this folder see [these docs on refine.bio](http://docs.refine.bio/en/latest/main_text.html#rna-seq-sample-compendium-download-folder). The `` folder has the data and metadata TSV files you will need for this example analysis. -Experiment accession ids usually look something like `GSE1235` or `SRP12345`. +Experiment accession ids usually look something like `GSE1235` or `SRP12345`. Copy and paste the `SRP133573` folder into your newly created `data/` folder. ## Check out our file structure! -Your new analysis folder should contain: +Your new analysis folder should contain: - The example analysis Rmd you downloaded - A folder called "data" which contains: @@ -122,12 +122,12 @@ Your new analysis folder should contain: - The metadata TSV - A folder for `plots` (currently empty) - A folder for `results` (currently empty) -Your example analysis folder should now look something like this (except with respective experiment accession id and analysis notebook name you are using): +Your example analysis folder should now look something like this (except with respective experiment accession id and analysis notebook name you are using): -In order for our example here to run without a hitch, we need these files to be in these locations so we've constructed a test to check before we get started with the analysis. -Run this chunk to double check that your files are in the right place. +In order for our example here to run without a hitch, we need these files to be in these locations so we've constructed a test to check before we get started with the analysis. +Run this chunk to double check that your files are in the right place. ```{r} # Check if the gene expression matrix file is in the data directory stored as `data_dir` @@ -139,13 +139,13 @@ file.exists(file.path(data_dir, "metadata_SRP133573.tsv")) If the chunk above printed out `FALSE` to either of those tests, you won't be able to run this analysis _as is_ until those files are in the appropriate place. -If the concept of a "file path" is unfamiliar to you; we recommend taking a look at our [section about file paths](https://alexslemonade.github.io/refinebio-examples/01-getting-started/getting-started.html#an-important-note-about-file-paths-and-Rmds). +If the concept of a "file path" is unfamiliar to you; we recommend taking a look at our [section about file paths](https://alexslemonade.github.io/refinebio-examples/01-getting-started/getting-started.html#an-important-note-about-file-paths-and-Rmds). # Using a different refine.bio dataset with this analysis? If you'd like to adapt an example analysis to use a different dataset from [refine.bio](https://www.refine.bio/), we recommend placing the files in the `data/` directory you created and changing the filenames and paths in the notebook to match these files (we've put comments to signify where you would need to change the code). We suggest saving plots to the `plots/` and `results/` directories, respectively, as these are automatically created by the notebook. -From here you can customize this analysis example to fit your own scientific questions and preferences. +From here you can customize this analysis example to fit your own scientific questions and preferences. *** @@ -185,7 +185,7 @@ set.seed(12345) ## Import and set up data -Data downloaded from refine.bio include a metadata tab separated values (TSV) file and a data TSV file. +Data downloaded from refine.bio include a metadata tab separated values (TSV) file and a data TSV file. This chunk of code will read the both TSV files and add them as data frames to your environment. ```{r} @@ -204,7 +204,7 @@ df <- readr::read_tsv(file.path( tibble::column_to_rownames("Gene") ``` -Let's ensure that the metadata and data are in the same sample order. +Let's ensure that the metadata and data are in the same sample order. ```{r} # Make the data in the order of the metadata @@ -245,7 +245,7 @@ metadata <- metadata %>% ## Create a DESeqDataset We will be using the `DESeq2` package for normalization, which requires us to format our data into a `DESeqDataSet` object. -We turn the data frame (or matrix) into a [`DESeqDataSet` object](TODO: Add link to header section) and specify which variable labels our experimental groups using the [`design` argument](http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#multi-factor-designs) [@Love2014]. +We turn the data frame (or matrix) into a [`DESeqDataSet` object](https://alexslemonade.github.io/refinebio-examples/03-rnaseq/00-intro-to-rnaseq.html#02_About_DESeq2). ) and specify which variable labels our experimental groups using the [`design` argument](http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#multi-factor-designs) [@Love2014]. In this chunk of code, we will not provide a specific model to the `design` argument because we are not performing a differential expression analysis. ```{r} @@ -351,8 +351,8 @@ ggsave(file.path(plots_dir, "SRP133573_pca_plot.png"), # Replace with name relev # Print session info -At the end of every analysis, before saving your notebook, we recommend printing out your session info. -This helps make your code more reproducible by recording what versions of softwares and packages you used to run this. +At the end of every analysis, before saving your notebook, we recommend printing out your session info. +This helps make your code more reproducible by recording what versions of softwares and packages you used to run this. ```{r} # Print session info diff --git a/03-rnaseq/dimension-reduction_rnaseq_01_pca.html b/03-rnaseq/dimension-reduction_rnaseq_01_pca.html index f3ae1756..26d40197 100644 --- a/03-rnaseq/dimension-reduction_rnaseq_01_pca.html +++ b/03-rnaseq/dimension-reduction_rnaseq_01_pca.html @@ -1,11 +1,10 @@ - + - @@ -1344,7 +1343,6 @@ } img { max-width:100%; - height: auto; } .tabbed-pane { padding-top: 12px; @@ -1493,6 +1491,7 @@ border: none; display: inline-block; border-radius: 4px; + background-color: transparent; } .tabset-dropdown > .nav-tabs.nav-tabs-open > li { @@ -1521,6 +1520,12 @@ } } +@media print { +.toc-content { + /* see https://github.com/w3c/csswg-drafts/issues/4434 */ + float: right; +} +} .toc-content { padding-left: 30px; @@ -1829,7 +1834,6 @@

    4.1 Install libraries

    ## The following objects are masked from 'package:Biobase':
     ## 
     ##     anyMissing, rowMedians
    -
    ## Loading required package: BiocParallel
    ## 
     ## Attaching package: 'DelayedArray'
    ## The following objects are masked from 'package:matrixStats':
    @@ -1913,7 +1917,7 @@ 

    4.2.2 Prepare metadata for

    4.3 Create a DESeqDataset

    -

    We will be using the DESeq2 package for normalization, which requires us to format our data into a DESeqDataSet object. We turn the data frame (or matrix) into a DESeqDataSet object and specify which variable labels our experimental groups using the design argument (Love et al. 2014). In this chunk of code, we will not provide a specific model to the design argument because we are not performing a differential expression analysis.

    +

    We will be using the DESeq2 package for normalization, which requires us to format our data into a DESeqDataSet object. We turn the data frame (or matrix) into a DESeqDataSet object. ) and specify which variable labels our experimental groups using the design argument (Love et al. 2014). In this chunk of code, we will not provide a specific model to the design argument because we are not performing a differential expression analysis.

    # Create a `DESeqDataSet` object
     dds <- DESeqDataSetFromMatrix(
       countData = df, # This is the data frame with the counts values for all replicates in our dataset
    @@ -1942,13 +1946,13 @@ 

    4.6 Create PCA plot using DESeq2<
    plotPCA(dds_norm,
       intgroup = "refinebio_treatment"
     )
    -

    +

    In this chunk, we are going to add another variable to our plot for annotation.

    Now we’ll plot the PCA using both refinebio_treatment and refinebio_disease variables for labels since they are central to the androgen deprivation therapy (ADT) based hypothesis in the original paper (Sharma et al. 2018).

    plotPCA(dds_norm,
       intgroup = c("refinebio_treatment", "refinebio_disease") # Note that we are able to add another variable to the intgroup argument here by providing a vector of the variable names with  `c()` function
     )
    -

    +

    In the plot above, it is hard to distinguish the different refinebio_treatment values which contain the data on whether or not samples have been treated with ADT versus the refinebio_disease values which refer to the method by which the samples were obtained from patients (i.e. biopsy).

    Let’s use the ggplot2 package functionality to customize our plot further and make the annotation labels better distinguishable.

    First let’s use plotPCA() to receive and store the PCA values for plotting.

    @@ -1995,58 +1999,59 @@

    6 Print session info

    At the end of every analysis, before saving your notebook, we recommend printing out your session info. This helps make your code more reproducible by recording what versions of softwares and packages you used to run this.

    # Print session info
     sessionInfo()
    -
    ## R version 3.6.1 (2019-07-05)
    -## Platform: x86_64-apple-darwin15.6.0 (64-bit)
    -## Running under: macOS Mojave 10.14.6
    +
    ## R version 4.0.2 (2020-06-22)
    +## Platform: x86_64-pc-linux-gnu (64-bit)
    +## Running under: Ubuntu 20.04 LTS
     ## 
     ## Matrix products: default
    -## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
    -## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
    +## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-openmp/libopenblasp-r0.3.8.so
     ## 
     ## locale:
    -## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
    +##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
    +##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
    +##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C             
    +##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
    +##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
    +## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
     ## 
     ## attached base packages:
     ## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
     ## [8] methods   base     
     ## 
     ## other attached packages:
    -##  [1] magrittr_1.5                ggplot2_3.2.1              
    -##  [3] DESeq2_1.26.0               SummarizedExperiment_1.15.6
    -##  [5] DelayedArray_0.11.4         BiocParallel_1.19.2        
    -##  [7] matrixStats_0.54.0          Biobase_2.45.0             
    -##  [9] GenomicRanges_1.37.14       GenomeInfoDb_1.21.1        
    -## [11] IRanges_2.19.10             S4Vectors_0.23.23          
    -## [13] BiocGenerics_0.31.5         optparse_1.6.2             
    +##  [1] magrittr_1.5                ggplot2_3.3.2              
    +##  [3] DESeq2_1.28.1               SummarizedExperiment_1.18.2
    +##  [5] DelayedArray_0.14.1         matrixStats_0.56.0         
    +##  [7] Biobase_2.48.0              GenomicRanges_1.40.0       
    +##  [9] GenomeInfoDb_1.24.2         IRanges_2.22.2             
    +## [11] S4Vectors_0.26.1            BiocGenerics_0.34.0        
    +## [13] optparse_1.6.6             
     ## 
     ## loaded via a namespace (and not attached):
    -##  [1] bitops_1.0-6           bit64_0.9-7            RColorBrewer_1.1-2    
    -##  [4] tools_3.6.1            backports_1.1.7        R6_2.4.1              
    -##  [7] rpart_4.1-15           Hmisc_4.2-0            DBI_1.0.0             
    -## [10] lazyeval_0.2.2         colorspace_1.4-1       nnet_7.3-12           
    -## [13] withr_2.2.0            tidyselect_0.2.5       gridExtra_2.3         
    -## [16] bit_1.1-14             compiler_3.6.1         cli_2.0.2             
    -## [19] htmlTable_1.13.1       labeling_0.3           scales_1.0.0          
    -## [22] checkmate_1.9.4        readr_1.3.1            genefilter_1.67.1     
    -## [25] stringr_1.4.0          digest_0.6.25          foreign_0.8-72        
    -## [28] rmarkdown_1.14         XVector_0.25.0         base64enc_0.1-3       
    -## [31] pkgconfig_2.0.3        htmltools_0.3.6        styler_1.1.1          
    -## [34] htmlwidgets_1.3        rlang_0.4.6            rstudioapi_0.11       
    -## [37] RSQLite_2.1.2          acepack_1.4.1          dplyr_0.8.3           
    -## [40] RCurl_1.95-4.12        GenomeInfoDbData_1.2.1 Formula_1.2-3         
    -## [43] Matrix_1.2-17          Rcpp_1.0.4.6           munsell_0.5.0         
    -## [46] fansi_0.4.1            lifecycle_0.2.0        stringi_1.4.6         
    -## [49] yaml_2.2.1             zlibbioc_1.31.0        grid_3.6.1            
    -## [52] blob_1.2.0             crayon_1.3.4           lattice_0.20-38       
    -## [55] splines_3.6.1          annotate_1.63.0        hms_0.5.3             
    -## [58] locfit_1.5-9.1         knitr_1.28             pillar_1.4.4          
    -## [61] geneplotter_1.63.0     XML_3.98-1.20          glue_1.4.1            
    -## [64] evaluate_0.14          latticeExtra_0.6-28    data.table_1.12.2     
    -## [67] vctrs_0.3.1            gtable_0.3.0           getopt_1.20.3         
    -## [70] purrr_0.3.4            rematch2_2.1.0         assertthat_0.2.1      
    -## [73] xfun_0.14              xtable_1.8-4           survival_2.44-1.1     
    -## [76] tibble_3.0.1           AnnotationDbi_1.47.0   memoise_1.1.0         
    -## [79] cluster_2.1.0          ellipsis_0.3.1
    +## [1] bit64_0.9-7.1 splines_4.0.2 R.utils_2.9.2 +## [4] assertthat_0.2.1 blob_1.2.1 GenomeInfoDbData_1.2.3 +## [7] yaml_2.2.1 pillar_1.4.6 RSQLite_2.2.0 +## [10] backports_1.1.9 lattice_0.20-41 glue_1.4.1 +## [13] digest_0.6.25 RColorBrewer_1.1-2 XVector_0.28.0 +## [16] colorspace_1.4-1 htmltools_0.5.0 Matrix_1.2-18 +## [19] R.oo_1.23.0 XML_3.99-0.5 pkgconfig_2.0.3 +## [22] genefilter_1.70.0 zlibbioc_1.34.0 purrr_0.3.4 +## [25] xtable_1.8-4 scales_1.1.1 getopt_1.20.3 +## [28] BiocParallel_1.22.0 tibble_3.0.3 annotate_1.66.0 +## [31] styler_1.3.2 farver_2.0.3 generics_0.0.2 +## [34] ellipsis_0.3.1 withr_2.2.0 cli_2.0.2 +## [37] survival_3.1-12 crayon_1.3.4 memoise_1.1.0 +## [40] evaluate_0.14 R.methodsS3_1.8.0 fansi_0.4.1 +## [43] R.cache_0.14.0 tools_4.0.2 hms_0.5.3 +## [46] lifecycle_0.2.0 stringr_1.4.0 locfit_1.5-9.4 +## [49] munsell_0.5.0 AnnotationDbi_1.50.3 compiler_4.0.2 +## [52] rlang_0.4.7 grid_4.0.2 RCurl_1.98-1.2 +## [55] rstudioapi_0.11 labeling_0.3 bitops_1.0-6 +## [58] rmarkdown_2.3 gtable_0.3.0 DBI_1.1.0 +## [61] rematch2_2.1.2 R6_2.4.1 dplyr_1.0.0 +## [64] knitr_1.29 bit_1.1-15.2 readr_1.3.1 +## [67] stringi_1.4.6 Rcpp_1.0.5 vctrs_0.3.2 +## [70] geneplotter_1.66.0 tidyselect_1.1.0 xfun_0.16

    Freytag S., 2019 Workshop: Dimension reduction with r

    @@ -2119,7 +2124,7 @@

    6 Print session info

    theme: "bootstrap3", context: '.toc-content', hashGenerator: function (text) { - return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_').toLowerCase(); + return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_'); }, ignoreSelector: ".toc-ignore", scrollTo: 0 diff --git a/03-rnaseq/dimension-reduction_rnaseq_02_umap.Rmd b/03-rnaseq/dimension-reduction_rnaseq_02_umap.Rmd index 0c7699bf..0439dc2d 100644 --- a/03-rnaseq/dimension-reduction_rnaseq_02_umap.Rmd +++ b/03-rnaseq/dimension-reduction_rnaseq_02_umap.Rmd @@ -262,7 +262,7 @@ metadata <- metadata %>% ## Create a DESeqDataset We will be using the `DESeq2` package for normalization, which requires us to format our data into a `DESeqDataSet` object. -We turn the data frame (or matrix) into a [`DESeqDataSet` object](TODO: Add link to header section) and specify which variable labels our experimental groups using the [`design` argument](http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#multi-factor-designs) [@Love2014]. +We turn the data frame (or matrix) into a [`DESeqDataSet` object](https://alexslemonade.github.io/refinebio-examples/03-rnaseq/00-intro-to-rnaseq.html#02_About_DESeq2). ) and specify which variable labels our experimental groups using the [`design` argument](http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#multi-factor-designs) [@Love2014]. In this chunk of code, we will not provide a specific model to the `design` argument because we are not performing a differential expression analysis. ```{r} diff --git a/03-rnaseq/dimension-reduction_rnaseq_02_umap.html b/03-rnaseq/dimension-reduction_rnaseq_02_umap.html index cd43504e..243d455f 100644 --- a/03-rnaseq/dimension-reduction_rnaseq_02_umap.html +++ b/03-rnaseq/dimension-reduction_rnaseq_02_umap.html @@ -1,11 +1,10 @@ - + - @@ -1344,7 +1343,6 @@ } img { max-width:100%; - height: auto; } .tabbed-pane { padding-top: 12px; @@ -1493,6 +1491,7 @@ border: none; display: inline-block; border-radius: 4px; + background-color: transparent; } .tabset-dropdown > .nav-tabs.nav-tabs-open > li { @@ -1521,6 +1520,12 @@ } } +@media print { +.toc-content { + /* see https://github.com/w3c/csswg-drafts/issues/4434 */ + float: right; +} +} .toc-content { padding-left: 30px; @@ -1833,7 +1838,6 @@

    4.1 Install libraries

    ## The following objects are masked from 'package:Biobase':
     ## 
     ##     anyMissing, rowMedians
    -
    ## Loading required package: BiocParallel
    ## 
     ## Attaching package: 'DelayedArray'
    ## The following objects are masked from 'package:matrixStats':
    @@ -1926,7 +1930,7 @@ 

    4.4 Prepare metadata for DE

    4.5 Create a DESeqDataset

    -

    We will be using the DESeq2 package for normalization, which requires us to format our data into a DESeqDataSet object. We turn the data frame (or matrix) into a DESeqDataSet object and specify which variable labels our experimental groups using the design argument (Love et al. 2014). In this chunk of code, we will not provide a specific model to the design argument because we are not performing a differential expression analysis.

    +

    We will be using the DESeq2 package for normalization, which requires us to format our data into a DESeqDataSet object. We turn the data frame (or matrix) into a DESeqDataSet object. ) and specify which variable labels our experimental groups using the design argument (Love et al. 2014). In this chunk of code, we will not provide a specific model to the design argument because we are not performing a differential expression analysis.

    # Create a `DESeqDataSet` object
     dds <- DESeqDataSetFromMatrix(
       countData = df, # This is the data frame with the counts values for all replicates in our dataset
    @@ -1980,7 +1984,7 @@ 

    4.10 Create UMAP plot

    ) ) + geom_point() # This tells R that we want a scatterplot
    -

    +

    Let’s try adding a variable to our plot for annotation.

    In this code chunk, the variable refinebio_treatment is given to the ggplot() function so we can label by androgen deprivation therapy (ADT) status.

    # Plot using `ggplot()` function
    @@ -1993,7 +1997,7 @@ 

    4.10 Create UMAP plot

    ) ) + geom_point() # This tells R that we want a scatterplot
    -

    +

    In the next code chunk, we are going to add another variable to our plot for annotation.

    We’ll plot using both refinebio_treatment and refinebio_disease variables for labels since they are central to the androgen deprivation therapy (ADT) based hypothesis in the original paper (Sharma et al. 2018).

    # Plot using `ggplot()` function
    @@ -2010,7 +2014,7 @@ 

    4.10 Create UMAP plot

    # Display the plot that we saved above final_annotated_umap_plot
    -

    +

    Although it does appear that majority of the pre-ADT and post-ADT appear to cluster together, there are still questions remaining as we look at outliers.

    4.10.1 Interpretation of UMAP plot and results

    @@ -2050,61 +2054,61 @@

    6 Print session info

    At the end of every analysis, before saving your notebook, we recommend printing out your session info. This helps make your code more reproducible by recording what versions of softwares and packages you used to run this.

    # Print session info
     sessionInfo()
    -
    ## R version 3.6.1 (2019-07-05)
    -## Platform: x86_64-apple-darwin15.6.0 (64-bit)
    -## Running under: macOS Mojave 10.14.6
    +
    ## R version 4.0.2 (2020-06-22)
    +## Platform: x86_64-pc-linux-gnu (64-bit)
    +## Running under: Ubuntu 20.04 LTS
     ## 
     ## Matrix products: default
    -## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
    -## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
    +## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-openmp/libopenblasp-r0.3.8.so
     ## 
     ## locale:
    -## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
    +##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
    +##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
    +##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C             
    +##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
    +##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
    +## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
     ## 
     ## attached base packages:
     ## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
     ## [8] methods   base     
     ## 
     ## other attached packages:
    -##  [1] magrittr_1.5                ggplot2_3.2.1              
    -##  [3] umap_0.2.3                  DESeq2_1.26.0              
    -##  [5] SummarizedExperiment_1.15.6 DelayedArray_0.11.4        
    -##  [7] BiocParallel_1.19.2         matrixStats_0.54.0         
    -##  [9] Biobase_2.45.0              GenomicRanges_1.37.14      
    -## [11] GenomeInfoDb_1.21.1         IRanges_2.19.10            
    -## [13] S4Vectors_0.23.23           BiocGenerics_0.31.5        
    -## [15] optparse_1.6.2             
    +##  [1] magrittr_1.5                ggplot2_3.3.2              
    +##  [3] umap_0.2.6.0                DESeq2_1.28.1              
    +##  [5] SummarizedExperiment_1.18.2 DelayedArray_0.14.1        
    +##  [7] matrixStats_0.56.0          Biobase_2.48.0             
    +##  [9] GenomicRanges_1.40.0        GenomeInfoDb_1.24.2        
    +## [11] IRanges_2.22.2              S4Vectors_0.26.1           
    +## [13] BiocGenerics_0.34.0         optparse_1.6.6             
     ## 
     ## loaded via a namespace (and not attached):
    -##  [1] bitops_1.0-6           bit64_0.9-7            RColorBrewer_1.1-2    
    -##  [4] tools_3.6.1            backports_1.1.7        R6_2.4.1              
    -##  [7] rpart_4.1-15           Hmisc_4.2-0            DBI_1.0.0             
    -## [10] lazyeval_0.2.2         colorspace_1.4-1       nnet_7.3-12           
    -## [13] withr_2.2.0            tidyselect_0.2.5       gridExtra_2.3         
    -## [16] bit_1.1-14             compiler_3.6.1         cli_2.0.2             
    -## [19] htmlTable_1.13.1       labeling_0.3           scales_1.0.0          
    -## [22] checkmate_1.9.4        readr_1.3.1            genefilter_1.67.1     
    -## [25] askpass_1.1            stringr_1.4.0          digest_0.6.25         
    -## [28] foreign_0.8-72         rmarkdown_1.14         XVector_0.25.0        
    -## [31] base64enc_0.1-3        pkgconfig_2.0.3        htmltools_0.3.6       
    -## [34] styler_1.1.1           htmlwidgets_1.3        rlang_0.4.6           
    -## [37] rstudioapi_0.11        RSQLite_2.1.2          jsonlite_1.7.0        
    -## [40] acepack_1.4.1          dplyr_0.8.3            RCurl_1.95-4.12       
    -## [43] GenomeInfoDbData_1.2.1 Formula_1.2-3          Matrix_1.2-17         
    -## [46] Rcpp_1.0.4.6           munsell_0.5.0          fansi_0.4.1           
    -## [49] reticulate_1.13        lifecycle_0.2.0        stringi_1.4.6         
    -## [52] yaml_2.2.1             zlibbioc_1.31.0        grid_3.6.1            
    -## [55] blob_1.2.0             crayon_1.3.4           lattice_0.20-38       
    -## [58] splines_3.6.1          annotate_1.63.0        hms_0.5.3             
    -## [61] locfit_1.5-9.1         knitr_1.28             pillar_1.4.4          
    -## [64] geneplotter_1.63.0     XML_3.98-1.20          glue_1.4.1            
    -## [67] evaluate_0.14          latticeExtra_0.6-28    data.table_1.12.2     
    -## [70] vctrs_0.3.1            openssl_1.4.1          gtable_0.3.0          
    -## [73] getopt_1.20.3          purrr_0.3.4            rematch2_2.1.0        
    -## [76] assertthat_0.2.1       xfun_0.14              xtable_1.8-4          
    -## [79] RSpectra_0.15-0        survival_2.44-1.1      tibble_3.0.1          
    -## [82] AnnotationDbi_1.47.0   memoise_1.1.0          cluster_2.1.0         
    -## [85] ellipsis_0.3.1
    +## [1] bitops_1.0-6 bit64_0.9-7.1 RColorBrewer_1.1-2 +## [4] R.cache_0.14.0 tools_4.0.2 backports_1.1.9 +## [7] R6_2.4.1 DBI_1.1.0 colorspace_1.4-1 +## [10] withr_2.2.0 tidyselect_1.1.0 bit_1.1-15.2 +## [13] compiler_4.0.2 cli_2.0.2 labeling_0.3 +## [16] scales_1.1.1 readr_1.3.1 genefilter_1.70.0 +## [19] askpass_1.1 stringr_1.4.0 digest_0.6.25 +## [22] rmarkdown_2.3 R.utils_2.9.2 XVector_0.28.0 +## [25] pkgconfig_2.0.3 htmltools_0.5.0 styler_1.3.2 +## [28] rlang_0.4.7 rstudioapi_0.11 RSQLite_2.2.0 +## [31] farver_2.0.3 generics_0.0.2 jsonlite_1.7.0 +## [34] BiocParallel_1.22.0 dplyr_1.0.0 R.oo_1.23.0 +## [37] RCurl_1.98-1.2 GenomeInfoDbData_1.2.3 Matrix_1.2-18 +## [40] Rcpp_1.0.5 munsell_0.5.0 fansi_0.4.1 +## [43] reticulate_1.16 lifecycle_0.2.0 R.methodsS3_1.8.0 +## [46] stringi_1.4.6 yaml_2.2.1 zlibbioc_1.34.0 +## [49] grid_4.0.2 blob_1.2.1 crayon_1.3.4 +## [52] lattice_0.20-41 splines_4.0.2 annotate_1.66.0 +## [55] hms_0.5.3 locfit_1.5-9.4 knitr_1.29 +## [58] pillar_1.4.6 geneplotter_1.66.0 XML_3.99-0.5 +## [61] glue_1.4.1 evaluate_0.14 vctrs_0.3.2 +## [64] gtable_0.3.0 openssl_1.4.2 getopt_1.20.3 +## [67] purrr_0.3.4 rematch2_2.1.2 assertthat_0.2.1 +## [70] xfun_0.16 xtable_1.8-4 RSpectra_0.16-0 +## [73] survival_3.1-12 tibble_3.0.3 AnnotationDbi_1.50.3 +## [76] memoise_1.1.0 ellipsis_0.3.1

    CCDL, 2020 ScRNA-seq dimension reduction.

    @@ -2186,7 +2190,7 @@

    6 Print session info

    theme: "bootstrap3", context: '.toc-content', hashGenerator: function (text) { - return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_').toLowerCase(); + return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_'); }, ignoreSelector: ".toc-ignore", scrollTo: 0 diff --git a/04-advanced-topics/00-intro-to-advanced-topics.html b/04-advanced-topics/00-intro-to-advanced-topics.html index 4fad9f01..29ea325c 100644 --- a/04-advanced-topics/00-intro-to-advanced-topics.html +++ b/04-advanced-topics/00-intro-to-advanced-topics.html @@ -1,11 +1,10 @@ - + - @@ -1344,7 +1343,6 @@ } img { max-width:100%; - height: auto; } .tabbed-pane { padding-top: 12px; @@ -1493,6 +1491,7 @@ border: none; display: inline-block; border-radius: 4px; + background-color: transparent; } .tabset-dropdown > .nav-tabs.nav-tabs-open > li { @@ -1521,6 +1520,12 @@ } } +@media print { +.toc-content { + /* see https://github.com/w3c/csswg-drafts/issues/4434 */ + float: right; +} +} .toc-content { padding-left: 30px; @@ -1715,7 +1720,7 @@

    CCDL for ALSF

    theme: "bootstrap3", context: '.toc-content', hashGenerator: function (text) { - return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_').toLowerCase(); + return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_'); }, ignoreSelector: ".toc-ignore", scrollTo: 0 diff --git a/components/figures/Love2016-fig1.png b/components/figures/Love2016-fig1.png new file mode 100644 index 00000000..a6e6ece1 Binary files /dev/null and b/components/figures/Love2016-fig1.png differ diff --git a/references.bib b/references.bib index f956d6b1..cab701f7 100644 --- a/references.bib +++ b/references.bib @@ -5,6 +5,14 @@ @Website{annotation-packages url = {https://bioconductor.org/packages/release/BiocViews.html#___AnnotationData}, } +@Website{bias-blog, + title = {RNA-seq fragment sequence bias}, + author = {Michael I. Love}, + month={September}, + year = {2016}, + url = {https://mikelove.wordpress.com/2016/09/26/rna-seq-fragment-sequence-bias/}, +} + @Manual{Blighe2020, title = {EnhancedVolcano: Publication-ready volcano plots with enhanced colouring and labeling}, @@ -56,6 +64,12 @@ @Manual{CCDL2020-cluster-validation url = {https://github.com/AlexsLemonade/training-modules/blob/3dbc6f3f53c680ec6aa2f513851c1cd4635cc31c/machine-learning/02-openpbta_consensus_clustering.Rmd#L310}, } +@Website{count-based, + title = {Count Based RNA-seq analysis}, + Author={{Kasper D. Hansen}}, + url = {https://kasperdanielhansen.github.io/genbioconductor/html/Count_Based_RNAseq.html}, +} + @Website{Freytag2019, title = {Workshop: Dimension reduction with R}, author = {Saskia Freytag}, @@ -91,6 +105,37 @@ @Manual{Gu2020 url = {https://jokergoo.github.io/ComplexHeatmap-reference/book/}, } +@Website{Hadfield2016, + title = {An Introduction to RNA-seq}, + author = {{James Hadfield}}, + month = {July}, + year = {2016}, + url = {https://bitesizebio.com/13542/what-everyone-should-know-about-rna-seq/}, +} + +@Article{Hansen2010, + Author="Hansen, K. D. and Brenner, S. E. and Dudoit, S. ", + Title="{{B}iases in {I}llumina transcriptome sequencing caused by random hexamer priming}", + Journal="Nucleic Acids Res.", + Year="2010", + Volume="38", + Number="12", + Pages="e131", + Month="Jul" +} + +@Website{dge-workshop-count-normalization, + title = {Introduction to DGE - Count Normalization}, + author = {{Harvard Chan Bioinformatics Core (HBC)}}, + url = {https://hbctraining.github.io/DGE_workshop_salmon/lessons/02_DGE_count_normalization.html}, +} + +@Website{dge-workshop-deseq2, + title = {Introduction to DGE - DESeq2 Analysis}, + author = {{Harvard Chan Bioinformatics Core (HBC)}}, + url = {https://hbctraining.github.io/DGE_workshop/lessons/04_DGE_DESeq2_analysis.html}, +} + @Article{Huber2015, Author="Huber, W. and Carey, V. J. and Gentleman, R. and Anders, S. and Carlson, M. and Carvalho, B. S. and Bravo, H. C. and Davis, S. and Gatto, L. and Girke, T. and Gottardo, R. and Hahne, F. and Hansen, K. D. and Irizarry, R. A. and Lawrence, M. and Love, M. I. and MacDonald, J. and Obenchain, V. and Ole?, A. K. and Pag?s, H. and Reyes, A. and Shannon, P. and Smyth, G. K. and Tenenbaum, D. and Waldron, L. and Morgan, M. ", Title="{{O}rchestrating high-throughput genomic analysis with {B}ioconductor}", @@ -127,6 +172,40 @@ @Article{Love2014 doi = {10.1186/s13059-014-0550-8}, } +@Article{Love2014-guide, + title = {Beginner’s guide to using the DESeq2 package}, + author = {Michael I. Love, Simon Anders, and Wolfgang Huber}, + year = {2014}, + url = {https://bioc.ism.ac.jp/packages/2.14/bioc/vignettes/DESeq2/inst/doc/beginner.pdf}, +} + +@Article{Love2016, + Author="Love, M. I. and Hogenesch, J. B. and Irizarry, R. A. ", + Title="{{M}odeling of {R}{N}{A}-seq fragment sequence bias reduces systematic errors in transcript abundance estimation}", + Journal="Nat. Biotechnol.", + Year="2016", + Volume="34", + Number="12", + Pages="1287--1291", + Month="Dec" +} + +@Website{Love2019, + title = {RNA-seq workflow: gene-level exploratory analysis and differential expression}, + author = {Michael I. Love, Simon Anders, Vladislav Kim and Wolfgang Huber}, + month={October}, + year = {2019}, + url = {http://master.bioconductor.org/packages/release/workflows/vignettes/rnaseqGene/inst/doc/rnaseqGene.html#the-variance-stabilizing-transformation-and-the-rlog}, +} + +@Website{Love2020, + title = {Analyzing RNA-seq data with DESeq2}, + author = {{Michael I. Love, Simon Anders, and Wolfgang Huber}}, + month = {May}, + year = {2020}, + url = {https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html}, +} + @Article{Nguyen2019, title = {Ten quick tips for effective dimensionality reduction}, author = {Lan Huong Nguyen and Susan Holmes}, @@ -144,6 +223,17 @@ @Website{pca-visually-explained url = {https://setosa.io/ev/principal-component-analysis/}, } +@Article{Pepke2009, + Author="Pepke, S. and Wold, B. and Mortazavi, A. ", + Title="{{C}omputation for {C}h{I}{P}-seq and {R}{N}{A}-seq studies}", + Journal="Nat. Methods", + Year="2009", + Volume="6", + Number="11 Suppl", + Pages="22--32", + Month="Nov" +} + @Manual{Prabhakaran2016, title = {The Complete ggplot2 Tutorial}, author = {Selva Prabhakaran}, @@ -298,6 +388,40 @@ @Article{Slowikowski2017 url = {https://slowkow.com/notes/pheatmap-tutorial/}, } +@Article{Soneson2015, + title = {Differential analyses for RNA-seq: transcript-level estimates improve gene-level inferences}, + author = {Charlotte Soneson and Michael I. Love and Mark D. Robinson}, + year = {2015}, + journal = {F1000Research}, + doi = {10.12688/f1000research.7563.1}, + volume = {4}, + issue = {1521}, + } + +@Website{Starmer2015, + title = {RPKM, FPKM and TPM, Clearly Explained!!!}, + author = {{Josh Starmer}}, + month = {July}, + year = {2015}, + url = {https://www.youtube.com/watch?v=TTUrtCY2k-w}, +} + +@Website{Starmer2017-deseq2, + title = {StatQuest: DESeq2, part 1, Library Normalization}, + author = {{Josh Starmer}}, + month = {March}, + year = {2017}, + url = {https://www.youtube.com/watch?v=UFB993xufUU}, +} + +@Website{Starmer2017-rnaseq, + title = {StatQuest: A gentle introduction to RNA-seq}, + author = {{Josh Starmer}}, + month = {August}, + year = {2017}, + url = {https://www.youtube.com/watch?v=tlf6wYJrwKY}, +} + @Article{Tregnago2016, Author="Tregnago, C. and Manara, E. and Zampini, M. and Bisio, V. and Borga, C. and Bresolin, S. and Aveic, S. and Germano, G. and Basso, G. and Pigazzi, M. ", Title="{{C}{R}{E}{B} engages {C}/{E}{B}{P}δ to initiate leukemogenesis}", @@ -335,6 +459,45 @@ @Manual{Wickham2020 url = {https://CRAN.R-project.org/package=devtools}, } +@Article{Zhang2017, + Author="Zhang, C. and Zhang, B. and Lin, L. L. and Zhao, S. ", + Title="{{E}valuation and comparison of computational tools for {R}{N}{A}-seq isoform quantification}", + Journal="BMC Genomics", + Year="2017", + Volume="18", + Number="1", + Pages="583", + Month="08" +} + +@article{Zhong2009, + Author = {Wang, Zhong and Gerstein, Mark and Snyder, Michael}, + Date = {2009/01/}, + Date-Added = {2020-09-17 13:30:50 +0000}, + Date-Modified = {2020-09-17 13:30:50 +0000}, + Db = {PubMed}, + Doi = {10.1038/nrg2484}, + Isbn = {1471-0064; 1471-0056}, + J2 = {Nat Rev Genet}, + Journal = {Nature reviews. Genetics}, + Keywords = {Animals; Base Sequence; Chromosome Mapping; Exons; Gene Expression Profiling/*methods; Humans; Models, Genetic; Molecular Sequence Data; RNA/*analysis; Sequence Analysis, RNA/*methods; *Transcription, Genetic}, + L2 = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2949280/}, + La = {eng}, + Month = {01}, + Number = {1}, + Pages = {57--63}, + Title = {RNA-Seq: a revolutionary tool for transcriptomics}, + Ty = {JOUR}, + U1 = {19015660{$[$}pmid{$]$}}, + U2 = {PMC2949280{$[$}pmcid{$]$}}, + U4 = {nrg2484{$[$}PII{$]$}}, + Url = {https://pubmed.ncbi.nlm.nih.gov/19015660}, + Volume = {10}, + Year = {2009}, + Bdsk-Url-1 = {https://pubmed.ncbi.nlm.nih.gov/19015660}, + Bdsk-Url-2 = {https://doi.org/10.1038/nrg2484}} +} + @Article{Zhu2018, title = {Heavy-tailed prior distributions for sequence count data: removing the noise and preserving large differences}, author = {Anqi Zhu and Joseph G. Ibrahim and Michael I. Love},