1. Environment setting

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(scales)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(bsseq)
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
library(sechm)
    
source("../Methylation_helper.R")
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:S4Vectors':
## 
##     expand
InputFolder <- params$InputFolder

2. Loading data

Loading bsseq object

bsseq_obj_escapees <- readRDS(paste0(InputFolder, "bsseq_obj_escapees.rds"))

Loading annotated escapees

AnnotatedEscapees <- readRDS("~/DataDir/7.EscapeesExploration/Output/EscapeesAnnotationAndFunctional/AnnotatedEscapees.rds")

3. Heatmap

I keep escapee CpGs covered by ALL samples

bsseq_obj_escapees_filt <- methylSig::filter_loci_by_group_coverage(bs = bsseq_obj_escapees, group_column = "Line", min_samples_per_group = c("CTL08A" = dim(bsseq_obj_escapees)[2]))

Now we have 8388 CpGs.

Add annotation made in script 2.EscapeeAnnotationAndFunctional.Rmd

AnnotatedEscapees_gr <- makeGRangesFromDataFrame(AnnotatedEscapees, keep.extra.columns = TRUE)
m <- findOverlaps(rowRanges(bsseq_obj_escapees_filt), AnnotatedEscapees_gr)

bsseq_obj_subset <- bsseq_obj_escapees_filt[queryHits(m),]

rowData(bsseq_obj_subset)$ensembl_gene_id <- AnnotatedEscapees_gr[subjectHits(m), ]$ensembl_gene_id

Let’s show the average methylation levels. Instead of plotting for each row an escapee CpGs, I aggregate CpGs by ensembl gene id averaging the methylation levels.

AvgMeth <- aggregate(assays(bsseq_obj_subset)$Meth, list(rowData(bsseq_obj_subset)$ensembl_gene_id), mean, na.rm = T)
rownames(AvgMeth) <- AvgMeth$Group.1
AvgMeth <- AvgMeth[, -1]

rowDataSE <- rowData(bsseq_obj_subset)
rownames(rowDataSE) <- rowData(bsseq_obj_subset)$ensembl_gene_id
rowDataSE <- rowDataSE[rownames(AvgMeth), ]

SampleAnno <- colData(bsseq_obj_subset)

SE <- SummarizedExperiment(assays=AvgMeth, colData=SampleAnno, rowData = rowDataSE)

identical(colnames(assay(SE)), rownames(colData(SE))) #TRUE
## [1] TRUE

cellTypes_colors <- c(hiPSCs ="#dd1c77", iMeLCs ="#377eb8", hPGCLCs ="#4daf4a", hEGCLCs = "#ff7f00")
colData(SE)$Type <- factor(colData(SE)$Type, levels = c("hiPSCs", "iMeLCs" , "hPGCLCs", "hEGCLCs"))

metadata(SE)$anno_colors <- list(Type = cellTypes_colors,
                                               GenomicRegion = c ("3' UTR" = "#FFADAD", "5' UTR" = "#FFD6A5", "Distal Intergenic" = "#FDFFB6", "Downstream (<=300bp)" = "#FFFFFC", "Exon" = "#9BF6FF", "Intron" = "#A0C4FF", "Promoter (<=1kb)" = "#BDB2FF", "Promoter (1-2kb)" = "#FFC6FF", "Promoter (2-3kb)" = "#CAFFBF"))
break_coord <- assay(SE) %>% quantile(seq(0,1, by=0.1), na.rm =TRUE)
ExpCols <- viridisLite::plasma(length(seq(head(break_coord,1), tail(break_coord,1), by=0.2)))
sechm(SE, features = rownames(SE), hmcols=ExpCols, breaks=seq(head(break_coord,1), tail(break_coord,1), by=0.2), show_rownames=FALSE, show_colnames=TRUE, do.scale=FALSE, column_title = "Average methylation levels (by gene) of CpGs in escapee regions", top_annotation = c("Type"), row_title_rot = 0, cluster_rows = TRUE)

Let’s change the range of methylation

break_coord <- assay(SE) %>% quantile(seq(0,1, by=0.1), na.rm =TRUE)
ExpCols <- viridisLite::plasma(length(seq(break_coord[2], tail(break_coord,1), by=0.05)))
sechm(SE, features = rownames(SE), hmcols=ExpCols, breaks=seq(break_coord[2], tail(break_coord,1), by=0.05), show_rownames=FALSE, show_colnames=TRUE, do.scale=FALSE, column_title = "Average methylation levels (by gene) of CpGs in escapee regions", top_annotation = c("Type"), row_title_rot = 0, cluster_rows = TRUE)

Let’s now show the median methylation levels. Instead of plotting for each row an escapee CpGs, I aggregate CpGs by ensembl gene id doing the median of the methylation levels.

MedianMeth <- aggregate(assays(bsseq_obj_subset)$Meth, list(rowData(bsseq_obj_subset)$ensembl_gene_id), median, na.rm = T)
rownames(MedianMeth) <- MedianMeth$Group.1
MedianMeth <- MedianMeth[, -1]

rowDataSE <- rowData(bsseq_obj_subset)
rownames(rowDataSE) <- rowData(bsseq_obj_subset)$ensembl_gene_id
rowDataSE <- rowDataSE[rownames(MedianMeth), ]

SampleAnno <- colData(bsseq_obj_subset)

SE <- SummarizedExperiment(assays=MedianMeth, colData=SampleAnno, rowData = rowDataSE)

identical(colnames(assay(SE)), rownames(colData(SE))) #TRUE
## [1] TRUE

cellTypes_colors <- c(hiPSCs ="#dd1c77", iMeLCs ="#377eb8", hPGCLCs ="#4daf4a", hEGCLCs = "#ff7f00")
colData(SE)$Type <- factor(colData(SE)$Type, levels = c("hiPSCs", "iMeLCs" , "hPGCLCs", "hEGCLCs"))

metadata(SE)$anno_colors <- list(Type = cellTypes_colors,
                                               GenomicRegion = c ("3' UTR" = "#FFADAD", "5' UTR" = "#FFD6A5", "Distal Intergenic" = "#FDFFB6", "Downstream (<=300bp)" = "#FFFFFC", "Exon" = "#9BF6FF", "Intron" = "#A0C4FF", "Promoter (<=1kb)" = "#BDB2FF", "Promoter (1-2kb)" = "#FFC6FF", "Promoter (2-3kb)" = "#CAFFBF"))
break_coord <- assay(SE) %>% quantile(seq(0,1, by=0.1), na.rm =TRUE)
ExpCols <- viridisLite::plasma(length(seq(head(break_coord,1), tail(break_coord,1), by=0.2)))
sechm(SE, features = rownames(SE), hmcols=ExpCols, breaks=seq(head(break_coord,1), tail(break_coord,1), by=0.2), show_rownames=FALSE, show_colnames=TRUE, do.scale=FALSE, column_title = "Median methylation levels (by gene) of CpGs in escapee regions", top_annotation = c("Type"), row_title_rot = 0, cluster_rows = TRUE)

Let’s change the range of methylation (1st-10th deciles of methylation)

break_coord <- assay(SE) %>% quantile(seq(0,1, by=0.1), na.rm =TRUE)
ExpCols <- viridisLite::plasma(length(seq(break_coord[2], tail(break_coord,1), by=0.05)))
sechm(SE, features = rownames(SE), hmcols=ExpCols, breaks=seq(break_coord[2], tail(break_coord,1), by=0.05), show_rownames=FALSE, show_colnames=TRUE, do.scale=FALSE, column_title = "Median methylation levels (by gene) of CpGs in escapee regions", top_annotation = c("Type"), row_title_rot = 0, cluster_rows = TRUE)

4. Session Info

date()
## [1] "Tue Jan 28 16:36:07 2025"
sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] tidyr_1.3.0                 sechm_1.6.0                
##  [3] bsseq_1.34.0                SummarizedExperiment_1.28.0
##  [5] Biobase_2.58.0              MatrixGenerics_1.10.0      
##  [7] matrixStats_1.0.0           GenomicRanges_1.50.2       
##  [9] GenomeInfoDb_1.34.9         IRanges_2.32.0             
## [11] S4Vectors_0.36.2            BiocGenerics_0.44.0        
## [13] gridExtra_2.3               scales_1.2.1               
## [15] ggplot2_3.4.2               dplyr_1.1.2                
## 
## loaded via a namespace (and not attached):
##   [1] DSS_2.46.0                bitops_1.0-7             
##   [3] doParallel_1.0.17         RColorBrewer_1.1-3       
##   [5] tools_4.2.1               bslib_0.5.0              
##   [7] utf8_1.2.3                R6_2.5.1                 
##   [9] HDF5Array_1.26.0          colorspace_2.1-0         
##  [11] permute_0.9-7             rhdf5filters_1.10.1      
##  [13] GetoptLong_1.0.5          withr_2.5.0              
##  [15] tidyselect_1.2.0          curl_5.0.1               
##  [17] compiler_4.2.1            cli_3.6.1                
##  [19] Cairo_1.6-0               methylSig_1.10.0         
##  [21] TSP_1.2-4                 DelayedArray_0.24.0      
##  [23] rtracklayer_1.58.0        sass_0.4.7               
##  [25] stringr_1.5.0             digest_0.6.33            
##  [27] Rsamtools_2.14.0          rmarkdown_2.23           
##  [29] R.utils_2.12.2            ca_0.71.1                
##  [31] XVector_0.38.0            pkgconfig_2.0.3          
##  [33] htmltools_0.5.5           sparseMatrixStats_1.10.0 
##  [35] highr_0.10                fastmap_1.1.1            
##  [37] limma_3.54.2              BSgenome_1.66.3          
##  [39] rlang_1.1.1               GlobalOptions_0.1.2      
##  [41] rstudioapi_0.15.0         DelayedMatrixStats_1.20.0
##  [43] shape_1.4.6               jquerylib_0.1.4          
##  [45] BiocIO_1.8.0              generics_0.1.3           
##  [47] jsonlite_1.8.7            BiocParallel_1.32.6      
##  [49] gtools_3.9.4              R.oo_1.25.0              
##  [51] RCurl_1.98-1.12           magrittr_2.0.3           
##  [53] GenomeInfoDbData_1.2.9    Matrix_1.6-0             
##  [55] Rcpp_1.0.11               munsell_0.5.0            
##  [57] Rhdf5lib_1.20.0           fansi_1.0.4              
##  [59] lifecycle_1.0.3           R.methodsS3_1.8.2        
##  [61] stringi_1.7.12            yaml_2.3.7               
##  [63] zlibbioc_1.44.0           rhdf5_2.42.1             
##  [65] Rtsne_0.16                grid_4.2.1               
##  [67] parallel_4.2.1            randomcoloR_1.1.0.1      
##  [69] crayon_1.5.2              lattice_0.21-8           
##  [71] splines_4.2.1             Biostrings_2.66.0        
##  [73] circlize_0.4.15           locfit_1.5-9.7           
##  [75] knitr_1.43                ComplexHeatmap_2.14.0    
##  [77] pillar_1.9.0              rjson_0.2.21             
##  [79] codetools_0.2-19          XML_3.99-0.14            
##  [81] glue_1.6.2                evaluate_0.21            
##  [83] V8_4.3.0                  data.table_1.14.8        
##  [85] vctrs_0.6.3               png_0.1-8                
##  [87] foreach_1.5.2             purrr_1.0.1              
##  [89] gtable_0.3.3              clue_0.3-64              
##  [91] cachem_1.0.8              xfun_0.39                
##  [93] restfulr_0.0.15           viridisLite_0.4.2        
##  [95] seriation_1.4.2           tibble_3.2.1             
##  [97] iterators_1.0.14          registry_0.5-1           
##  [99] GenomicAlignments_1.34.1  cluster_2.1.4