1. Environment setting

library(ggplot2)
library(bsseq)
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
library(scales)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:bsseq':
## 
##     combine
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following object is masked from 'package:BiocGenerics':
## 
##     combine
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following object is masked from 'package:bsseq':
## 
##     combine
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following object is masked from 'package:matrixStats':
## 
##     count
## The following objects are masked from 'package:GenomicRanges':
## 
##     intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
## 
##     intersect
## The following objects are masked from 'package:IRanges':
## 
##     collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
## 
##     first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(dmrseq)
library(annotatr)
library(sechm)
library(rtracklayer)
source("../Methylation_helper.R")
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:S4Vectors':
## 
##     expand

OutputFolder <- params$OutputFolder

2. Loading data

2.1 BSSeq object

Loading of bsseq object with CpGs shared by 75% of the samples.

bsseq_obj <- readRDS("~/DataDir/1.PreliminaryAnalysis/Output/WithSelectedEGCLCs/bsseq_obj_sharedby75ofall.rds") #getting bsseq object where sample selection and CpG filtering were already performed

2.3 Escapees regions

These escapees regions were filtered from the regions found in Tang et al. 2015. In particular, those covered by at least 10 CpGs in all (Tang et al. 2015) human primordial germ cell (hPGC) samples and with methylation levels above 30% in (Tang et al. 2015) week 5-9 hPGC samples were selected.

EscapeesRegions <- readxl::read_xlsx("~/DataDir/7.EscapeesExploration/Output/EscapeesFiltering/FirstFilteringRegions.xlsx")

EscapeesRegions_GRanges_hg19 <- makeGRangesFromDataFrame(EscapeesRegions, seqnames.field = "Chr", start.field = "Start", end.field = "End", keep.extra.columns = FALSE, na.rm=TRUE)

We have 9856 escapees regions with a total of 735308 CpGs.

These regions are hg19 and need to be converted to hg38 to be compared with our data

path = "~/DataDir/7.EscapeesExploration/hg19ToHg38.over.chain"
ch = import.chain(path)
str(ch[[1]])
## Formal class 'ChainBlock' [package "rtracklayer"] with 6 slots
##   ..@ ranges  :Formal class 'IRanges' [package "IRanges"] with 6 slots
##   .. .. ..@ start          : int [1:1239] 16847851 16867628 16867672 16867743 16867854 16876217 16876254 16876567 16876586 16876682 ...
##   .. .. ..@ width          : int [1:1239] 19737 43 47 96 8362 36 312 18 95 33 ...
##   .. .. ..@ NAMES          : NULL
##   .. .. ..@ elementType    : chr "ANY"
##   .. .. ..@ elementMetadata: NULL
##   .. .. ..@ metadata       : list()
##   ..@ offset  : int [1:1239] 480662 480702 480702 480726 480726 480726 480726 480726 480726 480726 ...
##   ..@ score   : int [1:16] -1063871317 61356972 1903041 1027915 538227 6814 4748 4372 4314 3439 ...
##   ..@ space   : chr [1:16] "chr22" "chr22" "chr22" "chr22" ...
##   ..@ reversed: logi [1:16] FALSE TRUE FALSE FALSE TRUE FALSE ...
##   ..@ length  : int [1:16] 1129 18 2 26 50 2 1 1 2 1 ...

if ("UCSC" %in% seqlevelsStyle(EscapeesRegions_GRanges_hg19)) {
  EscapeesRegions_GRanges_hg38 = liftOver(EscapeesRegions_GRanges_hg19, ch)
  EscapeesRegions_GRanges_hg38 = unlist(EscapeesRegions_GRanges_hg38)  
}else{stop("The seqlevels style should be UCSC!!!")}

length(EscapeesRegions_GRanges_hg38)
## [1] 10191

After the conversion we have 10191 escapees regions.

EscapeesRegions_GRanges <- EscapeesRegions_GRanges_hg38

3. Generation of BSSeq oject with escapee CpGs and non-escapee CpGs

bsseq_ranges_df <- granges(bsseq_obj) %>% as.data.frame() #granges has 0-based position they seem 1-based. Look here for explanation: https://www.biostars.org/p/84686/
bsseq_ranges_df$end <- bsseq_ranges_df$start + 1 #to make them 0-based
bsseq_granges <- makeGRangesFromDataFrame(bsseq_ranges_df, starts.in.df.are.0based = TRUE)

bsseq_obj_1based <- bsseq_obj
bsseq_obj_1based@rowRanges <- bsseq_granges
granges(bsseq_obj_1based)
## GRanges object with 3680616 ranges and 0 metadata columns:
##             seqnames    ranges strand
##                <Rle> <IRanges>  <Rle>
##         [1]     chr1     10469      *
##         [2]     chr1     10471      *
##         [3]     chr1     10484      *
##         [4]     chr1     10489      *
##         [5]     chr1     10493      *
##         ...      ...       ...    ...
##   [3680612]     chrM     11647      *
##   [3680613]     chrM     11689      *
##   [3680614]     chrM     11692      *
##   [3680615]     chrM     11710      *
##   [3680616]     chrM     11716      *
##   -------
##   seqinfo: 25 sequences from an unspecified genome; no seqlengths

This to obtain a bsseq object with only CpGs contained in the escapees regions

m <- findOverlaps(bsseq_granges, EscapeesRegions_GRanges)

bsseq_obj_escapees <- bsseq_obj_1based[queryHits(m),]

With our targeted kit we are able to cover 9113 CpGs belonging to all the escapees regions (1.24%).

Let’s create also a bsseq object with non-escapee CpGs (all the other CpGs that are not in escapees regions)

bsseq_obj_NonEscapees <- bsseq_obj_1based[-queryHits(m),]

4. Violin plot showing methylation levels of escapee CpGs

assays(bsseq_obj_escapees)$Meth <- getMeth(bsseq_obj_escapees, type = "raw")
assays(bsseq_obj_NonEscapees)$Meth <- getMeth(bsseq_obj_NonEscapees, type = "raw")

sample_colors <- c("hiPSC_rep1" = "#dd1c77", "hiPSC_rep2" = "#dd1c77", "hiPSC_rep3" = "#dd1c77", "hiPSC_rep4" = "#dd1c77",
                   "iMeLC_rep1" = "#377eb8", "iMeLC_rep2" = "#377eb8", "iMeLC_rep3" = "#377eb8", "iMeLC_rep4" = "#377eb8",
                   "hPGCLC_rep1" = "#4daf4a", "hPGCLC_rep2" = "#4daf4a", "hPGCLC_rep3" = "#4daf4a",
                   "hEGCLC_rep1" = "#ff7f00", "hEGCLC_rep2" = "#ff7f00", "hEGCLC_rep3" = "#ff7f00")

violin_plot(assays(bsseq_obj_escapees)$Meth, stat="median") + ggplot2::scale_fill_manual(values = sample_colors)
## No id variables; using all as measure variables

5. Saving and Session Info

Saving:

DataFrame of Escapee regions after filtering and conversion to hg38 (excel format)
bsseq object subset with only CpGs in escapees regions where hPGCs have meth > 0.3
bsseq object subset with non-escapee CpGs (all the other CpGs that are not in escapees regions)

openxlsx::write.xlsx(as.data.frame(EscapeesRegions_GRanges) %>% rename(chr = seqnames), file = paste0(OutputFolder, "EscapeeRegions_hg38.xlsx")) 
saveRDS(bsseq_obj_escapees, paste0(OutputFolder, "bsseq_obj_escapees.rds"))
saveRDS(bsseq_obj_NonEscapees, paste0(OutputFolder, "bsseq_obj_NonEscapees.rds"))

date()
## [1] "Tue Jan 28 15:54:30 2025"
sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] tidyr_1.3.0                 rtracklayer_1.58.0         
##  [3] sechm_1.6.0                 annotatr_1.24.0            
##  [5] dmrseq_1.18.1               dplyr_1.1.2                
##  [7] gridExtra_2.3               scales_1.2.1               
##  [9] bsseq_1.34.0                SummarizedExperiment_1.28.0
## [11] Biobase_2.58.0              MatrixGenerics_1.10.0      
## [13] matrixStats_1.0.0           GenomicRanges_1.50.2       
## [15] GenomeInfoDb_1.34.9         IRanges_2.32.0             
## [17] S4Vectors_0.36.2            BiocGenerics_0.44.0        
## [19] ggplot2_3.4.2              
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.4.3                  circlize_0.4.15              
##   [3] AnnotationHub_3.6.0           BiocFileCache_2.6.1          
##   [5] plyr_1.8.8                    splines_4.2.1                
##   [7] BiocParallel_1.32.6           digest_0.6.33                
##   [9] ca_0.71.1                     foreach_1.5.2                
##  [11] htmltools_0.5.5               fansi_1.0.4                  
##  [13] magrittr_2.0.3                memoise_2.0.1                
##  [15] BSgenome_1.66.3               cluster_2.1.4                
##  [17] doParallel_1.0.17             openxlsx_4.2.5.2             
##  [19] tzdb_0.4.0                    limma_3.54.2                 
##  [21] ComplexHeatmap_2.14.0         Biostrings_2.66.0            
##  [23] readr_2.1.4                   R.utils_2.12.2               
##  [25] prettyunits_1.1.1             colorspace_2.1-0             
##  [27] blob_1.2.4                    rappdirs_0.3.3               
##  [29] xfun_0.39                     crayon_1.5.2                 
##  [31] RCurl_1.98-1.12               jsonlite_1.8.7               
##  [33] iterators_1.0.14              glue_1.6.2                   
##  [35] registry_0.5-1                gtable_0.3.3                 
##  [37] zlibbioc_1.44.0               XVector_0.38.0               
##  [39] V8_4.3.0                      GetoptLong_1.0.5             
##  [41] DelayedArray_0.24.0           Rhdf5lib_1.20.0              
##  [43] shape_1.4.6                   HDF5Array_1.26.0             
##  [45] DBI_1.1.3                     rngtools_1.5.2               
##  [47] randomcoloR_1.1.0.1           Rcpp_1.0.11                  
##  [49] xtable_1.8-4                  progress_1.2.2               
##  [51] clue_0.3-64                   bumphunter_1.40.0            
##  [53] bit_4.0.5                     httr_1.4.6                   
##  [55] RColorBrewer_1.1-3            ellipsis_0.3.2               
##  [57] farver_2.1.1                  pkgconfig_2.0.3              
##  [59] XML_3.99-0.14                 R.methodsS3_1.8.2            
##  [61] sass_0.4.7                    dbplyr_2.3.3                 
##  [63] locfit_1.5-9.7                utf8_1.2.3                   
##  [65] labeling_0.4.2                tidyselect_1.2.0             
##  [67] rlang_1.1.1                   reshape2_1.4.4               
##  [69] later_1.3.1                   AnnotationDbi_1.60.2         
##  [71] cellranger_1.1.0              munsell_0.5.0                
##  [73] BiocVersion_3.16.0            tools_4.2.1                  
##  [75] cachem_1.0.8                  cli_3.6.1                    
##  [77] generics_0.1.3                RSQLite_2.3.1                
##  [79] evaluate_0.21                 stringr_1.5.0                
##  [81] fastmap_1.1.1                 yaml_2.3.7                   
##  [83] outliers_0.15                 knitr_1.43                   
##  [85] bit64_4.0.5                   zip_2.3.0                    
##  [87] purrr_1.0.1                   KEGGREST_1.38.0              
##  [89] nlme_3.1-162                  doRNG_1.8.6                  
##  [91] sparseMatrixStats_1.10.0      mime_0.12                    
##  [93] R.oo_1.25.0                   xml2_1.3.5                   
##  [95] biomaRt_2.54.1                compiler_4.2.1               
##  [97] rstudioapi_0.15.0             filelock_1.0.2               
##  [99] curl_5.0.1                    png_0.1-8                    
## [101] interactiveDisplayBase_1.36.0 tibble_3.2.1                 
## [103] bslib_0.5.0                   stringi_1.7.12               
## [105] highr_0.10                    GenomicFeatures_1.50.4       
## [107] lattice_0.21-8                Matrix_1.6-0                 
## [109] permute_0.9-7                 vctrs_0.6.3                  
## [111] pillar_1.9.0                  lifecycle_1.0.3              
## [113] rhdf5filters_1.10.1           BiocManager_1.30.20          
## [115] GlobalOptions_0.1.2           jquerylib_0.1.4              
## [117] data.table_1.14.8             bitops_1.0-7                 
## [119] seriation_1.4.2               httpuv_1.6.11                
## [121] R6_2.5.1                      BiocIO_1.8.0                 
## [123] TSP_1.2-4                     promises_1.2.0.1             
## [125] codetools_0.2-19              gtools_3.9.4                 
## [127] rhdf5_2.42.1                  rjson_0.2.21                 
## [129] withr_2.5.0                   regioneR_1.30.0              
## [131] GenomicAlignments_1.34.1      Rsamtools_2.14.0             
## [133] GenomeInfoDbData_1.2.9        parallel_4.2.1               
## [135] hms_1.1.3                     grid_4.2.1                   
## [137] rmarkdown_2.23                DelayedMatrixStats_1.20.0    
## [139] Rtsne_0.16                    shiny_1.7.4.1                
## [141] restfulr_0.0.15

Generation of bsseq object with escapee CpGs (filtered with meth in hPGC > 30%)

Gaja Matassa

Date: January 28, 2025