library(ggplot2)
library(bsseq)
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
library(scales)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:bsseq':
##
## combine
## The following object is masked from 'package:Biobase':
##
## combine
## The following object is masked from 'package:BiocGenerics':
##
## combine
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:bsseq':
##
## combine
## The following object is masked from 'package:Biobase':
##
## combine
## The following object is masked from 'package:matrixStats':
##
## count
## The following objects are masked from 'package:GenomicRanges':
##
## intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
##
## intersect
## The following objects are masked from 'package:IRanges':
##
## collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
##
## first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(dmrseq)
library(annotatr)
library(sechm)
library(rtracklayer)
source("../Methylation_helper.R")
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:S4Vectors':
##
## expand
<- params$OutputFolder OutputFolder
Loading of bsseq object with CpGs shared by 75% of the samples.
<- readRDS("~/DataDir/1.PreliminaryAnalysis/Output/WithSelectedEGCLCs/bsseq_obj_sharedby75ofall.rds") #getting bsseq object where sample selection and CpG filtering were already performed bsseq_obj
These escapees regions were filtered from the regions found in Tang et al. 2015. In particular, those covered by at least 10 CpGs in all (Tang et al. 2015) human primordial germ cell (hPGC) samples and with methylation levels above 30% in (Tang et al. 2015) week 5-9 hPGC samples were selected.
<- readxl::read_xlsx("~/DataDir/7.EscapeesExploration/Output/EscapeesFiltering/FirstFilteringRegions.xlsx") EscapeesRegions
<- makeGRangesFromDataFrame(EscapeesRegions, seqnames.field = "Chr", start.field = "Start", end.field = "End", keep.extra.columns = FALSE, na.rm=TRUE) EscapeesRegions_GRanges_hg19
We have 9856 escapees regions with a total of 735308 CpGs.
These regions are hg19 and need to be converted to hg38 to be compared with our data
= "~/DataDir/7.EscapeesExploration/hg19ToHg38.over.chain"
path = import.chain(path)
ch str(ch[[1]])
## Formal class 'ChainBlock' [package "rtracklayer"] with 6 slots
## ..@ ranges :Formal class 'IRanges' [package "IRanges"] with 6 slots
## .. .. ..@ start : int [1:1239] 16847851 16867628 16867672 16867743 16867854 16876217 16876254 16876567 16876586 16876682 ...
## .. .. ..@ width : int [1:1239] 19737 43 47 96 8362 36 312 18 95 33 ...
## .. .. ..@ NAMES : NULL
## .. .. ..@ elementType : chr "ANY"
## .. .. ..@ elementMetadata: NULL
## .. .. ..@ metadata : list()
## ..@ offset : int [1:1239] 480662 480702 480702 480726 480726 480726 480726 480726 480726 480726 ...
## ..@ score : int [1:16] -1063871317 61356972 1903041 1027915 538227 6814 4748 4372 4314 3439 ...
## ..@ space : chr [1:16] "chr22" "chr22" "chr22" "chr22" ...
## ..@ reversed: logi [1:16] FALSE TRUE FALSE FALSE TRUE FALSE ...
## ..@ length : int [1:16] 1129 18 2 26 50 2 1 1 2 1 ...
if ("UCSC" %in% seqlevelsStyle(EscapeesRegions_GRanges_hg19)) {
= liftOver(EscapeesRegions_GRanges_hg19, ch)
EscapeesRegions_GRanges_hg38 = unlist(EscapeesRegions_GRanges_hg38)
EscapeesRegions_GRanges_hg38 else{stop("The seqlevels style should be UCSC!!!")} }
length(EscapeesRegions_GRanges_hg38)
## [1] 10191
After the conversion we have 10191 escapees regions.
<- EscapeesRegions_GRanges_hg38 EscapeesRegions_GRanges
<- granges(bsseq_obj) %>% as.data.frame() #granges has 0-based position they seem 1-based. Look here for explanation: https://www.biostars.org/p/84686/
bsseq_ranges_df $end <- bsseq_ranges_df$start + 1 #to make them 0-based
bsseq_ranges_df<- makeGRangesFromDataFrame(bsseq_ranges_df, starts.in.df.are.0based = TRUE)
bsseq_granges
<- bsseq_obj
bsseq_obj_1based @rowRanges <- bsseq_granges
bsseq_obj_1basedgranges(bsseq_obj_1based)
## GRanges object with 3680616 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr1 10469 *
## [2] chr1 10471 *
## [3] chr1 10484 *
## [4] chr1 10489 *
## [5] chr1 10493 *
## ... ... ... ...
## [3680612] chrM 11647 *
## [3680613] chrM 11689 *
## [3680614] chrM 11692 *
## [3680615] chrM 11710 *
## [3680616] chrM 11716 *
## -------
## seqinfo: 25 sequences from an unspecified genome; no seqlengths
This to obtain a bsseq object with only CpGs contained in the escapees regions
<- findOverlaps(bsseq_granges, EscapeesRegions_GRanges)
m
<- bsseq_obj_1based[queryHits(m),] bsseq_obj_escapees
With our targeted kit we are able to cover 9113 CpGs belonging to all the escapees regions (1.24%).
Let’s create also a bsseq object with non-escapee CpGs (all the other CpGs that are not in escapees regions)
<- bsseq_obj_1based[-queryHits(m),] bsseq_obj_NonEscapees
assays(bsseq_obj_escapees)$Meth <- getMeth(bsseq_obj_escapees, type = "raw")
assays(bsseq_obj_NonEscapees)$Meth <- getMeth(bsseq_obj_NonEscapees, type = "raw")
<- c("hiPSC_rep1" = "#dd1c77", "hiPSC_rep2" = "#dd1c77", "hiPSC_rep3" = "#dd1c77", "hiPSC_rep4" = "#dd1c77",
sample_colors "iMeLC_rep1" = "#377eb8", "iMeLC_rep2" = "#377eb8", "iMeLC_rep3" = "#377eb8", "iMeLC_rep4" = "#377eb8",
"hPGCLC_rep1" = "#4daf4a", "hPGCLC_rep2" = "#4daf4a", "hPGCLC_rep3" = "#4daf4a",
"hEGCLC_rep1" = "#ff7f00", "hEGCLC_rep2" = "#ff7f00", "hEGCLC_rep3" = "#ff7f00")
violin_plot(assays(bsseq_obj_escapees)$Meth, stat="median") + ggplot2::scale_fill_manual(values = sample_colors)
## No id variables; using all as measure variables
Saving:
DataFrame of Escapee regions after filtering and conversion to hg38 (excel format)
bsseq object subset with only CpGs in escapees regions where hPGCs have meth > 0.3
bsseq object subset with non-escapee CpGs (all the other CpGs that are not in escapees regions)
::write.xlsx(as.data.frame(EscapeesRegions_GRanges) %>% rename(chr = seqnames), file = paste0(OutputFolder, "EscapeeRegions_hg38.xlsx"))
openxlsxsaveRDS(bsseq_obj_escapees, paste0(OutputFolder, "bsseq_obj_escapees.rds"))
saveRDS(bsseq_obj_NonEscapees, paste0(OutputFolder, "bsseq_obj_NonEscapees.rds"))
date()
## [1] "Tue Jan 28 15:54:30 2025"
sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] tidyr_1.3.0 rtracklayer_1.58.0
## [3] sechm_1.6.0 annotatr_1.24.0
## [5] dmrseq_1.18.1 dplyr_1.1.2
## [7] gridExtra_2.3 scales_1.2.1
## [9] bsseq_1.34.0 SummarizedExperiment_1.28.0
## [11] Biobase_2.58.0 MatrixGenerics_1.10.0
## [13] matrixStats_1.0.0 GenomicRanges_1.50.2
## [15] GenomeInfoDb_1.34.9 IRanges_2.32.0
## [17] S4Vectors_0.36.2 BiocGenerics_0.44.0
## [19] ggplot2_3.4.2
##
## loaded via a namespace (and not attached):
## [1] readxl_1.4.3 circlize_0.4.15
## [3] AnnotationHub_3.6.0 BiocFileCache_2.6.1
## [5] plyr_1.8.8 splines_4.2.1
## [7] BiocParallel_1.32.6 digest_0.6.33
## [9] ca_0.71.1 foreach_1.5.2
## [11] htmltools_0.5.5 fansi_1.0.4
## [13] magrittr_2.0.3 memoise_2.0.1
## [15] BSgenome_1.66.3 cluster_2.1.4
## [17] doParallel_1.0.17 openxlsx_4.2.5.2
## [19] tzdb_0.4.0 limma_3.54.2
## [21] ComplexHeatmap_2.14.0 Biostrings_2.66.0
## [23] readr_2.1.4 R.utils_2.12.2
## [25] prettyunits_1.1.1 colorspace_2.1-0
## [27] blob_1.2.4 rappdirs_0.3.3
## [29] xfun_0.39 crayon_1.5.2
## [31] RCurl_1.98-1.12 jsonlite_1.8.7
## [33] iterators_1.0.14 glue_1.6.2
## [35] registry_0.5-1 gtable_0.3.3
## [37] zlibbioc_1.44.0 XVector_0.38.0
## [39] V8_4.3.0 GetoptLong_1.0.5
## [41] DelayedArray_0.24.0 Rhdf5lib_1.20.0
## [43] shape_1.4.6 HDF5Array_1.26.0
## [45] DBI_1.1.3 rngtools_1.5.2
## [47] randomcoloR_1.1.0.1 Rcpp_1.0.11
## [49] xtable_1.8-4 progress_1.2.2
## [51] clue_0.3-64 bumphunter_1.40.0
## [53] bit_4.0.5 httr_1.4.6
## [55] RColorBrewer_1.1-3 ellipsis_0.3.2
## [57] farver_2.1.1 pkgconfig_2.0.3
## [59] XML_3.99-0.14 R.methodsS3_1.8.2
## [61] sass_0.4.7 dbplyr_2.3.3
## [63] locfit_1.5-9.7 utf8_1.2.3
## [65] labeling_0.4.2 tidyselect_1.2.0
## [67] rlang_1.1.1 reshape2_1.4.4
## [69] later_1.3.1 AnnotationDbi_1.60.2
## [71] cellranger_1.1.0 munsell_0.5.0
## [73] BiocVersion_3.16.0 tools_4.2.1
## [75] cachem_1.0.8 cli_3.6.1
## [77] generics_0.1.3 RSQLite_2.3.1
## [79] evaluate_0.21 stringr_1.5.0
## [81] fastmap_1.1.1 yaml_2.3.7
## [83] outliers_0.15 knitr_1.43
## [85] bit64_4.0.5 zip_2.3.0
## [87] purrr_1.0.1 KEGGREST_1.38.0
## [89] nlme_3.1-162 doRNG_1.8.6
## [91] sparseMatrixStats_1.10.0 mime_0.12
## [93] R.oo_1.25.0 xml2_1.3.5
## [95] biomaRt_2.54.1 compiler_4.2.1
## [97] rstudioapi_0.15.0 filelock_1.0.2
## [99] curl_5.0.1 png_0.1-8
## [101] interactiveDisplayBase_1.36.0 tibble_3.2.1
## [103] bslib_0.5.0 stringi_1.7.12
## [105] highr_0.10 GenomicFeatures_1.50.4
## [107] lattice_0.21-8 Matrix_1.6-0
## [109] permute_0.9-7 vctrs_0.6.3
## [111] pillar_1.9.0 lifecycle_1.0.3
## [113] rhdf5filters_1.10.1 BiocManager_1.30.20
## [115] GlobalOptions_0.1.2 jquerylib_0.1.4
## [117] data.table_1.14.8 bitops_1.0-7
## [119] seriation_1.4.2 httpuv_1.6.11
## [121] R6_2.5.1 BiocIO_1.8.0
## [123] TSP_1.2-4 promises_1.2.0.1
## [125] codetools_0.2-19 gtools_3.9.4
## [127] rhdf5_2.42.1 rjson_0.2.21
## [129] withr_2.5.0 regioneR_1.30.0
## [131] GenomicAlignments_1.34.1 Rsamtools_2.14.0
## [133] GenomeInfoDbData_1.2.9 parallel_4.2.1
## [135] hms_1.1.3 grid_4.2.1
## [137] rmarkdown_2.23 DelayedMatrixStats_1.20.0
## [139] Rtsne_0.16 shiny_1.7.4.1
## [141] restfulr_0.0.15