1. Environment setting

library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(scales)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(ChIPseeker)
## 
## ChIPseeker v1.34.1  For help: https://guangchuangyu.github.io/software/ChIPseeker
## 
## If you use ChIPseeker in published research, please cite:
## Qianwen Wang, Ming Li, Tianzhi Wu, Li Zhan, Lin Li, Meijun Chen, Wenqin Xie, Zijing Xie, Erqiang Hu, Shuangbin Xu, Guangchuang Yu. Exploring epigenomic datasets by ChIPseeker. Current Protocols 2022, 2(10): e585
library(org.Hs.eg.db)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:plotly':
## 
##     rename
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:plotly':
## 
##     slice
## 
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
## 
##     select
## The following object is masked from 'package:plotly':
## 
##     select
## 
library(DT)
library(data.table)
## 
## Attaching package: 'data.table'
## The following object is masked from 'package:IRanges':
## 
##     shift
## The following objects are masked from 'package:S4Vectors':
## 
##     first, second
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(AnnotationDbi)
library(RColorBrewer)
library(viridis)
## Loading required package: viridisLite
## 
## Attaching package: 'viridis'
## The following object is masked from 'package:scales':
## 
##     viridis_pal
library(topGO)
## Loading required package: graph
## Loading required package: GO.db
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
## 
## groupGOTerms:    GOBPTerm, GOMFTerm, GOCCTerm environments built.
## 
## Attaching package: 'topGO'
## The following object is masked from 'package:IRanges':
## 
##     members
source("../5.DMRInterpretation/TopGO/TopGO_helper_meth.R")
GTFFile <- "~/DataDir/3.TwistBedAnn/Input/gencode.v35.annotation.gtf.gz"
txdb_v35 <- GenomicFeatures::makeTxDbFromGFF(GTFFile, format="gtf")
## Import genomic features from the file as a GRanges object ...
## OK
## Prepare the 'metadata' data frame ... OK
## Make the TxDb object ...
## Warning in .get_cds_IDX(mcols0$type, mcols0$phase): The "phase" metadata column contains non-NA values for features of type
##   stop_codon. This information was ignored.
## OK

InputFolder <- params$InputFolder
OutputFolder <- params$OutputFolder

2. Loading data

2.1 BSSeq object

Loading of bsseq object of escapees generated in 2.Exploration_escapees.Rmd

bsseq_obj_escapees <- readRDS(paste0(params$InputFolder, "bsseq_obj_escapees.rds"))

2.2 Gene universe (genes annotated to twist bait regions)

TwistAnnotated <- readRDS(params$TwistAnnotated)
TwistAnnotated %>% head()
##   seqnames start   end width strand
## 1     chr1 10466 10585   120      *
## 2     chr1 10790 10909   120      *
## 3     chr1 15806 15925   120      *
## 4     chr1 18768 18887   120      *
## 5     chr1 29357 29476   120      *
## 6     chr1 36544 36663   120      *
##                                                               ann
## 1                                            cg14817997,cpg_inter
## 2                                 cg16269199,cg26928153,cpg_inter
## 3 CTCF_binding_site,cg13869341,cpg_inter,promoter_flanking_region
## 4                                            cg14008030,cpg_inter
## 5                      cg12045430,cg20826792,cpg_islands,promoter
## 6                   cg18231760,cpg_inter,promoter_flanking_region
##         annotation geneChr geneStart geneEnd geneLength geneStrand
## 1 Promoter (1-2kb)       1     11869   14409       2541          1
## 2 Promoter (<=1kb)       1     11869   14409       2541          1
## 3 Promoter (1-2kb)       1     17369   17436         68          2
## 4 Promoter (1-2kb)       1     17369   17436         68          2
## 5 Promoter (<=1kb)       1     29554   31097       1544          1
## 6 Promoter (<=1kb)       1     34554   36081       1528          2
##   ensembl_gene_id_version      transcriptId distanceToTSS
## 1       ENSG00000223972.5 ENST00000456328.2         -1284
## 2       ENSG00000223972.5 ENST00000456328.2          -960
## 3       ENSG00000278267.1 ENST00000619216.1          1511
## 4       ENSG00000278267.1 ENST00000619216.1         -1332
## 5       ENSG00000243485.5 ENST00000473358.1           -78
## 6       ENSG00000237613.2 ENST00000417324.1          -463
##                                                               flank_txIds
## 1                   ENST00000456328.2;ENST00000450305.2;ENST00000488147.1
## 2                   ENST00000456328.2;ENST00000450305.2;ENST00000488147.1
## 3 ENST00000456328.2;ENST00000450305.2;ENST00000488147.1;ENST00000619216.1
## 4                   ENST00000456328.2;ENST00000488147.1;ENST00000619216.1
## 5 ENST00000488147.1;ENST00000473358.1;ENST00000469289.1;ENST00000607096.1
## 6                                     ENST00000417324.1;ENST00000461467.1
##                                                             flank_geneIds
## 1                   ENSG00000223972.5;ENSG00000223972.5;ENSG00000227232.5
## 2                   ENSG00000223972.5;ENSG00000223972.5;ENSG00000227232.5
## 3 ENSG00000223972.5;ENSG00000223972.5;ENSG00000227232.5;ENSG00000278267.1
## 4                   ENSG00000223972.5;ENSG00000227232.5;ENSG00000278267.1
## 5 ENSG00000227232.5;ENSG00000243485.5;ENSG00000243485.5;ENSG00000284332.1
## 6                                     ENSG00000237613.2;ENSG00000237613.2
##   flank_gene_distances ensembl_gene_id hgnc_symbol external_gene_name
## 1        -1284;-1425;0 ENSG00000223972     DDX11L1            DDX11L1
## 2         -960;-1101;0 ENSG00000223972     DDX11L1            DDX11L1
## 3     3937;3796;0;1511 ENSG00000278267   MIR6859-1          MIR6859-1
## 4         6899;0;-1332 ENSG00000278267   MIR6859-1          MIR6859-1
## 5      0;-78;-791;-890 ENSG00000243485 MIR1302-2HG        MIR1302-2HG
## 6            -463;-471 ENSG00000237613     FAM138A            FAM138A
##                         gene_biotype
## 1 transcribed_unprocessed_pseudogene
## 2 transcribed_unprocessed_pseudogene
## 3                              miRNA
## 4                              miRNA
## 5                             lncRNA
## 6                             lncRNA
##                                                                        description
## 1   DEAD/H-box helicase 11 like 1 (pseudogene) [Source:HGNC Symbol;Acc:HGNC:37102]
## 2   DEAD/H-box helicase 11 like 1 (pseudogene) [Source:HGNC Symbol;Acc:HGNC:37102]
## 3                              microRNA 6859-1 [Source:HGNC Symbol;Acc:HGNC:50039]
## 4                              microRNA 6859-1 [Source:HGNC Symbol;Acc:HGNC:50039]
## 5                          MIR1302-2 host gene [Source:HGNC Symbol;Acc:HGNC:52482]
## 6 family with sequence similarity 138 member A [Source:HGNC Symbol;Acc:HGNC:32334]
##   chromosome_name start_position end_position
## 1               1          11869        14409
## 2               1          11869        14409
## 3               1          17369        17436
## 4               1          17369        17436
## 5               1          29554        31109
## 6               1          34554        36081

Genes_df <- TwistAnnotated[, c("ensembl_gene_id_version", "ensembl_gene_id", "hgnc_symbol", "gene_biotype")] %>% dplyr::distinct()%>%filter(!is.na(hgnc_symbol))%>%filter(!hgnc_symbol%in%"")
rownames(Genes_df) <- NULL

These are the genes with the same symbol but different ensembl id

Genes_df[Genes_df$hgnc_symbol%in%Genes_df[duplicated(Genes_df$hgnc_symbol), "hgnc_symbol"],]
##       ensembl_gene_id_version ensembl_gene_id hgnc_symbol
## 3309        ENSG00000285053.1 ENSG00000285053        TBCE
## 3310        ENSG00000284770.2 ENSG00000284770        TBCE
## 20449       ENSG00000237940.3 ENSG00000237940   LINC01238
## 20451       ENSG00000261186.2 ENSG00000261186   LINC01238
## 21924      ENSG00000206195.11 ENSG00000206195      DUXAP8
## 21926       ENSG00000271672.1 ENSG00000271672      DUXAP8
## 24581       ENSG00000284862.3 ENSG00000284862      CCDC39
## 24582      ENSG00000145075.13 ENSG00000145075      CCDC39
## 30308       ENSG00000272655.2 ENSG00000272655     POLR2J4
## 30309       ENSG00000214783.9 ENSG00000214783     POLR2J4
##                             gene_biotype
## 3309                      protein_coding
## 3310                      protein_coding
## 20449                             lncRNA
## 20451                             lncRNA
## 21924                             lncRNA
## 21926   transcribed_processed_pseudogene
## 24581                     protein_coding
## 24582                             lncRNA
## 30308 transcribed_unprocessed_pseudogene
## 30309                             lncRNA

Genes_df$entrez_gene_id <- as.vector(mapIds(org.Hs.eg.db, Genes_df$hgnc_symbol, column = "ENTREZID", keytype = "SYMBOL"))
## 'select()' returned 1:many mapping between keys and columns

nrow(Genes_df[is.na(Genes_df$entrez_gene_id),]) #number of genes without an entrez id
## [1] 421

GeneUniverse_symbol <- unique(Genes_df$hgnc_symbol)
GeneUniverse_entrezid <- na.omit(unique(Genes_df$entrez_gene_id))

3. Escapee CpG annotation and visualization with `ChIPseeker`

CpGs are now annotated to their associated genes and genomic regions using the annotatePeaks function of the ChIPseekerR package, using a TxDb object. The GenomicFeatures package uses TxDb objects to store transcript metadata. This class maps the 5’ and 3’ untranslated regions (UTRs), protein coding sequences (CDSs) and exons for a set of mRNA transcripts to their associated genome. All TxDb objects are backed by a SQLite database that manages genomic locations and the relationships between pre-processed mRNA transcripts, exons, protein coding sequences, and their related gene identifiers.

TxDb created by us

Remember that bsseq_obj_escapees (and bsseq_obj_NonEscapees) have 1-based positions

bsseq_obj_escapees
## Loading required package: bsseq
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## The following object is masked from 'package:Biobase':
## 
##     rowMedians
## An object of type 'BSseq' with
##   9113 methylation loci
##   14 samples
## has not been smoothed
## All assays are in-memory

granges(bsseq_obj_escapees)
## GRanges object with 9113 ranges and 0 metadata columns:
##          seqnames    ranges strand
##             <Rle> <IRanges>  <Rle>
##      [1]     chr1   2155089      *
##      [2]     chr1   2155133      *
##      [3]     chr1   2812643      *
##      [4]     chr1   2812659      *
##      [5]     chr1   2812704      *
##      ...      ...       ...    ...
##   [9109]     chrX 142205308      *
##   [9110]     chrX 147279247      *
##   [9111]     chrX 147279251      *
##   [9112]     chrX 147279260      *
##   [9113]     chrX 154807851      *
##   -------
##   seqinfo: 25 sequences from an unspecified genome; no seqlengths

EscapeeAnno <- annotatePeak(
  granges(bsseq_obj_escapees),
  tssRegion = c(-3000, 3000),
  TxDb = txdb_v35,
  level = "transcript", #level = "gene"
  assignGenomicAnnotation = TRUE,
  genomicAnnotationPriority = c("Promoter", "5UTR", "3UTR", "Exon", "Intron",
    "Downstream", "Intergenic"),
  overlap = "TSS",
  verbose = TRUE,
  columns = c("ENTREZID", "ENSEMBL", "SYMBOL", "GENENAME")
)
## >> preparing features information...      2025-01-28 04:07:26 PM 
## >> identifying nearest features...        2025-01-28 04:07:30 PM 
## >> calculating distance from peak to TSS...   2025-01-28 04:07:31 PM 
## >> assigning genomic annotation...        2025-01-28 04:07:31 PM 
## >> assigning chromosome lengths           2025-01-28 04:08:00 PM 
## >> done...                    2025-01-28 04:08:00 PM

ChIPseeker::plotAnnoBar(EscapeeAnno)

ChIPseeker::plotAnnoPie(EscapeeAnno)

ChIPseeker::vennpie(EscapeeAnno)

ChIPseeker::plotDistToTSS(EscapeeAnno)

From ensembl ids to gene symbols, entrez ids and metadata

Host <- "https://aug2020.archive.ensembl.org"
Specie <- "hsapiens"
BioMart <- biomaRt::listMarts(host=Host)[1,1]
Version <- biomaRt::listMarts(host=Host)[1,2]
Mart <- biomaRt::useMart(host=Host, biomart=BioMart, version=Version, 
                         dataset=paste0(Specie,'_gene_ensembl'))

EscapeeGenes_ensembl = unique(as.data.frame(EscapeeAnno)$geneId) #Genes assigned to the escapees regions by chipseeker

Attributes = c("ensembl_gene_id_version", "ensembl_gene_id", "hgnc_symbol", "gene_biotype")

EscapeeGeneAnnotation <- biomaRt::getBM(mart=Mart, filters='ensembl_gene_id_version', values=EscapeeGenes_ensembl, attributes=Attributes, uniqueRows=TRUE)
EscapeeGeneAnnotation <- EscapeeGeneAnnotation %>% dplyr::distinct() #%>%filter(!is.na(hgnc_symbol)) %>% filter(!hgnc_symbol%in%"")

EscapeeGenes_ensembl[!EscapeeGenes_ensembl %in% EscapeeGeneAnnotation$ensembl_gene_id_version] %>% head() #ChipSeeker annotation that are not present in the biomart
## character(0)

EscapeeGenes_symbol = EscapeeGeneAnnotation %>% filter(!is.na(hgnc_symbol)) %>% filter(!hgnc_symbol%in%"") %>% pull(hgnc_symbol) %>% unique()

EscapeeGenes_entrez = mapIds(org.Hs.eg.db, keys = EscapeeGenes_symbol, column = "ENTREZID", keytype = "SYMBOL") %>% unique()
## 'select()' returned 1:1 mapping between keys and columns

AnnotatedEscapees <- dplyr::left_join(as.data.frame(EscapeeAnno), EscapeeGeneAnnotation, by=c('geneId' = 'ensembl_gene_id_version'))

4. Functional enrichment analysis

4.1 Selection of modulated genes and generation of gene vector

GeneVectors <- topGOGeneVectors_meth_v2(gene_symbols = EscapeeGenes_symbol, genomic_type = "all", gene_type = "all", Universe = GeneUniverse_symbol) %>% unlist()
## [1] "All genomic regions and gene types will be kept"
## [1] "There are some genes symbols which are not in the gene universe! They will not be considered..."

Genes associated to escapee CpGs that are not present in the universe:

not_present <- EscapeeGeneAnnotation %>% filter(hgnc_symbol %in% unique(na.omit(EscapeeGenes_symbol))[!unique(na.omit(EscapeeGenes_symbol)) %in%  GeneUniverse_symbol])
not_present

##   ensembl_gene_id_version ensembl_gene_id hgnc_symbol gene_biotype
## 1       ENSG00000278970.2 ENSG00000278970        HEIH       lncRNA

These are the CpGs annotated with those genes

EscapeeAnno@anno %>% as.data.frame() %>% filter(geneId %in% not_present$ensembl_gene_id_version)
##   seqnames     start       end width strand       annotation geneChr geneStart
## 1     chr5 180830679 180830679     1      * Promoter (<=1kb)       5 180826871
## 2     chr5 180830691 180830691     1      * Promoter (<=1kb)       5 180826871
## 3     chr5 180830697 180830697     1      * Promoter (<=1kb)       5 180826871
## 4     chr5 180830795 180830795     1      * Promoter (<=1kb)       5 180826871
## 5     chr5 180830797 180830797     1      * Promoter (<=1kb)       5 180826871
##     geneEnd geneLength geneStrand            geneId      transcriptId
## 1 180830659       3789          2 ENSG00000278970.2 ENST00000651563.1
## 2 180830659       3789          2 ENSG00000278970.2 ENST00000651563.1
## 3 180830659       3789          2 ENSG00000278970.2 ENST00000651563.1
## 4 180830659       3789          2 ENSG00000278970.2 ENST00000651563.1
## 5 180830659       3789          2 ENSG00000278970.2 ENST00000651563.1
##   distanceToTSS
## 1           -20
## 2           -32
## 3           -38
## 4          -136
## 5          -138

BpEval <- ifelse(length(grep('BP', params$TopGO))!=0, TRUE, FALSE)
MfEval <- ifelse(length(grep('MF', params$TopGO))!=0, TRUE, FALSE)
CcEval <- ifelse(length(grep('CC', params$TopGO))!=0, TRUE, FALSE)

Therefore:

universe genes: 36011 genes
escapee genes: 1039 genes

4.2 TopGO analysis: Biological Process

On the basis of the analysis settings, the enrichment for Biological Process IS performed.

# I generate a list that contains the association between each gene and the GO terms that are associated to it
BPann <- topGO::annFUN.org(whichOnto="BP", feasibleGenes=names(GeneVectors), 
                           mapping="org.Hs.eg.db", ID="symbol") %>% inverseList()

# Wrapper function for topGO analysis 
ResBPAll <- topGOResults_new(Genes=GeneVectors, gene2GO=BPann, ontology='BP', 
                         desc=NULL, nodeSize=15, algorithm='weight01', statistic='fisher', 
                         EnTh=params$GoEnTh, PvalTh=params$GoPvalTh, minTerms=12, geneTh=4,
                         saveRes=TRUE, outDir=paste0(OutputFolder), fileName='BPAll', maxAnn = 500)
## Gene vector contains levels: 0,1
## 
## Building most specific GOs .....
##  ( 12544 GO terms found. )
## 
## Build GO DAG topology ..........
##  ( 15945 GO terms and 36197 relations. )
## 
## Annotating nodes ...............
##  ( 18089 genes annotated to the GO terms. )
## 
##           -- Weight01 Algorithm -- 
## 
##       the algorithm is scoring 4417 nontrivial nodes
##       parameters: 
##           test statistic: fisher
## 
##   Level 18:  2 nodes to be scored    (0 eliminated genes)
## 
##   Level 17:  4 nodes to be scored    (0 eliminated genes)
## 
##   Level 16:  9 nodes to be scored    (45 eliminated genes)
## 
##   Level 15:  20 nodes to be scored   (101 eliminated genes)
## 
##   Level 14:  43 nodes to be scored   (264 eliminated genes)
## 
##   Level 13:  75 nodes to be scored   (581 eliminated genes)
## 
##   Level 12:  137 nodes to be scored  (1490 eliminated genes)
## 
##   Level 11:  289 nodes to be scored  (3905 eliminated genes)
## 
##   Level 10:  477 nodes to be scored  (6204 eliminated genes)
## 
##   Level 9:   628 nodes to be scored  (8193 eliminated genes)
## 
##   Level 8:   679 nodes to be scored  (10958 eliminated genes)
## 
##   Level 7:   726 nodes to be scored  (13312 eliminated genes)
## 
##   Level 6:   620 nodes to be scored  (15493 eliminated genes)
## 
##   Level 5:   383 nodes to be scored  (16686 eliminated genes)
## 
##   Level 4:   216 nodes to be scored  (17396 eliminated genes)
## 
##   Level 3:   90 nodes to be scored   (17691 eliminated genes)
## 
##   Level 2:   18 nodes to be scored   (17829 eliminated genes)
## 
##   Level 1:   1 nodes to be scored    (17896 eliminated genes)

Result visualization: Barplot

nterms=20
cols <- c(All='forestgreen', Down='blue', Up='red')
pals <- lapply(cols, {function(el) grDevices::colorRampPalette(c('gold', el))(nterms)})

if(nrow(ResBPAll$ResSel)>0){
  topGOBarplot_new(TopGORes=ResBPAll$ResSel, terms=nterms, pvalTh=0.01, plotTitle=paste0("All Escapees genes related BP terms"),
                             palette=pals$All, flip_x=FALSE)
}

Result visualization: Bubbleplot

if(nrow(ResBPAll$ResSel)>0){
  bubbleplot(ResBPAll$ResSel, terms=nterms, Ont = "BP", pvalTh=0.01, plotTitle="All Escapees genes related BP terms")
  }
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Result visualization: table of genes in top terms

if(nrow(ResBPAll$ResSel)>0){
  table_GenesInTerm_BPAll <- TableGenesInTerm(ResBPAll$ResSel, ResBPAll$GOdata, nterms=nterms)
}

table_GenesInTerm_BPAll %>% dplyr::select(-c(Score, GOid)) %>%
    datatable(class = 'hover', rownames = FALSE, extension='Buttons', escape = FALSE, caption = paste0("All genes in top ", nterms, " BP terms"), options = list(pageLength=10, dom='Bfrtip', autoWidth=TRUE, buttons=list(c('csv', 'excel'))))

4.3 TopGO analysis: Molecular Function

On the basis of the analysis settings, the enrichment for Molecular Function IS performed.

# I generate a list that contains the association between each gene and the GO terms that are associated to it
MFann <- topGO::annFUN.org(whichOnto="MF", feasibleGenes=names(GeneVectors), 
                           mapping="org.Hs.eg.db", ID="symbol") %>% inverseList()

# Wrapper function for topGO analysis 
ResMFAll <- topGOResults_new(Genes=GeneVectors, gene2GO=MFann, ontology='MF', 
                         desc=NULL, nodeSize=15, algorithm='weight01', statistic='fisher', 
                         EnTh=params$GoEnTh, PvalTh=params$GoPvalTh, minTerms=12, geneTh=4,
                         saveRes=TRUE, outDir=paste0(OutputFolder), fileName='MFAll', maxAnn = 500)
## Gene vector contains levels: 0,1
## 
## Building most specific GOs .....
##  ( 4502 GO terms found. )
## 
## Build GO DAG topology ..........
##  ( 4948 GO terms and 6418 relations. )
## 
## Annotating nodes ...............
##  ( 17822 genes annotated to the GO terms. )
## 
##           -- Weight01 Algorithm -- 
## 
##       the algorithm is scoring 766 nontrivial nodes
##       parameters: 
##           test statistic: fisher
## 
##   Level 11:  7 nodes to be scored    (0 eliminated genes)
## 
##   Level 10:  12 nodes to be scored   (0 eliminated genes)
## 
##   Level 9:   34 nodes to be scored   (205 eliminated genes)
## 
##   Level 8:   60 nodes to be scored   (1511 eliminated genes)
## 
##   Level 7:   100 nodes to be scored  (3866 eliminated genes)
## 
##   Level 6:   150 nodes to be scored  (4653 eliminated genes)
## 
##   Level 5:   175 nodes to be scored  (6769 eliminated genes)
## 
##   Level 4:   160 nodes to be scored  (10746 eliminated genes)
## 
##   Level 3:   51 nodes to be scored   (13969 eliminated genes)
## 
##   Level 2:   16 nodes to be scored   (15191 eliminated genes)
## 
##   Level 1:   1 nodes to be scored    (17628 eliminated genes)

Result visualization: Barplot

nterms=20
cols <- c(All='forestgreen', Down='blue', Up='red')
pals <- lapply(cols, {function(el) grDevices::colorRampPalette(c('gold', el))(nterms)})

if(nrow(ResMFAll$ResSel)>0){
  topGOBarplot_new(TopGORes=ResMFAll$ResSel, terms=nterms, pvalTh=0.01, plotTitle=paste0("All Escapees genes related MF terms"),
                             palette=pals$All, flip_x=FALSE)
}

Result visualization: Bubbleplot

if(nrow(ResMFAll$ResSel)>0){
  bubbleplot(ResMFAll$ResSel, terms=nterms, Ont = "MF", pvalTh=0.01, plotTitle="All Escapees genes related MF terms")
  }

Result visualization: table of genes in top terms

if(nrow(ResMFAll$ResSel)>0){
  table_GenesInTerm_MFAll <- TableGenesInTerm(ResMFAll$ResSel, ResMFAll$GOdata, nterms=nterms)
}

table_GenesInTerm_MFAll %>% dplyr::select(-c(Score, GOid)) %>%
    datatable(class = 'hover', rownames = FALSE, extension='Buttons', escape = FALSE, caption = paste0("All genes in top ", nterms, " MF terms"), options = list(pageLength=10, dom='Bfrtip', autoWidth=TRUE, buttons=list(c('csv', 'excel'))))

4.4 TopGO analysis: Cellular Component

On the basis of the analysis settings, the enrichment for Cellular Component IS performed.

# I generate a list that contains the association between each gene and the GO terms that are associated to it
CCann <- topGO::annFUN.org(whichOnto="CC", feasibleGenes=names(GeneVectors), 
                           mapping="org.Hs.eg.db", ID="symbol") %>% inverseList()

# Wrapper function for topGO analysis 
ResCCAll <- topGOResults_new(Genes=GeneVectors, gene2GO=CCann, ontology='CC', 
                         desc=NULL, nodeSize=15, algorithm='weight01', statistic='fisher', 
                         EnTh=params$GoEnTh, PvalTh=params$GoPvalTh, minTerms=12, geneTh=4,
                         saveRes=TRUE, outDir=paste0(OutputFolder), fileName='CCAll', maxAnn = 500)
## Gene vector contains levels: 0,1
## 
## Building most specific GOs .....
##  ( 1828 GO terms found. )
## 
## Build GO DAG topology ..........
##  ( 2020 GO terms and 3396 relations. )
## 
## Annotating nodes ...............
##  ( 18985 genes annotated to the GO terms. )
## 
##           -- Weight01 Algorithm -- 
## 
##       the algorithm is scoring 543 nontrivial nodes
##       parameters: 
##           test statistic: fisher
## 
##   Level 11:  17 nodes to be scored   (0 eliminated genes)
## 
##   Level 10:  53 nodes to be scored   (0 eliminated genes)
## 
##   Level 9:   77 nodes to be scored   (793 eliminated genes)
## 
##   Level 8:   80 nodes to be scored   (3062 eliminated genes)
## 
##   Level 7:   82 nodes to be scored   (5871 eliminated genes)
## 
##   Level 6:   81 nodes to be scored   (10292 eliminated genes)
## 
##   Level 5:   67 nodes to be scored   (12598 eliminated genes)
## 
##   Level 4:   45 nodes to be scored   (15415 eliminated genes)
## 
##   Level 3:   38 nodes to be scored   (17747 eliminated genes)
## 
##   Level 2:   2 nodes to be scored    (18615 eliminated genes)
## 
##   Level 1:   1 nodes to be scored    (18870 eliminated genes)

Result visualization: Barplot

nterms=20
cols <- c(All='forestgreen', Down='blue', Up='red')
pals <- lapply(cols, {function(el) grDevices::colorRampPalette(c('gold', el))(nterms)})

if(nrow(ResCCAll$ResSel)>0){
  topGOBarplot_new(TopGORes=ResCCAll$ResSel, terms=nterms, pvalTh=0.01, plotTitle=paste0("All Escapees genes related CC terms"),
                             palette=pals$All, flip_x=FALSE)
}

Result visualization: Bubbleplot

if(nrow(ResCCAll$ResSel)>0){
  bubbleplot(ResCCAll$ResSel, terms=nterms, Ont = "CC", pvalTh=0.01, plotTitle="All Escapees genes related CC terms")
  }

Result visualization: table of genes in top terms

if(nrow(ResCCAll$ResSel)>0){
  table_GenesInTerm_CCAll <- TableGenesInTerm(ResCCAll$ResSel, ResCCAll$GOdata, nterms=nterms)
}

table_GenesInTerm_CCAll %>% dplyr::select(-c(Score, GOid)) %>%
    datatable(class = 'hover', rownames = FALSE, extension='Buttons', escape = FALSE, caption = paste0("All genes in top ", nterms, " CC terms"), options = list(pageLength=10, dom='Bfrtip', autoWidth=TRUE, buttons=list(c('csv', 'excel'))))

5. Saving and Session Info

save.image(paste0(OutputFolder, "AnnotationAndFunctionalEnv.RData"))
saveRDS(AnnotatedEscapees, paste0(OutputFolder, "AnnotatedEscapees.rds"))

file_conn <- file(paste0(OutputFolder, "EscapeeGenes_ensembl.txt"), "w")
for (element in gsub(x = EscapeeGenes_ensembl, pattern = "\\..*", replacement = "")) {
  cat(element, "\n", file = file_conn)
}
close(file_conn)

file_conn <- file(paste0(OutputFolder, "EscapeeGenes_symbol.txt"), "w")
for (element in EscapeeGenes_symbol) {
  cat(element, "\n", file = file_conn)
}
close(file_conn)

file_conn <- file(paste0(OutputFolder, "GeneUniverse_symbol.txt"), "w")
for (element in GeneUniverse_symbol) {
  cat(element, "\n", file = file_conn)
}
close(file_conn)

SessionInfo <- sessionInfo()
Date <- date()

Date
## [1] "Tue Jan 28 16:12:15 2025"
SessionInfo
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] bsseq_1.34.0                SummarizedExperiment_1.28.0
##  [3] MatrixGenerics_1.10.0       matrixStats_1.0.0          
##  [5] GenomicRanges_1.50.2        GenomeInfoDb_1.34.9        
##  [7] topGO_2.50.0                SparseM_1.81               
##  [9] GO.db_3.16.0                graph_1.76.0               
## [11] viridis_0.6.2               viridisLite_0.4.2          
## [13] RColorBrewer_1.1-3          data.table_1.14.8          
## [15] DT_0.28                     org.Hs.eg.db_3.16.0        
## [17] AnnotationDbi_1.60.2        IRanges_2.32.0             
## [19] S4Vectors_0.36.2            Biobase_2.58.0             
## [21] BiocGenerics_0.44.0         ChIPseeker_1.34.1          
## [23] gridExtra_2.3               scales_1.2.1               
## [25] dplyr_1.1.2                 plotly_4.10.2              
## [27] ggplot2_3.4.2              
## 
## loaded via a namespace (and not attached):
##   [1] shadowtext_0.1.2                       
##   [2] fastmatch_1.1-3                        
##   [3] BiocFileCache_2.6.1                    
##   [4] plyr_1.8.8                             
##   [5] igraph_1.5.0                           
##   [6] lazyeval_0.2.2                         
##   [7] splines_4.2.1                          
##   [8] crosstalk_1.2.0                        
##   [9] BiocParallel_1.32.6                    
##  [10] digest_0.6.33                          
##  [11] yulab.utils_0.0.6                      
##  [12] htmltools_0.5.5                        
##  [13] GOSemSim_2.24.0                        
##  [14] fansi_1.0.4                            
##  [15] magrittr_2.0.3                         
##  [16] memoise_2.0.1                          
##  [17] BSgenome_1.66.3                        
##  [18] limma_3.54.2                           
##  [19] Biostrings_2.66.0                      
##  [20] graphlayouts_0.8.4                     
##  [21] R.utils_2.12.2                         
##  [22] enrichplot_1.18.4                      
##  [23] prettyunits_1.1.1                      
##  [24] colorspace_2.1-0                       
##  [25] blob_1.2.4                             
##  [26] rappdirs_0.3.3                         
##  [27] ggrepel_0.9.3                          
##  [28] xfun_0.39                              
##  [29] crayon_1.5.2                           
##  [30] RCurl_1.98-1.12                        
##  [31] jsonlite_1.8.7                         
##  [32] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
##  [33] scatterpie_0.1.8                       
##  [34] ape_5.7-1                              
##  [35] glue_1.6.2                             
##  [36] polyclip_1.10-4                        
##  [37] gtable_0.3.3                           
##  [38] zlibbioc_1.44.0                        
##  [39] XVector_0.38.0                         
##  [40] DelayedArray_0.24.0                    
##  [41] Rhdf5lib_1.20.0                        
##  [42] HDF5Array_1.26.0                       
##  [43] DOSE_3.24.2                            
##  [44] DBI_1.1.3                              
##  [45] Rcpp_1.0.11                            
##  [46] plotrix_3.8-2                          
##  [47] progress_1.2.2                         
##  [48] gridGraphics_0.5-1                     
##  [49] tidytree_0.4.2                         
##  [50] bit_4.0.5                              
##  [51] htmlwidgets_1.6.2                      
##  [52] httr_1.4.6                             
##  [53] fgsea_1.24.0                           
##  [54] gplots_3.1.3                           
##  [55] ellipsis_0.3.2                         
##  [56] R.methodsS3_1.8.2                      
##  [57] pkgconfig_2.0.3                        
##  [58] XML_3.99-0.14                          
##  [59] farver_2.1.1                           
##  [60] sass_0.4.7                             
##  [61] dbplyr_2.3.3                           
##  [62] locfit_1.5-9.7                         
##  [63] utf8_1.2.3                             
##  [64] labeling_0.4.2                         
##  [65] ggplotify_0.1.0                        
##  [66] tidyselect_1.2.0                       
##  [67] rlang_1.1.1                            
##  [68] reshape2_1.4.4                         
##  [69] munsell_0.5.0                          
##  [70] tools_4.2.1                            
##  [71] cachem_1.0.8                           
##  [72] cli_3.6.1                              
##  [73] generics_0.1.3                         
##  [74] RSQLite_2.3.1                          
##  [75] evaluate_0.21                          
##  [76] stringr_1.5.0                          
##  [77] fastmap_1.1.1                          
##  [78] yaml_2.3.7                             
##  [79] ggtree_3.6.2                           
##  [80] knitr_1.43                             
##  [81] bit64_4.0.5                            
##  [82] tidygraph_1.2.3                        
##  [83] caTools_1.18.2                         
##  [84] purrr_1.0.1                            
##  [85] KEGGREST_1.38.0                        
##  [86] ggraph_2.1.0                           
##  [87] sparseMatrixStats_1.10.0               
##  [88] nlme_3.1-162                           
##  [89] R.oo_1.25.0                            
##  [90] aplot_0.1.10                           
##  [91] xml2_1.3.5                             
##  [92] biomaRt_2.54.1                         
##  [93] compiler_4.2.1                         
##  [94] rstudioapi_0.15.0                      
##  [95] filelock_1.0.2                         
##  [96] curl_5.0.1                             
##  [97] png_0.1-8                              
##  [98] treeio_1.23.1                          
##  [99] tibble_3.2.1                           
## [100] tweenr_2.0.2                           
## [101] bslib_0.5.0                            
## [102] stringi_1.7.12                         
## [103] highr_0.10                             
## [104] GenomicFeatures_1.50.4                 
## [105] lattice_0.21-8                         
## [106] Matrix_1.6-0                           
## [107] permute_0.9-7                          
## [108] vctrs_0.6.3                            
## [109] rhdf5filters_1.10.1                    
## [110] pillar_1.9.0                           
## [111] lifecycle_1.0.3                        
## [112] jquerylib_0.1.4                        
## [113] cowplot_1.1.1                          
## [114] bitops_1.0-7                           
## [115] patchwork_1.1.2                        
## [116] rtracklayer_1.58.0                     
## [117] qvalue_2.30.0                          
## [118] R6_2.5.1                               
## [119] BiocIO_1.8.0                           
## [120] KernSmooth_2.23-22                     
## [121] codetools_0.2-19                       
## [122] gtools_3.9.4                           
## [123] boot_1.3-28.1                          
## [124] MASS_7.3-60                            
## [125] rhdf5_2.42.1                           
## [126] rjson_0.2.21                           
## [127] withr_2.5.0                            
## [128] GenomicAlignments_1.34.1               
## [129] Rsamtools_2.14.0                       
## [130] GenomeInfoDbData_1.2.9                 
## [131] parallel_4.2.1                         
## [132] hms_1.1.3                              
## [133] grid_4.2.1                             
## [134] ggfun_0.0.9                            
## [135] tidyr_1.3.0                            
## [136] HDO.db_0.99.1                          
## [137] DelayedMatrixStats_1.20.0              
## [138] rmarkdown_2.23                         
## [139] ggforce_0.4.1                          
## [140] restfulr_0.0.15

Annotation of escapee CpGs

Gaja Matassa

Date: January 28, 2025

1. Environment setting

2. Loading data

2.1 BSSeq object

2.2 Gene universe (genes annotated to twist bait regions)

3. Escapee CpG annotation and visualization with `ChIPseeker`

4. Functional enrichment analysis

4.1 Selection of modulated genes and generation of gene vector

4.2 TopGO analysis: Biological Process

Result visualization: Barplot

Result visualization: Bubbleplot

Result visualization: table of genes in top terms

4.3 TopGO analysis: Molecular Function

Result visualization: Barplot

Result visualization: Bubbleplot

Result visualization: table of genes in top terms

4.4 TopGO analysis: Cellular Component

Result visualization: Barplot

Result visualization: Bubbleplot

Result visualization: table of genes in top terms

5. Saving and Session Info

Annotation of escapee CpGs

Gaja Matassa

Date: January 28, 2025

1. Environment setting

2. Loading data

2.1 BSSeq object

2.2 Gene universe (genes annotated to twist bait regions)

3. Escapee CpG annotation and visualization with ChIPseeker

4. Functional enrichment analysis

4.1 Selection of modulated genes and generation of gene vector

4.2 TopGO analysis: Biological Process

Result visualization: Barplot

Result visualization: Bubbleplot

Result visualization: table of genes in top terms

4.3 TopGO analysis: Molecular Function

Result visualization: Barplot

Result visualization: Bubbleplot

Result visualization: table of genes in top terms

4.4 TopGO analysis: Cellular Component

Result visualization: Barplot

Result visualization: Bubbleplot

Result visualization: table of genes in top terms

5. Saving and Session Info

3. Escapee CpG annotation and visualization with `ChIPseeker`