library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(scales)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(ChIPseeker)
##
## ChIPseeker v1.34.1 For help: https://guangchuangyu.github.io/software/ChIPseeker
##
## If you use ChIPseeker in published research, please cite:
## Qianwen Wang, Ming Li, Tianzhi Wu, Li Zhan, Lin Li, Meijun Chen, Wenqin Xie, Zijing Xie, Erqiang Hu, Shuangbin Xu, Guangchuang Yu. Exploring epigenomic datasets by ChIPseeker. Current Protocols 2022, 2(10): e585
library(org.Hs.eg.db)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:plotly':
##
## rename
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:plotly':
##
## slice
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:plotly':
##
## select
##
library(DT)
library(data.table)
##
## Attaching package: 'data.table'
## The following object is masked from 'package:IRanges':
##
## shift
## The following objects are masked from 'package:S4Vectors':
##
## first, second
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(AnnotationDbi)
library(RColorBrewer)
library(viridis)
## Loading required package: viridisLite
##
## Attaching package: 'viridis'
## The following object is masked from 'package:scales':
##
## viridis_pal
library(topGO)
## Loading required package: graph
## Loading required package: GO.db
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
##
## groupGOTerms: GOBPTerm, GOMFTerm, GOCCTerm environments built.
##
## Attaching package: 'topGO'
## The following object is masked from 'package:IRanges':
##
## members
source("../5.DMRInterpretation/TopGO/TopGO_helper_meth.R")
<- "~/DataDir/3.TwistBedAnn/Input/gencode.v35.annotation.gtf.gz"
GTFFile <- GenomicFeatures::makeTxDbFromGFF(GTFFile, format="gtf")
txdb_v35 ## Import genomic features from the file as a GRanges object ...
## OK
## Prepare the 'metadata' data frame ... OK
## Make the TxDb object ...
## Warning in .get_cds_IDX(mcols0$type, mcols0$phase): The "phase" metadata column contains non-NA values for features of type
## stop_codon. This information was ignored.
## OK
<- params$InputFolder
InputFolder <- params$OutputFolder OutputFolder
Loading of bsseq object of escapees generated in 2.Exploration_escapees.Rmd
<- readRDS(paste0(params$InputFolder, "bsseq_obj_escapees.rds")) bsseq_obj_escapees
<- readRDS(params$TwistAnnotated)
TwistAnnotated %>% head()
TwistAnnotated ## seqnames start end width strand
## 1 chr1 10466 10585 120 *
## 2 chr1 10790 10909 120 *
## 3 chr1 15806 15925 120 *
## 4 chr1 18768 18887 120 *
## 5 chr1 29357 29476 120 *
## 6 chr1 36544 36663 120 *
## ann
## 1 cg14817997,cpg_inter
## 2 cg16269199,cg26928153,cpg_inter
## 3 CTCF_binding_site,cg13869341,cpg_inter,promoter_flanking_region
## 4 cg14008030,cpg_inter
## 5 cg12045430,cg20826792,cpg_islands,promoter
## 6 cg18231760,cpg_inter,promoter_flanking_region
## annotation geneChr geneStart geneEnd geneLength geneStrand
## 1 Promoter (1-2kb) 1 11869 14409 2541 1
## 2 Promoter (<=1kb) 1 11869 14409 2541 1
## 3 Promoter (1-2kb) 1 17369 17436 68 2
## 4 Promoter (1-2kb) 1 17369 17436 68 2
## 5 Promoter (<=1kb) 1 29554 31097 1544 1
## 6 Promoter (<=1kb) 1 34554 36081 1528 2
## ensembl_gene_id_version transcriptId distanceToTSS
## 1 ENSG00000223972.5 ENST00000456328.2 -1284
## 2 ENSG00000223972.5 ENST00000456328.2 -960
## 3 ENSG00000278267.1 ENST00000619216.1 1511
## 4 ENSG00000278267.1 ENST00000619216.1 -1332
## 5 ENSG00000243485.5 ENST00000473358.1 -78
## 6 ENSG00000237613.2 ENST00000417324.1 -463
## flank_txIds
## 1 ENST00000456328.2;ENST00000450305.2;ENST00000488147.1
## 2 ENST00000456328.2;ENST00000450305.2;ENST00000488147.1
## 3 ENST00000456328.2;ENST00000450305.2;ENST00000488147.1;ENST00000619216.1
## 4 ENST00000456328.2;ENST00000488147.1;ENST00000619216.1
## 5 ENST00000488147.1;ENST00000473358.1;ENST00000469289.1;ENST00000607096.1
## 6 ENST00000417324.1;ENST00000461467.1
## flank_geneIds
## 1 ENSG00000223972.5;ENSG00000223972.5;ENSG00000227232.5
## 2 ENSG00000223972.5;ENSG00000223972.5;ENSG00000227232.5
## 3 ENSG00000223972.5;ENSG00000223972.5;ENSG00000227232.5;ENSG00000278267.1
## 4 ENSG00000223972.5;ENSG00000227232.5;ENSG00000278267.1
## 5 ENSG00000227232.5;ENSG00000243485.5;ENSG00000243485.5;ENSG00000284332.1
## 6 ENSG00000237613.2;ENSG00000237613.2
## flank_gene_distances ensembl_gene_id hgnc_symbol external_gene_name
## 1 -1284;-1425;0 ENSG00000223972 DDX11L1 DDX11L1
## 2 -960;-1101;0 ENSG00000223972 DDX11L1 DDX11L1
## 3 3937;3796;0;1511 ENSG00000278267 MIR6859-1 MIR6859-1
## 4 6899;0;-1332 ENSG00000278267 MIR6859-1 MIR6859-1
## 5 0;-78;-791;-890 ENSG00000243485 MIR1302-2HG MIR1302-2HG
## 6 -463;-471 ENSG00000237613 FAM138A FAM138A
## gene_biotype
## 1 transcribed_unprocessed_pseudogene
## 2 transcribed_unprocessed_pseudogene
## 3 miRNA
## 4 miRNA
## 5 lncRNA
## 6 lncRNA
## description
## 1 DEAD/H-box helicase 11 like 1 (pseudogene) [Source:HGNC Symbol;Acc:HGNC:37102]
## 2 DEAD/H-box helicase 11 like 1 (pseudogene) [Source:HGNC Symbol;Acc:HGNC:37102]
## 3 microRNA 6859-1 [Source:HGNC Symbol;Acc:HGNC:50039]
## 4 microRNA 6859-1 [Source:HGNC Symbol;Acc:HGNC:50039]
## 5 MIR1302-2 host gene [Source:HGNC Symbol;Acc:HGNC:52482]
## 6 family with sequence similarity 138 member A [Source:HGNC Symbol;Acc:HGNC:32334]
## chromosome_name start_position end_position
## 1 1 11869 14409
## 2 1 11869 14409
## 3 1 17369 17436
## 4 1 17369 17436
## 5 1 29554 31109
## 6 1 34554 36081
<- TwistAnnotated[, c("ensembl_gene_id_version", "ensembl_gene_id", "hgnc_symbol", "gene_biotype")] %>% dplyr::distinct()%>%filter(!is.na(hgnc_symbol))%>%filter(!hgnc_symbol%in%"")
Genes_df rownames(Genes_df) <- NULL
These are the genes with the same symbol but different ensembl id
$hgnc_symbol%in%Genes_df[duplicated(Genes_df$hgnc_symbol), "hgnc_symbol"],]
Genes_df[Genes_df## ensembl_gene_id_version ensembl_gene_id hgnc_symbol
## 3309 ENSG00000285053.1 ENSG00000285053 TBCE
## 3310 ENSG00000284770.2 ENSG00000284770 TBCE
## 20449 ENSG00000237940.3 ENSG00000237940 LINC01238
## 20451 ENSG00000261186.2 ENSG00000261186 LINC01238
## 21924 ENSG00000206195.11 ENSG00000206195 DUXAP8
## 21926 ENSG00000271672.1 ENSG00000271672 DUXAP8
## 24581 ENSG00000284862.3 ENSG00000284862 CCDC39
## 24582 ENSG00000145075.13 ENSG00000145075 CCDC39
## 30308 ENSG00000272655.2 ENSG00000272655 POLR2J4
## 30309 ENSG00000214783.9 ENSG00000214783 POLR2J4
## gene_biotype
## 3309 protein_coding
## 3310 protein_coding
## 20449 lncRNA
## 20451 lncRNA
## 21924 lncRNA
## 21926 transcribed_processed_pseudogene
## 24581 protein_coding
## 24582 lncRNA
## 30308 transcribed_unprocessed_pseudogene
## 30309 lncRNA
$entrez_gene_id <- as.vector(mapIds(org.Hs.eg.db, Genes_df$hgnc_symbol, column = "ENTREZID", keytype = "SYMBOL"))
Genes_df## 'select()' returned 1:many mapping between keys and columns
nrow(Genes_df[is.na(Genes_df$entrez_gene_id),]) #number of genes without an entrez id
## [1] 421
<- unique(Genes_df$hgnc_symbol)
GeneUniverse_symbol <- na.omit(unique(Genes_df$entrez_gene_id)) GeneUniverse_entrezid
ChIPseeker
CpGs are now annotated to their associated genes and genomic regions
using the annotatePeaks
function of the
ChIPseeker
R package, using a TxDb object. The GenomicFeatures
package uses TxDb
objects to store transcript metadata.
This class maps the 5’ and 3’ untranslated regions (UTRs), protein
coding sequences (CDSs) and exons for a set of mRNA transcripts to their
associated genome. All TxDb
objects are backed by a SQLite
database that manages genomic locations and the relationships between
pre-processed mRNA transcripts, exons, protein coding sequences, and
their related gene identifiers.
TxDb created by us
Remember that bsseq_obj_escapees (and bsseq_obj_NonEscapees) have 1-based positions
bsseq_obj_escapees## Loading required package: bsseq
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
## The following object is masked from 'package:dplyr':
##
## count
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## The following object is masked from 'package:Biobase':
##
## rowMedians
## An object of type 'BSseq' with
## 9113 methylation loci
## 14 samples
## has not been smoothed
## All assays are in-memory
granges(bsseq_obj_escapees)
## GRanges object with 9113 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr1 2155089 *
## [2] chr1 2155133 *
## [3] chr1 2812643 *
## [4] chr1 2812659 *
## [5] chr1 2812704 *
## ... ... ... ...
## [9109] chrX 142205308 *
## [9110] chrX 147279247 *
## [9111] chrX 147279251 *
## [9112] chrX 147279260 *
## [9113] chrX 154807851 *
## -------
## seqinfo: 25 sequences from an unspecified genome; no seqlengths
<- annotatePeak(
EscapeeAnno granges(bsseq_obj_escapees),
tssRegion = c(-3000, 3000),
TxDb = txdb_v35,
level = "transcript", #level = "gene"
assignGenomicAnnotation = TRUE,
genomicAnnotationPriority = c("Promoter", "5UTR", "3UTR", "Exon", "Intron",
"Downstream", "Intergenic"),
overlap = "TSS",
verbose = TRUE,
columns = c("ENTREZID", "ENSEMBL", "SYMBOL", "GENENAME")
)## >> preparing features information... 2025-01-28 04:07:26 PM
## >> identifying nearest features... 2025-01-28 04:07:30 PM
## >> calculating distance from peak to TSS... 2025-01-28 04:07:31 PM
## >> assigning genomic annotation... 2025-01-28 04:07:31 PM
## >> assigning chromosome lengths 2025-01-28 04:08:00 PM
## >> done... 2025-01-28 04:08:00 PM
::plotAnnoBar(EscapeeAnno) ChIPseeker
::plotAnnoPie(EscapeeAnno) ChIPseeker
::vennpie(EscapeeAnno) ChIPseeker
::plotDistToTSS(EscapeeAnno) ChIPseeker
From ensembl ids to gene symbols, entrez ids and metadata
<- "https://aug2020.archive.ensembl.org"
Host <- "hsapiens"
Specie <- biomaRt::listMarts(host=Host)[1,1]
BioMart <- biomaRt::listMarts(host=Host)[1,2]
Version <- biomaRt::useMart(host=Host, biomart=BioMart, version=Version,
Mart dataset=paste0(Specie,'_gene_ensembl'))
= unique(as.data.frame(EscapeeAnno)$geneId) #Genes assigned to the escapees regions by chipseeker
EscapeeGenes_ensembl
= c("ensembl_gene_id_version", "ensembl_gene_id", "hgnc_symbol", "gene_biotype")
Attributes
<- biomaRt::getBM(mart=Mart, filters='ensembl_gene_id_version', values=EscapeeGenes_ensembl, attributes=Attributes, uniqueRows=TRUE)
EscapeeGeneAnnotation <- EscapeeGeneAnnotation %>% dplyr::distinct() #%>%filter(!is.na(hgnc_symbol)) %>% filter(!hgnc_symbol%in%"") EscapeeGeneAnnotation
!EscapeeGenes_ensembl %in% EscapeeGeneAnnotation$ensembl_gene_id_version] %>% head() #ChipSeeker annotation that are not present in the biomart
EscapeeGenes_ensembl[## character(0)
= EscapeeGeneAnnotation %>% filter(!is.na(hgnc_symbol)) %>% filter(!hgnc_symbol%in%"") %>% pull(hgnc_symbol) %>% unique()
EscapeeGenes_symbol
= mapIds(org.Hs.eg.db, keys = EscapeeGenes_symbol, column = "ENTREZID", keytype = "SYMBOL") %>% unique()
EscapeeGenes_entrez ## 'select()' returned 1:1 mapping between keys and columns
<- dplyr::left_join(as.data.frame(EscapeeAnno), EscapeeGeneAnnotation, by=c('geneId' = 'ensembl_gene_id_version')) AnnotatedEscapees
<- topGOGeneVectors_meth_v2(gene_symbols = EscapeeGenes_symbol, genomic_type = "all", gene_type = "all", Universe = GeneUniverse_symbol) %>% unlist()
GeneVectors ## [1] "All genomic regions and gene types will be kept"
## [1] "There are some genes symbols which are not in the gene universe! They will not be considered..."
Genes associated to escapee CpGs that are not present in the universe:
<- EscapeeGeneAnnotation %>% filter(hgnc_symbol %in% unique(na.omit(EscapeeGenes_symbol))[!unique(na.omit(EscapeeGenes_symbol)) %in% GeneUniverse_symbol])
not_present not_present
## ensembl_gene_id_version ensembl_gene_id hgnc_symbol gene_biotype
## 1 ENSG00000278970.2 ENSG00000278970 HEIH lncRNA
These are the CpGs annotated with those genes
@anno %>% as.data.frame() %>% filter(geneId %in% not_present$ensembl_gene_id_version)
EscapeeAnno## seqnames start end width strand annotation geneChr geneStart
## 1 chr5 180830679 180830679 1 * Promoter (<=1kb) 5 180826871
## 2 chr5 180830691 180830691 1 * Promoter (<=1kb) 5 180826871
## 3 chr5 180830697 180830697 1 * Promoter (<=1kb) 5 180826871
## 4 chr5 180830795 180830795 1 * Promoter (<=1kb) 5 180826871
## 5 chr5 180830797 180830797 1 * Promoter (<=1kb) 5 180826871
## geneEnd geneLength geneStrand geneId transcriptId
## 1 180830659 3789 2 ENSG00000278970.2 ENST00000651563.1
## 2 180830659 3789 2 ENSG00000278970.2 ENST00000651563.1
## 3 180830659 3789 2 ENSG00000278970.2 ENST00000651563.1
## 4 180830659 3789 2 ENSG00000278970.2 ENST00000651563.1
## 5 180830659 3789 2 ENSG00000278970.2 ENST00000651563.1
## distanceToTSS
## 1 -20
## 2 -32
## 3 -38
## 4 -136
## 5 -138
<- ifelse(length(grep('BP', params$TopGO))!=0, TRUE, FALSE)
BpEval <- ifelse(length(grep('MF', params$TopGO))!=0, TRUE, FALSE)
MfEval <- ifelse(length(grep('CC', params$TopGO))!=0, TRUE, FALSE) CcEval
Therefore:
On the basis of the analysis settings, the enrichment for Biological Process IS performed.
# I generate a list that contains the association between each gene and the GO terms that are associated to it
<- topGO::annFUN.org(whichOnto="BP", feasibleGenes=names(GeneVectors),
BPann mapping="org.Hs.eg.db", ID="symbol") %>% inverseList()
# Wrapper function for topGO analysis
<- topGOResults_new(Genes=GeneVectors, gene2GO=BPann, ontology='BP',
ResBPAll desc=NULL, nodeSize=15, algorithm='weight01', statistic='fisher',
EnTh=params$GoEnTh, PvalTh=params$GoPvalTh, minTerms=12, geneTh=4,
saveRes=TRUE, outDir=paste0(OutputFolder), fileName='BPAll', maxAnn = 500)
## Gene vector contains levels: 0,1
##
## Building most specific GOs .....
## ( 12544 GO terms found. )
##
## Build GO DAG topology ..........
## ( 15945 GO terms and 36197 relations. )
##
## Annotating nodes ...............
## ( 18089 genes annotated to the GO terms. )
##
## -- Weight01 Algorithm --
##
## the algorithm is scoring 4417 nontrivial nodes
## parameters:
## test statistic: fisher
##
## Level 18: 2 nodes to be scored (0 eliminated genes)
##
## Level 17: 4 nodes to be scored (0 eliminated genes)
##
## Level 16: 9 nodes to be scored (45 eliminated genes)
##
## Level 15: 20 nodes to be scored (101 eliminated genes)
##
## Level 14: 43 nodes to be scored (264 eliminated genes)
##
## Level 13: 75 nodes to be scored (581 eliminated genes)
##
## Level 12: 137 nodes to be scored (1490 eliminated genes)
##
## Level 11: 289 nodes to be scored (3905 eliminated genes)
##
## Level 10: 477 nodes to be scored (6204 eliminated genes)
##
## Level 9: 628 nodes to be scored (8193 eliminated genes)
##
## Level 8: 679 nodes to be scored (10958 eliminated genes)
##
## Level 7: 726 nodes to be scored (13312 eliminated genes)
##
## Level 6: 620 nodes to be scored (15493 eliminated genes)
##
## Level 5: 383 nodes to be scored (16686 eliminated genes)
##
## Level 4: 216 nodes to be scored (17396 eliminated genes)
##
## Level 3: 90 nodes to be scored (17691 eliminated genes)
##
## Level 2: 18 nodes to be scored (17829 eliminated genes)
##
## Level 1: 1 nodes to be scored (17896 eliminated genes)
=20
nterms<- c(All='forestgreen', Down='blue', Up='red')
cols <- lapply(cols, {function(el) grDevices::colorRampPalette(c('gold', el))(nterms)}) pals
if(nrow(ResBPAll$ResSel)>0){
topGOBarplot_new(TopGORes=ResBPAll$ResSel, terms=nterms, pvalTh=0.01, plotTitle=paste0("All Escapees genes related BP terms"),
palette=pals$All, flip_x=FALSE)
}
if(nrow(ResBPAll$ResSel)>0){
bubbleplot(ResBPAll$ResSel, terms=nterms, Ont = "BP", pvalTh=0.01, plotTitle="All Escapees genes related BP terms")
}## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
if(nrow(ResBPAll$ResSel)>0){
<- TableGenesInTerm(ResBPAll$ResSel, ResBPAll$GOdata, nterms=nterms)
table_GenesInTerm_BPAll
}
%>% dplyr::select(-c(Score, GOid)) %>%
table_GenesInTerm_BPAll datatable(class = 'hover', rownames = FALSE, extension='Buttons', escape = FALSE, caption = paste0("All genes in top ", nterms, " BP terms"), options = list(pageLength=10, dom='Bfrtip', autoWidth=TRUE, buttons=list(c('csv', 'excel'))))
On the basis of the analysis settings, the enrichment for Molecular Function IS performed.
# I generate a list that contains the association between each gene and the GO terms that are associated to it
<- topGO::annFUN.org(whichOnto="MF", feasibleGenes=names(GeneVectors),
MFann mapping="org.Hs.eg.db", ID="symbol") %>% inverseList()
# Wrapper function for topGO analysis
<- topGOResults_new(Genes=GeneVectors, gene2GO=MFann, ontology='MF',
ResMFAll desc=NULL, nodeSize=15, algorithm='weight01', statistic='fisher',
EnTh=params$GoEnTh, PvalTh=params$GoPvalTh, minTerms=12, geneTh=4,
saveRes=TRUE, outDir=paste0(OutputFolder), fileName='MFAll', maxAnn = 500)
## Gene vector contains levels: 0,1
##
## Building most specific GOs .....
## ( 4502 GO terms found. )
##
## Build GO DAG topology ..........
## ( 4948 GO terms and 6418 relations. )
##
## Annotating nodes ...............
## ( 17822 genes annotated to the GO terms. )
##
## -- Weight01 Algorithm --
##
## the algorithm is scoring 766 nontrivial nodes
## parameters:
## test statistic: fisher
##
## Level 11: 7 nodes to be scored (0 eliminated genes)
##
## Level 10: 12 nodes to be scored (0 eliminated genes)
##
## Level 9: 34 nodes to be scored (205 eliminated genes)
##
## Level 8: 60 nodes to be scored (1511 eliminated genes)
##
## Level 7: 100 nodes to be scored (3866 eliminated genes)
##
## Level 6: 150 nodes to be scored (4653 eliminated genes)
##
## Level 5: 175 nodes to be scored (6769 eliminated genes)
##
## Level 4: 160 nodes to be scored (10746 eliminated genes)
##
## Level 3: 51 nodes to be scored (13969 eliminated genes)
##
## Level 2: 16 nodes to be scored (15191 eliminated genes)
##
## Level 1: 1 nodes to be scored (17628 eliminated genes)
=20
nterms<- c(All='forestgreen', Down='blue', Up='red')
cols <- lapply(cols, {function(el) grDevices::colorRampPalette(c('gold', el))(nterms)}) pals
if(nrow(ResMFAll$ResSel)>0){
topGOBarplot_new(TopGORes=ResMFAll$ResSel, terms=nterms, pvalTh=0.01, plotTitle=paste0("All Escapees genes related MF terms"),
palette=pals$All, flip_x=FALSE)
}
if(nrow(ResMFAll$ResSel)>0){
bubbleplot(ResMFAll$ResSel, terms=nterms, Ont = "MF", pvalTh=0.01, plotTitle="All Escapees genes related MF terms")
}
if(nrow(ResMFAll$ResSel)>0){
<- TableGenesInTerm(ResMFAll$ResSel, ResMFAll$GOdata, nterms=nterms)
table_GenesInTerm_MFAll
}
%>% dplyr::select(-c(Score, GOid)) %>%
table_GenesInTerm_MFAll datatable(class = 'hover', rownames = FALSE, extension='Buttons', escape = FALSE, caption = paste0("All genes in top ", nterms, " MF terms"), options = list(pageLength=10, dom='Bfrtip', autoWidth=TRUE, buttons=list(c('csv', 'excel'))))
On the basis of the analysis settings, the enrichment for Cellular Component IS performed.
# I generate a list that contains the association between each gene and the GO terms that are associated to it
<- topGO::annFUN.org(whichOnto="CC", feasibleGenes=names(GeneVectors),
CCann mapping="org.Hs.eg.db", ID="symbol") %>% inverseList()
# Wrapper function for topGO analysis
<- topGOResults_new(Genes=GeneVectors, gene2GO=CCann, ontology='CC',
ResCCAll desc=NULL, nodeSize=15, algorithm='weight01', statistic='fisher',
EnTh=params$GoEnTh, PvalTh=params$GoPvalTh, minTerms=12, geneTh=4,
saveRes=TRUE, outDir=paste0(OutputFolder), fileName='CCAll', maxAnn = 500)
## Gene vector contains levels: 0,1
##
## Building most specific GOs .....
## ( 1828 GO terms found. )
##
## Build GO DAG topology ..........
## ( 2020 GO terms and 3396 relations. )
##
## Annotating nodes ...............
## ( 18985 genes annotated to the GO terms. )
##
## -- Weight01 Algorithm --
##
## the algorithm is scoring 543 nontrivial nodes
## parameters:
## test statistic: fisher
##
## Level 11: 17 nodes to be scored (0 eliminated genes)
##
## Level 10: 53 nodes to be scored (0 eliminated genes)
##
## Level 9: 77 nodes to be scored (793 eliminated genes)
##
## Level 8: 80 nodes to be scored (3062 eliminated genes)
##
## Level 7: 82 nodes to be scored (5871 eliminated genes)
##
## Level 6: 81 nodes to be scored (10292 eliminated genes)
##
## Level 5: 67 nodes to be scored (12598 eliminated genes)
##
## Level 4: 45 nodes to be scored (15415 eliminated genes)
##
## Level 3: 38 nodes to be scored (17747 eliminated genes)
##
## Level 2: 2 nodes to be scored (18615 eliminated genes)
##
## Level 1: 1 nodes to be scored (18870 eliminated genes)
=20
nterms<- c(All='forestgreen', Down='blue', Up='red')
cols <- lapply(cols, {function(el) grDevices::colorRampPalette(c('gold', el))(nterms)}) pals
if(nrow(ResCCAll$ResSel)>0){
topGOBarplot_new(TopGORes=ResCCAll$ResSel, terms=nterms, pvalTh=0.01, plotTitle=paste0("All Escapees genes related CC terms"),
palette=pals$All, flip_x=FALSE)
}
if(nrow(ResCCAll$ResSel)>0){
bubbleplot(ResCCAll$ResSel, terms=nterms, Ont = "CC", pvalTh=0.01, plotTitle="All Escapees genes related CC terms")
}
if(nrow(ResCCAll$ResSel)>0){
<- TableGenesInTerm(ResCCAll$ResSel, ResCCAll$GOdata, nterms=nterms)
table_GenesInTerm_CCAll
}
%>% dplyr::select(-c(Score, GOid)) %>%
table_GenesInTerm_CCAll datatable(class = 'hover', rownames = FALSE, extension='Buttons', escape = FALSE, caption = paste0("All genes in top ", nterms, " CC terms"), options = list(pageLength=10, dom='Bfrtip', autoWidth=TRUE, buttons=list(c('csv', 'excel'))))
save.image(paste0(OutputFolder, "AnnotationAndFunctionalEnv.RData"))
saveRDS(AnnotatedEscapees, paste0(OutputFolder, "AnnotatedEscapees.rds"))
<- file(paste0(OutputFolder, "EscapeeGenes_ensembl.txt"), "w")
file_conn for (element in gsub(x = EscapeeGenes_ensembl, pattern = "\\..*", replacement = "")) {
cat(element, "\n", file = file_conn)
}close(file_conn)
<- file(paste0(OutputFolder, "EscapeeGenes_symbol.txt"), "w")
file_conn for (element in EscapeeGenes_symbol) {
cat(element, "\n", file = file_conn)
}close(file_conn)
<- file(paste0(OutputFolder, "GeneUniverse_symbol.txt"), "w")
file_conn for (element in GeneUniverse_symbol) {
cat(element, "\n", file = file_conn)
}close(file_conn)
<- sessionInfo()
SessionInfo <- date() Date
Date## [1] "Tue Jan 28 16:12:15 2025"
SessionInfo## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] bsseq_1.34.0 SummarizedExperiment_1.28.0
## [3] MatrixGenerics_1.10.0 matrixStats_1.0.0
## [5] GenomicRanges_1.50.2 GenomeInfoDb_1.34.9
## [7] topGO_2.50.0 SparseM_1.81
## [9] GO.db_3.16.0 graph_1.76.0
## [11] viridis_0.6.2 viridisLite_0.4.2
## [13] RColorBrewer_1.1-3 data.table_1.14.8
## [15] DT_0.28 org.Hs.eg.db_3.16.0
## [17] AnnotationDbi_1.60.2 IRanges_2.32.0
## [19] S4Vectors_0.36.2 Biobase_2.58.0
## [21] BiocGenerics_0.44.0 ChIPseeker_1.34.1
## [23] gridExtra_2.3 scales_1.2.1
## [25] dplyr_1.1.2 plotly_4.10.2
## [27] ggplot2_3.4.2
##
## loaded via a namespace (and not attached):
## [1] shadowtext_0.1.2
## [2] fastmatch_1.1-3
## [3] BiocFileCache_2.6.1
## [4] plyr_1.8.8
## [5] igraph_1.5.0
## [6] lazyeval_0.2.2
## [7] splines_4.2.1
## [8] crosstalk_1.2.0
## [9] BiocParallel_1.32.6
## [10] digest_0.6.33
## [11] yulab.utils_0.0.6
## [12] htmltools_0.5.5
## [13] GOSemSim_2.24.0
## [14] fansi_1.0.4
## [15] magrittr_2.0.3
## [16] memoise_2.0.1
## [17] BSgenome_1.66.3
## [18] limma_3.54.2
## [19] Biostrings_2.66.0
## [20] graphlayouts_0.8.4
## [21] R.utils_2.12.2
## [22] enrichplot_1.18.4
## [23] prettyunits_1.1.1
## [24] colorspace_2.1-0
## [25] blob_1.2.4
## [26] rappdirs_0.3.3
## [27] ggrepel_0.9.3
## [28] xfun_0.39
## [29] crayon_1.5.2
## [30] RCurl_1.98-1.12
## [31] jsonlite_1.8.7
## [32] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
## [33] scatterpie_0.1.8
## [34] ape_5.7-1
## [35] glue_1.6.2
## [36] polyclip_1.10-4
## [37] gtable_0.3.3
## [38] zlibbioc_1.44.0
## [39] XVector_0.38.0
## [40] DelayedArray_0.24.0
## [41] Rhdf5lib_1.20.0
## [42] HDF5Array_1.26.0
## [43] DOSE_3.24.2
## [44] DBI_1.1.3
## [45] Rcpp_1.0.11
## [46] plotrix_3.8-2
## [47] progress_1.2.2
## [48] gridGraphics_0.5-1
## [49] tidytree_0.4.2
## [50] bit_4.0.5
## [51] htmlwidgets_1.6.2
## [52] httr_1.4.6
## [53] fgsea_1.24.0
## [54] gplots_3.1.3
## [55] ellipsis_0.3.2
## [56] R.methodsS3_1.8.2
## [57] pkgconfig_2.0.3
## [58] XML_3.99-0.14
## [59] farver_2.1.1
## [60] sass_0.4.7
## [61] dbplyr_2.3.3
## [62] locfit_1.5-9.7
## [63] utf8_1.2.3
## [64] labeling_0.4.2
## [65] ggplotify_0.1.0
## [66] tidyselect_1.2.0
## [67] rlang_1.1.1
## [68] reshape2_1.4.4
## [69] munsell_0.5.0
## [70] tools_4.2.1
## [71] cachem_1.0.8
## [72] cli_3.6.1
## [73] generics_0.1.3
## [74] RSQLite_2.3.1
## [75] evaluate_0.21
## [76] stringr_1.5.0
## [77] fastmap_1.1.1
## [78] yaml_2.3.7
## [79] ggtree_3.6.2
## [80] knitr_1.43
## [81] bit64_4.0.5
## [82] tidygraph_1.2.3
## [83] caTools_1.18.2
## [84] purrr_1.0.1
## [85] KEGGREST_1.38.0
## [86] ggraph_2.1.0
## [87] sparseMatrixStats_1.10.0
## [88] nlme_3.1-162
## [89] R.oo_1.25.0
## [90] aplot_0.1.10
## [91] xml2_1.3.5
## [92] biomaRt_2.54.1
## [93] compiler_4.2.1
## [94] rstudioapi_0.15.0
## [95] filelock_1.0.2
## [96] curl_5.0.1
## [97] png_0.1-8
## [98] treeio_1.23.1
## [99] tibble_3.2.1
## [100] tweenr_2.0.2
## [101] bslib_0.5.0
## [102] stringi_1.7.12
## [103] highr_0.10
## [104] GenomicFeatures_1.50.4
## [105] lattice_0.21-8
## [106] Matrix_1.6-0
## [107] permute_0.9-7
## [108] vctrs_0.6.3
## [109] rhdf5filters_1.10.1
## [110] pillar_1.9.0
## [111] lifecycle_1.0.3
## [112] jquerylib_0.1.4
## [113] cowplot_1.1.1
## [114] bitops_1.0-7
## [115] patchwork_1.1.2
## [116] rtracklayer_1.58.0
## [117] qvalue_2.30.0
## [118] R6_2.5.1
## [119] BiocIO_1.8.0
## [120] KernSmooth_2.23-22
## [121] codetools_0.2-19
## [122] gtools_3.9.4
## [123] boot_1.3-28.1
## [124] MASS_7.3-60
## [125] rhdf5_2.42.1
## [126] rjson_0.2.21
## [127] withr_2.5.0
## [128] GenomicAlignments_1.34.1
## [129] Rsamtools_2.14.0
## [130] GenomeInfoDbData_1.2.9
## [131] parallel_4.2.1
## [132] hms_1.1.3
## [133] grid_4.2.1
## [134] ggfun_0.0.9
## [135] tidyr_1.3.0
## [136] HDO.db_0.99.1
## [137] DelayedMatrixStats_1.20.0
## [138] rmarkdown_2.23
## [139] ggforce_0.4.1
## [140] restfulr_0.0.15