for (i in 1:length(params))
print(paste('Parameter:', names(params)[i], ' - Value:', params[[i]], '- Class:', class(params[[i]])))
## [1] "Parameter: Dataset - Value: FetalChronic - Class: character"
## [1] "Parameter: topTagFile - Value: Data/AllSEcorrected.RData - Class: character"
## [1] "Parameter: FDRTr - Value: 0.05 - Class: numeric"
## [1] "Parameter: LogFCTr - Value: 0.5 - Class: numeric"
## [1] "Parameter: OutputFolder - Value: Data/FunctionalAnalysis/FetalChronicMixN/ - Class: character"
## [1] "Parameter: GOSeq - Value: Yes - Class: character"
## [1] "Parameter: TopGO - Value: Yes - Class: character"
## [1] "Parameter: Camera - Value: Yes - Class: character"
## [1] "Parameter: Gsea - Value: Yes - Class: character"
## [1] "Parameter: GseaPvalSel - Value: 0.05 - Class: numeric"
<- params$Dataset
Dataset <- params$LogFCTr
LogFCTh <- params$FDRTr
FDRTh <- ifelse(is.null(params$OutputFolder), getwd(), params$OutputFolder)
OutputFolder
if (dir.exists(OutputFolder) == FALSE) {
dir.create(OutputFolder, recursive=FALSE)
}
if (dir.exists(paste0(OutputFolder, 'TopGO')) == FALSE) {
dir.create(paste0(OutputFolder, 'TopGO'), recursive=FALSE)
}
if (dir.exists(paste0(OutputFolder, 'Gsea')) == FALSE) {
dir.create(paste0(OutputFolder, 'Gsea'), recursive=FALSE)
}
library(stringr)
library(DT)
library(ggplot2)
library(ggrepel)
library(AnnotationDbi)
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
library(org.Hs.eg.db)
##
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:Biobase':
##
## combine
## The following object is masked from 'package:BiocGenerics':
##
## combine
library(RColorBrewer)
library(topGO)
## Loading required package: graph
##
## Attaching package: 'graph'
## The following object is masked from 'package:stringr':
##
## boundary
## Loading required package: GO.db
##
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
##
## groupGOTerms: GOBPTerm, GOMFTerm, GOCCTerm environments built.
##
## Attaching package: 'topGO'
## The following object is masked from 'package:IRanges':
##
## members
library(fgsea)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:S4Vectors':
##
## expand
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:graph':
##
## union
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:AnnotationDbi':
##
## select
## The following objects are masked from 'package:IRanges':
##
## collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
##
## first, intersect, rename, setdiff, setequal, union
## The following object is masked from 'package:Biobase':
##
## combine
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
options(stringsAsFactors=FALSE)
source('Functions/EDC_Functions.R')
source('Functions/CriFormatted.R')
source('Functions/goseq.R')
<- gmtPathways('Data/MolecularSignatureDatabase/H/h.all.v7.0.symbols.gmt')
H1 <- gmtPathways('Data/MolecularSignatureDatabase/C2/KEGG/c2.cp.kegg.v7.0.symbols.gmt') Kegg
I upload the result if differential expression analysis after SVA correction. I select DEGs as having an FDR < 0.05 (anova-like FDR) and a log2FC > 0.5 either in Mix 1X or Mix 1000X. I also calculate the mean fold-change in the two treatments.
load(params$topTagFile, verbose=TRUE)
## Loading objects:
## SEs
## DEAs
<- list()
Res $tableRaw <- DEAs$chronic.fetal
Res$table <- DEAs$chronic.fetal
Res$table$genes <- row.names(Res$table)
Res$table$logFC <- (Res$table$logFC.EXPO1X + Res$table$logFC.EXPO1000X)/2
Res$table$GeneName <- row.names(Res$table) # I generated this column to make the dataframe compatible with some functions
Res$degs <- dplyr::filter(Res$table, FDR < FDRTh & (abs(logFC.EXPO1X) > LogFCTh | abs(logFC.EXPO1000X) > LogFCTh) & logCPM >0)
Resdim(Res$degs)
## [1] 132 9
14017 genes have been testes for differential expression.
Imposing a threshold of 1.4142136 on the FC and 0.05 on the FDR (as specified in parameters), 132 genes are selected.
Gene ontology enrichment analysis is performed using TopGO with Fisher statistics and weight01 algorithm. The division between up-regulated and down-regulated genes is done on the mean FC between the two treatments.
I generate vectors for the gene universe, all modulated genes, up-regulated genes and down-regulated genes in the format required by TopGO.
<- Res$table %>% dplyr::pull(GeneName)
UniverseGenes <- Res$degs %>% dplyr::pull(GeneName)
DEG <- Res$degs %>% dplyr::filter(logFC > 0) %>% dplyr::pull(GeneName)
DEGUp <- Res$degs %>% dplyr::filter(logFC < 0) %>% dplyr::pull(GeneName)
DEGDown
# generation of named vectors in the format required by TopGO
<- list()
GeneVectors $DEGenes <- factor(as.integer(UniverseGenes%in%DEG))
GeneVectorsnames(GeneVectors$DEGenes) <- UniverseGenes
$DEGenesDown <- factor(as.integer(UniverseGenes%in%DEGDown))
GeneVectorsnames(GeneVectors$DEGenesDown) <- UniverseGenes
$DEGenesUp <- factor(as.integer(UniverseGenes%in%DEGUp))
GeneVectorsnames(GeneVectors$DEGenesUp) <- UniverseGenes
Therefore:
<- ifelse(params$TopGO=='Yes', TRUE, FALSE)
BpEval # the analysis is not done if the number of DEGs (all, down-reg or up-reg) is lower than 2.
<- ifelse(table(GeneVectors$DEGenes)['1'] < 2 | is.na(table(GeneVectors$DEGenes)['1']) | table(GeneVectors$DEGenesDown)['1'] < 2 | is.na(table(GeneVectors$DEGenesDown)['1']) | table(GeneVectors$DEGenesUp)['1'] < 2 | is.na(table(GeneVectors$DEGenesUp)['1']), FALSE, BpEval) BpEval
On the basis of the analysis settings or the number of differentially expressed genes, TopGO analysis IS performed.
# I generate a list that contains the association between each gene and the GO terms that are associated to it
<- topGO::annFUN.org(whichOnto="BP", feasibleGenes=names(GeneVectors$DEGenes), mapping="org.Hs.eg.db", ID="symbol") %>% inverseList()
BPann
# Wrapper function for topGO analysis (see helper file)
<- topGOResults(Genes=GeneVectors$DEGenes, gene2GO=BPann, ontology='BP', description=NULL, nodeSize=10, algorithm='weight01', statistic='fisher', EnTh=2, PvalTh=0.01, minTerms=10)
ResBPAll ##
## Building most specific GOs .....
## ( 10775 GO terms found. )
##
## Build GO DAG topology ..........
## ( 14648 GO terms and 33956 relations. )
##
## Annotating nodes ...............
## ( 11471 genes annotated to the GO terms. )
##
## -- Weight01 Algorithm --
##
## the algorithm is scoring 2566 nontrivial nodes
## parameters:
## test statistic: fisher
##
## Level 17: 1 nodes to be scored (0 eliminated genes)
##
## Level 16: 4 nodes to be scored (0 eliminated genes)
##
## Level 15: 11 nodes to be scored (20 eliminated genes)
##
## Level 14: 21 nodes to be scored (73 eliminated genes)
##
## Level 13: 46 nodes to be scored (232 eliminated genes)
##
## Level 12: 83 nodes to be scored (814 eliminated genes)
##
## Level 11: 138 nodes to be scored (2684 eliminated genes)
##
## Level 10: 217 nodes to be scored (4281 eliminated genes)
##
## Level 9: 309 nodes to be scored (5446 eliminated genes)
##
## Level 8: 360 nodes to be scored (6952 eliminated genes)
##
## Level 7: 433 nodes to be scored (8285 eliminated genes)
##
## Level 6: 407 nodes to be scored (9544 eliminated genes)
##
## Level 5: 277 nodes to be scored (10504 eliminated genes)
##
## Level 4: 167 nodes to be scored (11034 eliminated genes)
##
## Level 3: 73 nodes to be scored (11207 eliminated genes)
##
## Level 2: 18 nodes to be scored (11289 eliminated genes)
##
## Level 1: 1 nodes to be scored (11349 eliminated genes)
write.table(ResBPAll$ResAll, file=paste0(OutputFolder, 'TopGO/BPAllResults.txt'), sep='\t', row.names=FALSE)
# Wrapper function for topGO analysis (see helper file)
<- topGOResults(Genes=GeneVectors$DEGenesDown, gene2GO=BPann, ontology='BP', description=NULL, nodeSize=10, algorithm='weight01', statistic='fisher', EnTh=2, PvalTh=0.01, minTerms=10)
ResBPDown ##
## Building most specific GOs .....
## ( 10775 GO terms found. )
##
## Build GO DAG topology ..........
## ( 14648 GO terms and 33956 relations. )
##
## Annotating nodes ...............
## ( 11471 genes annotated to the GO terms. )
##
## -- Weight01 Algorithm --
##
## the algorithm is scoring 2193 nontrivial nodes
## parameters:
## test statistic: fisher
##
## Level 17: 1 nodes to be scored (0 eliminated genes)
##
## Level 16: 4 nodes to be scored (0 eliminated genes)
##
## Level 15: 11 nodes to be scored (20 eliminated genes)
##
## Level 14: 19 nodes to be scored (73 eliminated genes)
##
## Level 13: 36 nodes to be scored (232 eliminated genes)
##
## Level 12: 63 nodes to be scored (789 eliminated genes)
##
## Level 11: 108 nodes to be scored (2475 eliminated genes)
##
## Level 10: 174 nodes to be scored (4067 eliminated genes)
##
## Level 9: 257 nodes to be scored (5119 eliminated genes)
##
## Level 8: 297 nodes to be scored (6450 eliminated genes)
##
## Level 7: 372 nodes to be scored (7892 eliminated genes)
##
## Level 6: 360 nodes to be scored (9423 eliminated genes)
##
## Level 5: 253 nodes to be scored (10392 eliminated genes)
##
## Level 4: 149 nodes to be scored (11013 eliminated genes)
##
## Level 3: 70 nodes to be scored (11196 eliminated genes)
##
## Level 2: 18 nodes to be scored (11286 eliminated genes)
##
## Level 1: 1 nodes to be scored (11341 eliminated genes)
# Selection on enrichment of at least 2 is implemented (also to avoid depleted categories). Then categories are ranked by PVal and all the ones with Pval < th are selected. If the number is < minTerms, othter terms are included to reach the minimum number.
# Results are shown in an interactive table
<- 'http://amigo.geneontology.org/amigo/term/'
searchURL
$ResSel %>%
ResBPDown::mutate(GOLink=paste0('<a href="', searchURL, GO.ID, '">', GO.ID, '</a>')) %>%
dplyr::select(8, 2, 4, 5, 7, 6) %>% # selection of columns to be shown
dplyr::filter(as.numeric(Statistics) < 0.01) %>% # only significant terms are shown in the table
dplyr::datatable(class = 'hover', rownames = FALSE, caption='Down-regulated genes: Biological Process Gene Ontology Enrichment', filter='top', options = list(pageLength = 5, autoWidth = TRUE), escape=FALSE) DT
# Wrapper function for topGO analysis (see helper file)
<- topGOResults(Genes=GeneVectors$DEGenesUp, gene2GO=BPann, ontology='BP', description=NULL, nodeSize=10, algorithm='weight01', statistic='fisher', EnTh=2, PvalTh=0.01, minTerms=10)
ResBPUp ##
## Building most specific GOs .....
## ( 10775 GO terms found. )
##
## Build GO DAG topology ..........
## ( 14648 GO terms and 33956 relations. )
##
## Annotating nodes ...............
## ( 11471 genes annotated to the GO terms. )
##
## -- Weight01 Algorithm --
##
## the algorithm is scoring 1210 nontrivial nodes
## parameters:
## test statistic: fisher
##
## Level 14: 4 nodes to be scored (0 eliminated genes)
##
## Level 13: 16 nodes to be scored (0 eliminated genes)
##
## Level 12: 33 nodes to be scored (348 eliminated genes)
##
## Level 11: 50 nodes to be scored (2027 eliminated genes)
##
## Level 10: 76 nodes to be scored (3584 eliminated genes)
##
## Level 9: 114 nodes to be scored (4599 eliminated genes)
##
## Level 8: 151 nodes to be scored (5692 eliminated genes)
##
## Level 7: 187 nodes to be scored (7080 eliminated genes)
##
## Level 6: 213 nodes to be scored (8513 eliminated genes)
##
## Level 5: 188 nodes to be scored (10041 eliminated genes)
##
## Level 4: 112 nodes to be scored (10837 eliminated genes)
##
## Level 3: 51 nodes to be scored (11171 eliminated genes)
##
## Level 2: 14 nodes to be scored (11269 eliminated genes)
##
## Level 1: 1 nodes to be scored (11346 eliminated genes)
$ResSel %>%
ResBPUp::mutate(GOLink=paste0('<a href="', searchURL, GO.ID, '">', GO.ID, '</a>')) %>%
dplyr::select(8, 2, 4, 5, 7, 6) %>% # selection of columns to be shown
dplyr::filter(as.numeric(Statistics) < 0.01) %>% # only significant terms are shown in the table
dplyr::datatable(class = 'hover', rownames = FALSE, caption='Up-regulated genes: Biological Process Gene Ontology Enrichment', filter='top', options = list(pageLength = 5, autoWidth = TRUE), escape=FALSE) DT
# the if clauses avoids an error in case there is one empty category
if(dim(ResBPUp$ResSel)[1] > 0 & dim(ResBPDown$ResSel)[1] > 0 & dim(ResBPAll$ResSel)[1] > 0){
<- topGOBarplotAll(TopGOResAll=ResBPAll$ResSel, TopGOResDown=ResBPDown$ResSel, TopGOResUp=ResBPUp$ResSel, terms=8, pvalTh=0.01, title=NULL)
TopGOBar
TopGOBar
ggsave(filename=paste0(OutputFolder, '/TopGO/Barplot.pdf'), TopGOBar,
width=13, height=8)
}
<- ifelse(params$GOSeq=='Yes', TRUE, FALSE) GOSeqEval
Gene ontology enrichment analysis is performed using GOSeq approach. On the basis of the analysis settings, GOSeq analysis IS performed.
<- goseq.enrichment(names(GeneVectors$DEGenes), names(GeneVectors$DEGenes)[GeneVectors$DEGenes==1], "GO:BP", cutoff=0.05)
GOSeq ## Loading hg19 length data...
#colnames(GOSeq) <- c("GO.ID", "Term", "Annotated", "Significant", "enrichment", "PValue", "FDR", "genes")
if(dim(GOSeq)[1] > 0) {
<- goTreemap(GOSeq, removeParents=FALSE)
gotr # to avoid errors if no results are found
} ## Loading required package: treemap
## Preparing ancestors
## Warning in if (class(try(col2rgb(bg.labels), silent = TRUE)) == "try-error")
## stop("Invalid bg.labels"): the condition has length > 1 and only the first
## element will be used
<- ifelse(params$Camera=='Yes', TRUE, FALSE) CameraEval
On the basis of the analysis settings, Camera analysis IS performed.
<- function(x){
printTable $genes <- NULL
xdatatable(x, filter="top", extensions = 'Buttons',
options = list( dom = 'Bfrtip', buttons = 'csv')
)
}
<- Res$table %>% dplyr::mutate(StatSign=F*sign(rowMeans(Res$tableRaw[,grep('logFC\\.',colnames(Res$tableRaw))])))
ResCamera row.names(ResCamera) <- ResCamera$genes
printTable(cameraWrapper(dea=ResCamera, gsets=NULL, addDEgenes=TRUE, dea.thres=0.05, minG=5, reportMax=500))
##
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
<- ifelse(params$Gsea=='Yes', TRUE, FALSE) GseaEval
On the basis of the analysis settings, GSEA analysis IS performed.
Rank for stat after selecting only genes with PValues < 0.05. StatSign has been calculated as for camera analysis: F value multiplied for the sign of the mean FC. N.B. fgsea sort the vector in decreasing order.
<- rankGeneVector(ResCamera, Order='StatSign', PValSel=params$GseaPvalSel)
FStat length(FStat)
## [1] 3684
head(FStat)
## CD74 RPS15A MYEOV2 RPL27 ZBED1 SNRPG
## 36.47454 34.65894 32.36238 32.34142 31.39846 30.99296
tail(FStat)
## TRPM8 LRRTM3 FGFBP2 RNF145 NRP1 BCAT1
## -41.67587 -47.54655 -50.51338 -53.23050 -69.70240 -86.09176
print(paste('Fgsea analysis is performed on', length(FStat), 'genes having PValue <', params$GseaPvalSel, 'and ranked according to signed F statistics'))
## [1] "Fgsea analysis is performed on 3684 genes having PValue < 0.05 and ranked according to signed F statistics"
fgsea analysis: H1 pathways
set.seed(42)
<- fgsea(FStat, pathways=H1, nperm=75000, maxSize=500, minSize=10, nproc=1)
GseaSelH1 ## Warning in fgsea(FStat, pathways = H1, nperm = 75000, maxSize = 500,
## minSize = 10, : You are trying to run fgseaSimple. It is recommended to use
## fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument
## in the fgsea function call.
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.03% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
::fwrite(GseaSelH1, file=paste0(OutputFolder, '/Gsea/GseaSelH1.txt'), sep="\t", sep2=c("", " ", ""))
data.table
<- dplyr::filter(GseaSelH1, padj < 0.1 & abs(NES) > 2) %>%
GseaResH1 ::mutate(LEGenes=stringr::str_count(leadingEdge, ",")+1) %>%
dplyr::mutate(GeneSet='H1') %>%
dplyr::select(1, 2, 3, 5, 9, 10)
dplyr## Warning in stri_count_regex(string, pattern, opts_regex = opts(pattern)):
## argument is not an atomic vector; coercing
1]
GseaResH1[,## pathway
## 1: HALLMARK_MTORC1_SIGNALING
## 2: HALLMARK_KRAS_SIGNALING_UP
if(dim(GseaResH1)[1] > 0){
plotGseaTable(H1[GseaResH1 %>% dplyr::arrange(NES) %>% dplyr::pull(pathway)],
gseaParam=0.5)
FStat, GseaSelH1, }
fgsea analysis: Kegg pathways
set.seed(42)
<- fgsea(FStat, pathways=Kegg, nperm=75000, maxSize=500, minSize=10, nproc=1)
GseaSelKegg ## Warning in fgsea(FStat, pathways = Kegg, nperm = 75000, maxSize = 500,
## minSize = 10, : You are trying to run fgseaSimple. It is recommended to use
## fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument
## in the fgsea function call.
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.03% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
::fwrite(GseaSelKegg, file=paste0(OutputFolder, '/Gsea/GseaSelKegg.txt'), sep="\t", sep2=c("", " ", ""))
data.table
<- dplyr::filter(GseaSelKegg, padj < 0.1 & abs(NES) > 2) %>%
GseaResKegg ::mutate(LEGenes=stringr::str_count(leadingEdge, ",")+1) %>%
dplyr::mutate(GeneSet='Kegg') %>%
dplyr::select(1, 2, 3, 5, 9, 10)
dplyr## Warning in stri_count_regex(string, pattern, opts_regex = opts(pattern)):
## argument is not an atomic vector; coercing
1]
GseaResKegg[,## pathway
## 1: KEGG_OXIDATIVE_PHOSPHORYLATION
## 2: KEGG_RIBOSOME
## 3: KEGG_NEUROACTIVE_LIGAND_RECEPTOR_INTERACTION
## 4: KEGG_ALZHEIMERS_DISEASE
## 5: KEGG_PARKINSONS_DISEASE
## 6: KEGG_HUNTINGTONS_DISEASE
## 7: KEGG_SYSTEMIC_LUPUS_ERYTHEMATOSUS
if(dim(GseaResKegg)[1] > 0){
plotGseaTable(Kegg[GseaResKegg %>% dplyr::arrange(NES) %>% dplyr::pull(pathway)],
gseaParam=0.5)
FStat, GseaSelKegg, }
<- rbind(GseaResH1, GseaResKegg)
GseaRes <- c(H1='#e76bf3', Kegg="#39b600")
Cols
<- ggplot(GseaRes, aes(x=NES, y=-log10(padj), fill=GeneSet, size=LEGenes)) +
BubbleGsea geom_point(alpha=0.65, shape=21, color='black') +
#geom_text_repel(label=GseaRes$pathway, cex=1.8,
#segment.color="grey50", segment.size=0.3,
#force=20, nudge_y=-0.15) +
geom_vline(xintercept = 0, col='blue') +
xlab('NES') + ylab('-log10 Adjusted PVal') +
scale_fill_manual(values=Cols) +
scale_size(range=c(3,15), name='Leading Edge \n Genes') +
theme_bw()
BubbleGsea
<- ggplot(GseaRes, aes(x=NES, y=-log10(padj), fill=GeneSet, size=LEGenes)) +
BubbleGsea geom_point(alpha=0.65, shape=21, color='black') +
geom_text_repel(label=GseaRes$pathway, cex=3.5,
segment.color="grey50", segment.size=0.3,
force=20, nudge_y=-0.3,nudge_x=-0.4) +
geom_vline(xintercept = 0, col='blue') +
xlab('NES') + ylab('-log10 Adjusted PVal') +
scale_fill_manual(values=Cols) +
scale_size(range=c(3,15), name='Leading Edge \n Genes') +
theme_bw()
BubbleGsea
ggsave(filename=paste0(OutputFolder, '/Gsea/Bubble.pdf'), BubbleGsea,
width=6.5, height=5.5)