Load environment
Code
import warnings
warnings.filterwarnings("ignore" )
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import pandas as pd
import numpy as np
import random
import itertools
from tqdm import tqdm
import decoupler as dc
import sys
sys.setrecursionlimit(20000 )
sys.path.append("./../../../../utilities_folder" )
from utilities import load_object, intTable, plotGenesInTerm, getAnnGenes, run_ora_catchErrors
Set R environment with rpy2:
Code
import rpy2.rinterface_lib.callbacks
import anndata2ri
import logging
from rpy2.robjects import pandas2ri
import rpy2.robjects as ro
sc.settings.verbosity = 0
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
pandas2ri.activate()
anndata2ri.activate()
% load_ext rpy2.ipython
Set up of graphical parameters for Python plots:
Code
% matplotlib inline
sc.set_figure_params(dpi = 300 , fontsize = 20 )
plt.rcParams['svg.fonttype' ] = 'none'
cmap_up = sns.light_palette("red" , as_cmap= True )
cmap_down = sns.light_palette("blue" , as_cmap= True )
cmap_all = sns.light_palette("seagreen" , as_cmap= True )
Set up of graphical parameters for R plots:
Code
default_units = 'in'
default_res = 300
default_width = 10
default_height = 9
import rpy2
old_setup_graphics = rpy2.ipython.rmagic.RMagics.setup_graphics
def new_setup_graphics(self , args):
if getattr (args, 'units' ) is not None :
if args.units != default_units:
return old_setup_graphics(self , args)
args.units = default_units
if getattr (args, 'res' ) is None :
args.res = default_res
if getattr (args, 'width' ) is None :
args.width = default_width
if getattr (args, 'height' ) is None :
args.height = default_height
return old_setup_graphics(self , args)
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics
Here the cell were we inject the parameters using Quarto renderer:
Code
# Injected Parameters
N = 3
Code
# Injected Parameters
N = 4
Import R libraries:
Code
%% R
source('./../../../../utilities_folder/GO_helper.r' )
loc <- './../../../../R_loc' # pointing to the renv environment
.libPaths(loc)
library('topGO' )
library('org.Hs.eg.db' )
library(dplyr)
library(ggplot2)
Set output folders:
Code
output_folder = './'
folder = './tables/cluster_' + str (N) + '/'
Load data
Here we load the dataframe:
Code
markers = pd.read_excel(folder + 'genes_in_cluster_' + str (N) + '.xlsx' , index_col = 0 )
markers
logFC.celltypes_leveledhEGCLC
logFC.celltypes_leveledhPGCLC
logFC.celltypes_levelediMeLC
logCPM
LR
PValue
FDR
clusters
ABHD12B
0
15.225769
12.077197
5.063782
2186.500079
0.000000e+00
0.000000e+00
4
CACNA2D3
0
14.639004
11.085077
4.337990
1599.082219
0.000000e+00
0.000000e+00
4
COL9A2
0
16.942461
11.236412
6.876343
4490.375695
0.000000e+00
0.000000e+00
4
CXCL14
0
14.589181
11.598849
4.273718
1527.804714
0.000000e+00
0.000000e+00
4
FNDC5
0
14.777348
11.076017
4.512494
1745.937504
0.000000e+00
0.000000e+00
4
...
...
...
...
...
...
...
...
...
CCL26
0
10.368708
11.340959
0.881519
407.278705
5.868150e-88
6.103588e-87
4
KIF24
0
10.836022
10.929464
0.708684
401.319737
1.146277e-86
1.163040e-85
4
SOX5
0
10.673462
10.965887
0.698247
394.991734
2.691313e-85
2.654382e-84
4
ZXDB
0
10.747729
10.835957
0.642600
391.581011
1.474754e-84
1.424092e-83
4
FDXACB1
0
10.519163
10.969153
0.647752
385.591064
2.924757e-83
2.757727e-82
4
90 rows × 8 columns
Code
allGenes_series = pd.read_csv('./tables/all_bkg_genes.csv' )
allGenes = allGenes_series['0' ].tolist()
Here we load the dictionary that associates to each GO term its genes:
Code
GO2gene = load_object('./../../../../data/GO2gene_complete.pickle' )
Markers of cluster
We filter genes for the cluster under investigation based on the p-value adjusted that we then convert in -log(p-value adjusted):
Code
markers = markers[markers.FDR < 0.01 ]
markers['-log10(FDR)' ] = - np.log10(markers.FDR)
markers = markers.replace(np.inf, markers[markers['-log10(FDR)' ] != np.inf]['-log10(FDR)' ].max ())
markers
logFC.celltypes_leveledhEGCLC
logFC.celltypes_leveledhPGCLC
logFC.celltypes_levelediMeLC
logCPM
LR
PValue
FDR
clusters
-log10(FDR)
ABHD12B
0
15.225769
12.077197
5.063782
2186.500079
0.000000e+00
0.000000e+00
4
305.626059
CACNA2D3
0
14.639004
11.085077
4.337990
1599.082219
0.000000e+00
0.000000e+00
4
305.626059
COL9A2
0
16.942461
11.236412
6.876343
4490.375695
0.000000e+00
0.000000e+00
4
305.626059
CXCL14
0
14.589181
11.598849
4.273718
1527.804714
0.000000e+00
0.000000e+00
4
305.626059
FNDC5
0
14.777348
11.076017
4.512494
1745.937504
0.000000e+00
0.000000e+00
4
305.626059
...
...
...
...
...
...
...
...
...
...
CCL26
0
10.368708
11.340959
0.881519
407.278705
5.868150e-88
6.103588e-87
4
86.214415
KIF24
0
10.836022
10.929464
0.708684
401.319737
1.146277e-86
1.163040e-85
4
84.934405
SOX5
0
10.673462
10.965887
0.698247
394.991734
2.691313e-85
2.654382e-84
4
83.576037
ZXDB
0
10.747729
10.835957
0.642600
391.581011
1.474754e-84
1.424092e-83
4
82.846462
FDXACB1
0
10.519163
10.969153
0.647752
385.591064
2.924757e-83
2.757727e-82
4
81.559449
90 rows × 9 columns
All regulated
Code
all_sign = markers.index.tolist()
allSelected = allGenes_series['0' ].isin(all_sign).astype('int' ).tolist()
topGO
All significant
Code
%% R - i allSelected - i allGenes
allGenes_v <- c(allSelected)
#print(allGenes_v)
names(allGenes_v) <- allGenes
allGenes_v <- unlist(allGenes_v)
geneNames <- c(allGenes)
ann_org_BP <- topGO::annFUN.org(whichOnto= 'BP' , feasibleGenes= names(allGenes_v),
mapping= 'org.Hs.eg' , ID= 'symbol' )
ann_org_MF <- topGO::annFUN.org(whichOnto= 'MF' , feasibleGenes= names(allGenes_v),
mapping= 'org.Hs.eg' , ID= 'symbol' )
ann_org_CC <- topGO::annFUN.org(whichOnto= 'CC' , feasibleGenes= names(allGenes_v),
mapping= 'org.Hs.eg' , ID= 'symbol' )
selection <- function(allScores){return (as .logical(allScores))}
Code
%% R
#print(lapply(ann_org_BP, count_genes))
GOdata <- new("topGOdata" ,
ontology= "BP" ,
allGenes= allGenes_v,
annot= annFUN.GO2genes,
GO2genes= ann_org_BP,
geneSel = selection,
nodeSize= 10 )
Code
%% R - o results
results <- runTest(GOdata, algorithm= "weight01" ,statistic= "fisher" )
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata
genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([10903, 76, 10, 1876], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight= results,
orderBy= "weight" , topNodes= len (scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores' : scores_py, 'GO.ID' : score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID' , right_on = 'GO.ID' )
results_table_py
GO.ID
Term
Annotated
Significant
Expected
weight
Scores
0
GO:0003281
ventricular septum development
54
4
0.38
0.00044
0.000442
1
GO:0048844
artery morphogenesis
49
4
0.34
0.00065
0.000649
2
GO:0000122
negative regulation of transcription by ...
679
13
4.73
0.00075
0.000754
3
GO:0070374
positive regulation of ERK1 and ERK2 cas...
108
5
0.75
0.00092
0.000919
4
GO:0032526
response to retinoic acid
75
4
0.52
0.00124
0.001238
...
...
...
...
...
...
...
...
5697
GO:2001256
regulation of store-operated calcium ent...
11
0
0.08
1.00000
1.000000
5698
GO:2001257
regulation of cation channel activity
104
0
0.72
1.00000
1.000000
5699
GO:2001258
negative regulation of cation channel ac...
23
0
0.16
1.00000
1.000000
5700
GO:2001259
positive regulation of cation channel ac...
43
0
0.30
1.00000
1.000000
5701
GO:2001267
regulation of cysteine-type endopeptidas...
13
0
0.09
1.00000
1.000000
5702 rows × 7 columns
Code
results_table_py = results_table_py[results_table_py['Scores' ] < 0.05 ]
results_table_py = results_table_py[results_table_py['Annotated' ] < 200 ]
results_table_py = results_table_py[results_table_py['Annotated' ] > 15 ]
Code
results_table_py['-log10(pvalue)' ] = - np.log10(results_table_py.Scores)
results_table_py['Significant/Annotated' ] = results_table_py['Significant' ] / results_table_py['Annotated' ]
Code
intTable(results_table_py, folder = folder, fileName = 'GO_BP_all.xlsx' , save = True )
Code
%% R - i folder
Res <- GenTable(GOdata, weight= results,
orderBy= "weight" , topNodes= length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID" , "Term" , "Annotated" , "Significant" , "Expected" , "Statistics" )
Res$ER <- Res$Significant / Res$Expected
image = bubbleplot(Res, Ont = 'BP' , fillCol = 'forestgreen' )
ggsave(file = paste0(folder, "TopGO_results_BP.pdf" ), plot= image, width= 12 , height= 4 )
bubbleplot(Res, Ont = 'BP' , fillCol = 'forestgreen' )
Code
%% R - i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 15 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
ggsave(file = paste0(folder, "Genes_in_Term_results_BP.pdf" ), plot= image, width= 12 , height= 4 )
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 15 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
Code
%% R - i markers - i folder
saveGenesInTerm(Res, GOdata, nterms = 20 , path = paste0(folder,'GO_BP_genesInTerm_all.xlsx' ), SE = markers)
Code
%% R
GOdata <- new("topGOdata" ,
ontology= "MF" ,
allGenes= allGenes_v,
annot= annFUN.GO2genes,
GO2genes= ann_org_MF,
geneSel = selection,
nodeSize= 10 )
Code
%% R - o results
results <- runTest(GOdata, algorithm= "weight01" ,statistic= "fisher" )
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata
genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([11203, 83, 10, 302], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight= results,
orderBy= "weight" , topNodes= len (scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores' : scores_py, 'GO.ID' : score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID' , right_on = 'GO.ID' )
results_table_py = results_table_py[results_table_py['Scores' ] < 0.05 ]
results_table_py = results_table_py[results_table_py['Annotated' ] < 200 ]
results_table_py = results_table_py[results_table_py['Annotated' ] > 15 ]
intTable(results_table_py, folder = folder, fileName = 'GO_MF_all.xlsx' , save = True )
Code
%% R
Res <- GenTable(GOdata, weight= results,
orderBy= "weight" , topNodes= length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID" , "Term" , "Annotated" , "Significant" , "Expected" , "Statistics" )
Res$ER <- Res$Significant / Res$Expected
image = bubbleplot(Res, Ont = 'MF' , fillCol = 'forestgreen' )
ggsave(file = paste0(folder, "TopGO_results_MF.pdf" ), plot= image, width= 12 , height= 4 )
bubbleplot(Res, Ont = 'MF' , fillCol = 'forestgreen' )
Code
%% R - i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 15 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
ggsave(file = paste0(folder, "Genes_in_Term_results_MF.pdf" ), plot= image, width= 12 , height= 4 )
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 15 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
Code
%% R - i markers - i folder
saveGenesInTerm(Res, GOdata, nterms = 20 , path = paste0(folder,'GO_MF_genesInTerm_all.xlsx' ), SE = markers)
Code
%% R
GOdata <- new("topGOdata" ,
ontology= "CC" ,
allGenes= allGenes_v,
annot= annFUN.GO2genes,
GO2genes= ann_org_CC,
geneSel = selection,
nodeSize= 10 )
Code
%% R - o results
results <- runTest(GOdata, algorithm= "weight01" ,statistic= "fisher" )
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata
genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([11323, 80, 10, 206], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight= results,
orderBy= "weight" , topNodes= len (scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores' : scores_py, 'GO.ID' : score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID' , right_on = 'GO.ID' )
Code
results_table_py = results_table_py[results_table_py['Scores' ] < 0.05 ]
results_table_py = results_table_py[results_table_py['Annotated' ] < 200 ]
results_table_py = results_table_py[results_table_py['Annotated' ] > 15 ]
intTable(results_table_py, folder = folder, fileName = 'GO_CC_all.xlsx' , save = True )
Code
%% R
Res <- GenTable(GOdata, weight= results,
orderBy= "weight" , topNodes= length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID" , "Term" , "Annotated" , "Significant" , "Expected" , "Statistics" )
Res$ER <- Res$Significant / Res$Expected
image = bubbleplot(Res, Ont = 'CC' , fillCol = 'forestgreen' )
ggsave(file = paste0(folder, "TopGO_results_CC.pdf" ), plot= image, width= 12 , height= 4 )
bubbleplot(Res, Ont = 'CC' , fillCol = 'forestgreen' )
Code
%% R - i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 12 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
ggsave(file = paste0(folder, "Genes:_in_Term_results_CC.pdf" ), plot= image, width= 12 , height= 4 )
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 12 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
Code
%% R - i markers - i folder
saveGenesInTerm(Res, GOdata, nterms = 20 , path = paste0(folder,'GO_CC_genesInTerm_all.xlsx' ), SE = markers)
Reactome
Code
curated = msigdb[msigdb['collection' ].isin(['reactome_pathways' ])]
curated = curated[~ curated.duplicated(['geneset' , 'genesymbol' ])]
aggregated = curated[["geneset" , "genesymbol" ]].groupby("geneset" ).count().rename(columns= {"genesymbol" : "gene_count" })
curated = curated[~ curated.geneset.isin(aggregated[aggregated.gene_count > 200 ].index.tolist())].copy()
curated = curated[~ curated.geneset.isin(aggregated[aggregated.gene_count < 15 ].index.tolist())].copy()
Code
rank = pd.DataFrame(markers['-log10(FDR)' ])
rank_copy = rank.copy()
rank_copy['pval' ] = markers.loc[rank.index].FDR
Code
-log10(FDR)
pval
ABHD12B
305.626059
0.000000e+00
CACNA2D3
305.626059
0.000000e+00
COL9A2
305.626059
0.000000e+00
CXCL14
305.626059
0.000000e+00
FNDC5
305.626059
0.000000e+00
...
...
...
CCL26
86.214415
6.103588e-87
KIF24
84.934405
1.163040e-85
SOX5
83.576037
2.654382e-84
ZXDB
82.846462
1.424092e-83
FDXACB1
81.559449
2.757727e-82
90 rows × 2 columns
Code
results_table_py = run_ora_catchErrors(mat= rank.T, net= curated, source= 'geneset' , target= 'genesymbol' , verbose= False , n_up= len (rank), n_bottom= 0 )
len (results_table_py)
No significant term was found
Code
intTable(results_table_py, folder = folder, fileName = 'Reactome_all.xlsx' , save = True )
Code
if len (results_table_py) > 0 :
results_table_py = getAnnGenes(results_table_py, GO2gene['reactome_pathways' ], rank_copy)
_, df = plotGenesInTerm(results = results_table_py, GO2gene = GO2gene['reactome_pathways' ], DEGs = rank_copy, n_top_terms = 10 , cmap = cmap_all)
Code
if len (results_table_py) > 0 :
intTable(df, folder = folder, fileName = 'genesInTerm_Reactome_all.xlsx' , save = True )
KEGG
Code
curated = msigdb[msigdb['collection' ].isin(['kegg_pathways' ])]
curated = curated[~ curated.duplicated(['geneset' , 'genesymbol' ])]
aggregated = curated[["geneset" , "genesymbol" ]].groupby("geneset" ).count().rename(columns= {"genesymbol" : "gene_count" })
curated = curated[~ curated.geneset.isin(aggregated[aggregated.gene_count > 200 ].index.tolist())].copy()
curated = curated[~ curated.geneset.isin(aggregated[aggregated.gene_count < 15 ].index.tolist())].copy()
Code
results_table_py = run_ora_catchErrors(mat= rank.T, net= curated, source= 'geneset' , target= 'genesymbol' , verbose= False , n_up= len (rank), n_bottom= 0 )
No significant term was found
Code
intTable(results_table_py, folder = folder, fileName = 'KEGG_all.xlsx' , save = True )
Code
if len (results_table_py) > 0 :
results_table_py = getAnnGenes(results_table_py, GO2gene['kegg_pathways' ], rank_copy)
_, df = plotGenesInTerm(results_table_py, GO2gene['kegg_pathways' ], rank_copy, n_top_terms = 10 , n_top_genes = 15 , cmap = cmap_all)
Code
if len (results_table_py) > 0 :
intTable(df, folder = folder, fileName = 'genesInTerm_KEGG_all.xlsx' , save = True )