Load environment
Code
import warnings
warnings.filterwarnings("ignore" )
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import pandas as pd
import numpy as np
import random
import itertools
from tqdm import tqdm
import decoupler as dc
import sys
sys.setrecursionlimit(20000 )
sys.path.append("./../../../../utilities_folder" )
from utilities import load_object, intTable, plotGenesInTerm, getAnnGenes, run_ora_catchErrors
Set R environment with rpy2:
Code
import rpy2.rinterface_lib.callbacks
import anndata2ri
import logging
from rpy2.robjects import pandas2ri
import rpy2.robjects as ro
sc.settings.verbosity = 0
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
pandas2ri.activate()
anndata2ri.activate()
% load_ext rpy2.ipython
Set up of graphical parameters for Python plots:
Code
% matplotlib inline
sc.set_figure_params(dpi = 300 , fontsize = 20 )
plt.rcParams['svg.fonttype' ] = 'none'
cmap_up = sns.light_palette("red" , as_cmap= True )
cmap_down = sns.light_palette("blue" , as_cmap= True )
cmap_all = sns.light_palette("seagreen" , as_cmap= True )
Set up of graphical parameters for R plots:
Code
default_units = 'in'
default_res = 300
default_width = 10
default_height = 9
import rpy2
old_setup_graphics = rpy2.ipython.rmagic.RMagics.setup_graphics
def new_setup_graphics(self , args):
if getattr (args, 'units' ) is not None :
if args.units != default_units:
return old_setup_graphics(self , args)
args.units = default_units
if getattr (args, 'res' ) is None :
args.res = default_res
if getattr (args, 'width' ) is None :
args.width = default_width
if getattr (args, 'height' ) is None :
args.height = default_height
return old_setup_graphics(self , args)
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics
Here the cell were we inject the parameters using Quarto renderer:
Code
# Injected Parameters
N = 3
Code
# Injected Parameters
N = 5
Import R libraries:
Code
%% R
source('./../../../../utilities_folder/GO_helper.r' )
loc <- './../../../../R_loc' # pointing to the renv environment
.libPaths(loc)
library('topGO' )
library('org.Hs.eg.db' )
library(dplyr)
library(ggplot2)
Set output folders:
Code
output_folder = './'
folder = './tables/cluster_' + str (N) + '/'
Load data
Here we load the dataframe:
Code
markers = pd.read_excel(folder + 'genes_in_cluster_' + str (N) + '.xlsx' , index_col = 0 )
markers
logFC.celltypes_leveledhEGCLC
logFC.celltypes_leveledhPGCLC
logFC.celltypes_levelediMeLC
logCPM
LR
PValue
FDR
clusters
SFRP1
0.304503
-6.025349
-3.318758
8.701454
1216.124875
2.326079e-263
1.525679e-261
5
GPR176
0.055436
-5.634053
-1.914052
7.020988
997.010472
8.010355e-216
3.876947e-214
5
FZD7
1.207805
-4.431094
0.593172
8.144837
954.999700
1.039551e-206
4.726412e-205
5
CLU
-0.392922
-5.454956
0.184910
7.602443
909.108872
9.359262e-197
3.969584e-195
5
PALM2-AKAP2
0.618921
-4.712517
-0.357050
7.166261
886.211028
8.667986e-192
3.503092e-190
5
...
...
...
...
...
...
...
...
...
EIF4EBP3
-0.541446
-0.924839
-1.057000
3.144133
13.380507
3.881993e-03
5.629524e-03
5
MSRA
-0.080158
-0.262529
-1.120385
4.111732
13.374649
3.892634e-03
5.640865e-03
5
ZSCAN31
-0.219423
-1.015926
-0.543040
3.236042
12.788156
5.117865e-03
7.312192e-03
5
ING4
-0.520919
-0.495369
-1.044443
4.163819
12.772198
5.156056e-03
7.362377e-03
5
NHLRC3
-0.667431
-1.083339
-0.381618
2.828116
12.732249
5.252907e-03
7.493540e-03
5
1384 rows × 8 columns
Code
allGenes_series = pd.read_csv('./tables/all_bkg_genes.csv' )
allGenes = allGenes_series['0' ].tolist()
Here we load the dictionary that associates to each GO term its genes:
Code
GO2gene = load_object('./../../../../data/GO2gene_complete.pickle' )
Markers of cluster
We filter genes for the cluster under investigation based on the p-value adjusted that we then convert in -log(p-value adjusted):
Code
markers = markers[markers.FDR < 0.01 ]
markers['-log10(FDR)' ] = - np.log10(markers.FDR)
markers = markers.replace(np.inf, markers[markers['-log10(FDR)' ] != np.inf]['-log10(FDR)' ].max ())
markers
logFC.celltypes_leveledhEGCLC
logFC.celltypes_leveledhPGCLC
logFC.celltypes_levelediMeLC
logCPM
LR
PValue
FDR
clusters
-log10(FDR)
SFRP1
0.304503
-6.025349
-3.318758
8.701454
1216.124875
2.326079e-263
1.525679e-261
5
260.816537
GPR176
0.055436
-5.634053
-1.914052
7.020988
997.010472
8.010355e-216
3.876947e-214
5
213.411510
FZD7
1.207805
-4.431094
0.593172
8.144837
954.999700
1.039551e-206
4.726412e-205
5
204.325468
CLU
-0.392922
-5.454956
0.184910
7.602443
909.108872
9.359262e-197
3.969584e-195
5
194.401255
PALM2-AKAP2
0.618921
-4.712517
-0.357050
7.166261
886.211028
8.667986e-192
3.503092e-190
5
189.455548
...
...
...
...
...
...
...
...
...
...
EIF4EBP3
-0.541446
-0.924839
-1.057000
3.144133
13.380507
3.881993e-03
5.629524e-03
5
2.249528
MSRA
-0.080158
-0.262529
-1.120385
4.111732
13.374649
3.892634e-03
5.640865e-03
5
2.248654
ZSCAN31
-0.219423
-1.015926
-0.543040
3.236042
12.788156
5.117865e-03
7.312192e-03
5
2.135952
ING4
-0.520919
-0.495369
-1.044443
4.163819
12.772198
5.156056e-03
7.362377e-03
5
2.132982
NHLRC3
-0.667431
-1.083339
-0.381618
2.828116
12.732249
5.252907e-03
7.493540e-03
5
2.125313
1384 rows × 9 columns
All regulated
Code
all_sign = markers.index.tolist()
allSelected = allGenes_series['0' ].isin(all_sign).astype('int' ).tolist()
topGO
All significant
Code
%% R - i allSelected - i allGenes
allGenes_v <- c(allSelected)
#print(allGenes_v)
names(allGenes_v) <- allGenes
allGenes_v <- unlist(allGenes_v)
geneNames <- c(allGenes)
ann_org_BP <- topGO::annFUN.org(whichOnto= 'BP' , feasibleGenes= names(allGenes_v),
mapping= 'org.Hs.eg' , ID= 'symbol' )
ann_org_MF <- topGO::annFUN.org(whichOnto= 'MF' , feasibleGenes= names(allGenes_v),
mapping= 'org.Hs.eg' , ID= 'symbol' )
ann_org_CC <- topGO::annFUN.org(whichOnto= 'CC' , feasibleGenes= names(allGenes_v),
mapping= 'org.Hs.eg' , ID= 'symbol' )
selection <- function(allScores){return (as .logical(allScores))}
Code
%% R
#print(lapply(ann_org_BP, count_genes))
GOdata <- new("topGOdata" ,
ontology= "BP" ,
allGenes= allGenes_v,
annot= annFUN.GO2genes,
GO2genes= ann_org_BP,
geneSel = selection,
nodeSize= 10 )
Code
%% R - o results
results <- runTest(GOdata, algorithm= "weight01" ,statistic= "fisher" )
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata
genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([10903, 1259, 10, 5298], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight= results,
orderBy= "weight" , topNodes= len (scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores' : scores_py, 'GO.ID' : score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID' , right_on = 'GO.ID' )
results_table_py
GO.ID
Term
Annotated
Significant
Expected
weight
Scores
0
GO:0009113
purine nucleobase biosynthetic process
10
7
1.15
2.4e-05
0.000024
1
GO:0001657
ureteric bud development
65
19
7.51
6.7e-05
0.000067
2
GO:0007612
learning
100
20
11.55
9.6e-05
0.000096
3
GO:0009083
branched-chain amino acid catabolic proc...
20
9
2.31
0.00018
0.000182
4
GO:0051056
regulation of small GTPase mediated sign...
243
37
28.06
0.00038
0.000377
...
...
...
...
...
...
...
...
5697
GO:2000402
negative regulation of lymphocyte migrat...
10
0
1.15
1.00000
1.000000
5698
GO:2000434
regulation of protein neddylation
17
0
1.96
1.00000
1.000000
5699
GO:2000679
positive regulation of transcription reg...
15
0
1.73
1.00000
1.000000
5700
GO:2000819
regulation of nucleotide-excision repair
26
0
3.00
1.00000
1.000000
5701
GO:2001169
regulation of ATP biosynthetic process
15
0
1.73
1.00000
1.000000
5702 rows × 7 columns
Code
results_table_py = results_table_py[results_table_py['Scores' ] < 0.05 ]
results_table_py = results_table_py[results_table_py['Annotated' ] < 200 ]
results_table_py = results_table_py[results_table_py['Annotated' ] > 15 ]
Code
results_table_py['-log10(pvalue)' ] = - np.log10(results_table_py.Scores)
results_table_py['Significant/Annotated' ] = results_table_py['Significant' ] / results_table_py['Annotated' ]
Code
intTable(results_table_py, folder = folder, fileName = 'GO_BP_all.xlsx' , save = True )
Code
%% R - i folder
Res <- GenTable(GOdata, weight= results,
orderBy= "weight" , topNodes= length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID" , "Term" , "Annotated" , "Significant" , "Expected" , "Statistics" )
Res$ER <- Res$Significant / Res$Expected
image = bubbleplot(Res, Ont = 'BP' , fillCol = 'forestgreen' )
ggsave(file = paste0(folder, "TopGO_results_BP.pdf" ), plot= image, width= 12 , height= 4 )
bubbleplot(Res, Ont = 'BP' , fillCol = 'forestgreen' )
Code
%% R - i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 15 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
ggsave(file = paste0(folder, "Genes_in_Term_results_BP.pdf" ), plot= image, width= 12 , height= 4 )
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 15 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
Code
%% R - i markers - i folder
saveGenesInTerm(Res, GOdata, nterms = 20 , path = paste0(folder,'GO_BP_genesInTerm_all.xlsx' ), SE = markers)
Code
%% R
GOdata <- new("topGOdata" ,
ontology= "MF" ,
allGenes= allGenes_v,
annot= annFUN.GO2genes,
GO2genes= ann_org_MF,
geneSel = selection,
nodeSize= 10 )
Code
%% R - o results
results <- runTest(GOdata, algorithm= "weight01" ,statistic= "fisher" )
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata
genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([11203, 1291, 10, 914], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight= results,
orderBy= "weight" , topNodes= len (scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores' : scores_py, 'GO.ID' : score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID' , right_on = 'GO.ID' )
results_table_py = results_table_py[results_table_py['Scores' ] < 0.05 ]
results_table_py = results_table_py[results_table_py['Annotated' ] < 200 ]
results_table_py = results_table_py[results_table_py['Annotated' ] > 15 ]
intTable(results_table_py, folder = folder, fileName = 'GO_MF_all.xlsx' , save = True )
Code
%% R
Res <- GenTable(GOdata, weight= results,
orderBy= "weight" , topNodes= length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID" , "Term" , "Annotated" , "Significant" , "Expected" , "Statistics" )
Res$ER <- Res$Significant / Res$Expected
image = bubbleplot(Res, Ont = 'MF' , fillCol = 'forestgreen' )
ggsave(file = paste0(folder, "TopGO_results_MF.pdf" ), plot= image, width= 12 , height= 4 )
bubbleplot(Res, Ont = 'MF' , fillCol = 'forestgreen' )
Code
%% R - i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 15 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
ggsave(file = paste0(folder, "Genes_in_Term_results_MF.pdf" ), plot= image, width= 12 , height= 4 )
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 15 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
Code
%% R - i markers - i folder
saveGenesInTerm(Res, GOdata, nterms = 20 , path = paste0(folder,'GO_MF_genesInTerm_all.xlsx' ), SE = markers)
Code
%% R
GOdata <- new("topGOdata" ,
ontology= "CC" ,
allGenes= allGenes_v,
annot= annFUN.GO2genes,
GO2genes= ann_org_CC,
geneSel = selection,
nodeSize= 10 )
Code
%% R - o results
results <- runTest(GOdata, algorithm= "weight01" ,statistic= "fisher" )
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata
genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([11323, 1309, 10, 663], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight= results,
orderBy= "weight" , topNodes= len (scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores' : scores_py, 'GO.ID' : score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID' , right_on = 'GO.ID' )
Code
results_table_py = results_table_py[results_table_py['Scores' ] < 0.05 ]
results_table_py = results_table_py[results_table_py['Annotated' ] < 200 ]
results_table_py = results_table_py[results_table_py['Annotated' ] > 15 ]
intTable(results_table_py, folder = folder, fileName = 'GO_CC_all.xlsx' , save = True )
Code
%% R
Res <- GenTable(GOdata, weight= results,
orderBy= "weight" , topNodes= length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID" , "Term" , "Annotated" , "Significant" , "Expected" , "Statistics" )
Res$ER <- Res$Significant / Res$Expected
image = bubbleplot(Res, Ont = 'CC' , fillCol = 'forestgreen' )
ggsave(file = paste0(folder, "TopGO_results_CC.pdf" ), plot= image, width= 12 , height= 4 )
bubbleplot(Res, Ont = 'CC' , fillCol = 'forestgreen' )
Code
%% R - i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 12 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
ggsave(file = paste0(folder, "Genes:_in_Term_results_CC.pdf" ), plot= image, width= 12 , height= 4 )
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 12 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
Code
%% R - i markers - i folder
saveGenesInTerm(Res, GOdata, nterms = 20 , path = paste0(folder,'GO_CC_genesInTerm_all.xlsx' ), SE = markers)
Reactome
Code
curated = msigdb[msigdb['collection' ].isin(['reactome_pathways' ])]
curated = curated[~ curated.duplicated(['geneset' , 'genesymbol' ])]
aggregated = curated[["geneset" , "genesymbol" ]].groupby("geneset" ).count().rename(columns= {"genesymbol" : "gene_count" })
curated = curated[~ curated.geneset.isin(aggregated[aggregated.gene_count > 200 ].index.tolist())].copy()
curated = curated[~ curated.geneset.isin(aggregated[aggregated.gene_count < 15 ].index.tolist())].copy()
Code
rank = pd.DataFrame(markers['-log10(FDR)' ])
rank_copy = rank.copy()
rank_copy['pval' ] = markers.loc[rank.index].FDR
Code
-log10(FDR)
pval
SFRP1
260.816537
1.525679e-261
GPR176
213.411510
3.876947e-214
FZD7
204.325468
4.726412e-205
CLU
194.401255
3.969584e-195
PALM2-AKAP2
189.455548
3.503092e-190
...
...
...
EIF4EBP3
2.249528
5.629524e-03
MSRA
2.248654
5.640865e-03
ZSCAN31
2.135952
7.312192e-03
ING4
2.132982
7.362377e-03
NHLRC3
2.125313
7.493540e-03
1384 rows × 2 columns
Code
results_table_py = run_ora_catchErrors(mat= rank.T, net= curated, source= 'geneset' , target= 'genesymbol' , verbose= False , n_up= len (rank), n_bottom= 0 )
len (results_table_py)
Code
intTable(results_table_py, folder = folder, fileName = 'Reactome_all.xlsx' , save = True )
Code
if len (results_table_py) > 0 :
results_table_py = getAnnGenes(results_table_py, GO2gene['reactome_pathways' ], rank_copy)
_, df = plotGenesInTerm(results = results_table_py, GO2gene = GO2gene['reactome_pathways' ], DEGs = rank_copy, n_top_terms = 10 , cmap = cmap_all)
Code
if len (results_table_py) > 0 :
intTable(df, folder = folder, fileName = 'genesInTerm_Reactome_all.xlsx' , save = True )
KEGG
Code
curated = msigdb[msigdb['collection' ].isin(['kegg_pathways' ])]
curated = curated[~ curated.duplicated(['geneset' , 'genesymbol' ])]
aggregated = curated[["geneset" , "genesymbol" ]].groupby("geneset" ).count().rename(columns= {"genesymbol" : "gene_count" })
curated = curated[~ curated.geneset.isin(aggregated[aggregated.gene_count > 200 ].index.tolist())].copy()
curated = curated[~ curated.geneset.isin(aggregated[aggregated.gene_count < 15 ].index.tolist())].copy()
Code
results_table_py = run_ora_catchErrors(mat= rank.T, net= curated, source= 'geneset' , target= 'genesymbol' , verbose= False , n_up= len (rank), n_bottom= 0 )
Code
intTable(results_table_py, folder = folder, fileName = 'KEGG_all.xlsx' , save = True )
Code
if len (results_table_py) > 0 :
results_table_py = getAnnGenes(results_table_py, GO2gene['kegg_pathways' ], rank_copy)
_, df = plotGenesInTerm(results_table_py, GO2gene['kegg_pathways' ], rank_copy, n_top_terms = 10 , n_top_genes = 15 , cmap = cmap_all)
Code
if len (results_table_py) > 0 :
intTable(df, folder = folder, fileName = 'genesInTerm_KEGG_all.xlsx' , save = True )