Load environment
Code
import warnings
warnings.filterwarnings("ignore" )
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import pandas as pd
import numpy as np
import random
import itertools
from tqdm import tqdm
import decoupler as dc
import sys
sys.setrecursionlimit(20000 )
sys.path.append("./../../../../utilities_folder" )
from utilities import load_object, intTable, plotGenesInTerm, getAnnGenes, run_ora_catchErrors
Set R environment with rpy2:
Code
import rpy2.rinterface_lib.callbacks
import anndata2ri
import logging
from rpy2.robjects import pandas2ri
import rpy2.robjects as ro
sc.settings.verbosity = 0
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
pandas2ri.activate()
anndata2ri.activate()
% load_ext rpy2.ipython
Set up of graphical parameters for Python plots:
Code
% matplotlib inline
sc.set_figure_params(dpi = 300 , fontsize = 20 )
plt.rcParams['svg.fonttype' ] = 'none'
cmap_up = sns.light_palette("red" , as_cmap= True )
cmap_down = sns.light_palette("blue" , as_cmap= True )
cmap_all = sns.light_palette("seagreen" , as_cmap= True )
Set up of graphical parameters for R plots:
Code
default_units = 'in'
default_res = 300
default_width = 10
default_height = 9
import rpy2
old_setup_graphics = rpy2.ipython.rmagic.RMagics.setup_graphics
def new_setup_graphics(self , args):
if getattr (args, 'units' ) is not None :
if args.units != default_units:
return old_setup_graphics(self , args)
args.units = default_units
if getattr (args, 'res' ) is None :
args.res = default_res
if getattr (args, 'width' ) is None :
args.width = default_width
if getattr (args, 'height' ) is None :
args.height = default_height
return old_setup_graphics(self , args)
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics
Here the cell were we inject the parameters using Quarto renderer:
Code
# Injected Parameters
N = 3
Code
# Injected Parameters
N = 6
Import R libraries:
Code
%% R
source('./../../../../utilities_folder/GO_helper.r' )
loc <- './../../../../R_loc' # pointing to the renv environment
.libPaths(loc)
library('topGO' )
library('org.Hs.eg.db' )
library(dplyr)
library(ggplot2)
Set output folders:
Code
output_folder = './'
folder = './tables/cluster_' + str (N) + '/'
Load data
Here we load the dataframe:
Code
markers = pd.read_excel(folder + 'genes_in_cluster_' + str (N) + '.xlsx' , index_col = 0 )
markers
logFC.celltypes_leveledhEGCLC
logFC.celltypes_leveledhPGCLC
logFC.celltypes_levelediMeLC
logCPM
LR
PValue
FDR
clusters
ANGPTL4
0
16.728461
0
6.635963
4604.226070
0.000000e+00
0.000000e+00
6
ATP6V1C2
0
14.665205
0
4.343721
1868.060935
0.000000e+00
0.000000e+00
6
BAHCC1
0
15.059887
0
4.783488
2303.220998
0.000000e+00
0.000000e+00
6
CAPN13
0
15.246439
0
4.995978
2516.211782
0.000000e+00
0.000000e+00
6
CD38
0
14.833831
0
4.546918
2059.593173
0.000000e+00
0.000000e+00
6
...
...
...
...
...
...
...
...
...
HR
0
11.179795
0
-0.177908
453.402489
5.970864e-98
7.388483e-97
6
LPAR5
0
11.209256
0
-0.211692
452.402199
9.834900e-98
1.211995e-96
6
LIPT2
0
11.093360
0
-0.177148
445.954774
2.453055e-96
2.953262e-95
6
KCNMB1
0
11.107866
0
-0.180260
444.742688
4.490762e-96
5.379503e-95
6
NANOS1
0
11.091472
0
-0.267174
438.980631
7.956386e-95
9.298978e-94
6
372 rows × 8 columns
Code
allGenes_series = pd.read_csv('./tables/all_bkg_genes.csv' )
allGenes = allGenes_series['0' ].tolist()
Here we load the dictionary that associates to each GO term its genes:
Code
GO2gene = load_object('./../../../../data/GO2gene_complete.pickle' )
Markers of cluster
We filter genes for the cluster under investigation based on the p-value adjusted that we then convert in -log(p-value adjusted):
Code
markers = markers[markers.FDR < 0.01 ]
markers['-log10(FDR)' ] = - np.log10(markers.FDR)
markers = markers.replace(np.inf, markers[markers['-log10(FDR)' ] != np.inf]['-log10(FDR)' ].max ())
markers
logFC.celltypes_leveledhEGCLC
logFC.celltypes_leveledhPGCLC
logFC.celltypes_levelediMeLC
logCPM
LR
PValue
FDR
clusters
-log10(FDR)
ANGPTL4
0
16.728461
0
6.635963
4604.226070
0.000000e+00
0.000000e+00
6
319.310712
ATP6V1C2
0
14.665205
0
4.343721
1868.060935
0.000000e+00
0.000000e+00
6
319.310712
BAHCC1
0
15.059887
0
4.783488
2303.220998
0.000000e+00
0.000000e+00
6
319.310712
CAPN13
0
15.246439
0
4.995978
2516.211782
0.000000e+00
0.000000e+00
6
319.310712
CD38
0
14.833831
0
4.546918
2059.593173
0.000000e+00
0.000000e+00
6
319.310712
...
...
...
...
...
...
...
...
...
...
HR
0
11.179795
0
-0.177908
453.402489
5.970864e-98
7.388483e-97
6
96.131445
LPAR5
0
11.209256
0
-0.211692
452.402199
9.834900e-98
1.211995e-96
6
95.916499
LIPT2
0
11.093360
0
-0.177148
445.954774
2.453055e-96
2.953262e-95
6
94.529698
KCNMB1
0
11.107866
0
-0.180260
444.742688
4.490762e-96
5.379503e-95
6
94.269258
NANOS1
0
11.091472
0
-0.267174
438.980631
7.956386e-95
9.298978e-94
6
93.031565
372 rows × 9 columns
All regulated
Code
all_sign = markers.index.tolist()
allSelected = allGenes_series['0' ].isin(all_sign).astype('int' ).tolist()
topGO
All significant
Code
%% R - i allSelected - i allGenes
allGenes_v <- c(allSelected)
#print(allGenes_v)
names(allGenes_v) <- allGenes
allGenes_v <- unlist(allGenes_v)
geneNames <- c(allGenes)
ann_org_BP <- topGO::annFUN.org(whichOnto= 'BP' , feasibleGenes= names(allGenes_v),
mapping= 'org.Hs.eg' , ID= 'symbol' )
ann_org_MF <- topGO::annFUN.org(whichOnto= 'MF' , feasibleGenes= names(allGenes_v),
mapping= 'org.Hs.eg' , ID= 'symbol' )
ann_org_CC <- topGO::annFUN.org(whichOnto= 'CC' , feasibleGenes= names(allGenes_v),
mapping= 'org.Hs.eg' , ID= 'symbol' )
selection <- function(allScores){return (as .logical(allScores))}
Code
%% R
#print(lapply(ann_org_BP, count_genes))
GOdata <- new("topGOdata" ,
ontology= "BP" ,
allGenes= allGenes_v,
annot= annFUN.GO2genes,
GO2genes= ann_org_BP,
geneSel = selection,
nodeSize= 10 )
Code
%% R - o results
results <- runTest(GOdata, algorithm= "weight01" ,statistic= "fisher" )
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata
genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([10903, 320, 10, 3782], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight= results,
orderBy= "weight" , topNodes= len (scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores' : scores_py, 'GO.ID' : score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID' , right_on = 'GO.ID' )
results_table_py
GO.ID
Term
Annotated
Significant
Expected
weight
Scores
0
GO:0001709
cell fate determination
19
7
0.56
6.5e-07
6.535443e-07
1
GO:0009755
hormone-mediated signaling pathway
131
8
3.84
1.3e-05
1.273941e-05
2
GO:0045944
positive regulation of transcription by ...
873
47
25.62
3.9e-05
3.913958e-05
3
GO:0008544
epidermis development
192
15
5.64
1.0e-04
9.953731e-05
4
GO:0051048
negative regulation of secretion
95
11
2.79
0.00013
1.257691e-04
...
...
...
...
...
...
...
...
5697
GO:2001251
negative regulation of chromosome organi...
88
0
2.58
1.00000
1.000000e+00
5698
GO:2001252
positive regulation of chromosome organi...
93
0
2.73
1.00000
1.000000e+00
5699
GO:2001256
regulation of store-operated calcium ent...
11
0
0.32
1.00000
1.000000e+00
5700
GO:2001258
negative regulation of cation channel ac...
23
0
0.68
1.00000
1.000000e+00
5701
GO:2001267
regulation of cysteine-type endopeptidas...
13
0
0.38
1.00000
1.000000e+00
5702 rows × 7 columns
Code
results_table_py = results_table_py[results_table_py['Scores' ] < 0.05 ]
results_table_py = results_table_py[results_table_py['Annotated' ] < 200 ]
results_table_py = results_table_py[results_table_py['Annotated' ] > 15 ]
Code
results_table_py['-log10(pvalue)' ] = - np.log10(results_table_py.Scores)
results_table_py['Significant/Annotated' ] = results_table_py['Significant' ] / results_table_py['Annotated' ]
Code
intTable(results_table_py, folder = folder, fileName = 'GO_BP_all.xlsx' , save = True )
Code
%% R - i folder
Res <- GenTable(GOdata, weight= results,
orderBy= "weight" , topNodes= length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID" , "Term" , "Annotated" , "Significant" , "Expected" , "Statistics" )
Res$ER <- Res$Significant / Res$Expected
image = bubbleplot(Res, Ont = 'BP' , fillCol = 'forestgreen' )
ggsave(file = paste0(folder, "TopGO_results_BP.pdf" ), plot= image, width= 12 , height= 4 )
bubbleplot(Res, Ont = 'BP' , fillCol = 'forestgreen' )
Code
%% R - i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 15 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
ggsave(file = paste0(folder, "Genes_in_Term_results_BP.pdf" ), plot= image, width= 12 , height= 4 )
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 15 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
Code
%% R - i markers - i folder
saveGenesInTerm(Res, GOdata, nterms = 20 , path = paste0(folder,'GO_BP_genesInTerm_all.xlsx' ), SE = markers)
Code
%% R
GOdata <- new("topGOdata" ,
ontology= "MF" ,
allGenes= allGenes_v,
annot= annFUN.GO2genes,
GO2genes= ann_org_MF,
geneSel = selection,
nodeSize= 10 )
Code
%% R - o results
results <- runTest(GOdata, algorithm= "weight01" ,statistic= "fisher" )
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata
genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([11203, 329, 10, 529], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight= results,
orderBy= "weight" , topNodes= len (scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores' : scores_py, 'GO.ID' : score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID' , right_on = 'GO.ID' )
results_table_py = results_table_py[results_table_py['Scores' ] < 0.05 ]
results_table_py = results_table_py[results_table_py['Annotated' ] < 200 ]
results_table_py = results_table_py[results_table_py['Annotated' ] > 15 ]
intTable(results_table_py, folder = folder, fileName = 'GO_MF_all.xlsx' , save = True )
Code
%% R
Res <- GenTable(GOdata, weight= results,
orderBy= "weight" , topNodes= length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID" , "Term" , "Annotated" , "Significant" , "Expected" , "Statistics" )
Res$ER <- Res$Significant / Res$Expected
image = bubbleplot(Res, Ont = 'MF' , fillCol = 'forestgreen' )
ggsave(file = paste0(folder, "TopGO_results_MF.pdf" ), plot= image, width= 12 , height= 4 )
bubbleplot(Res, Ont = 'MF' , fillCol = 'forestgreen' )
Code
%% R - i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 15 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
ggsave(file = paste0(folder, "Genes_in_Term_results_MF.pdf" ), plot= image, width= 12 , height= 4 )
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 15 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
Code
%% R - i markers - i folder
saveGenesInTerm(Res, GOdata, nterms = 20 , path = paste0(folder,'GO_MF_genesInTerm_all.xlsx' ), SE = markers)
Code
%% R
GOdata <- new("topGOdata" ,
ontology= "CC" ,
allGenes= allGenes_v,
annot= annFUN.GO2genes,
GO2genes= ann_org_CC,
geneSel = selection,
nodeSize= 10 )
Code
%% R - o results
results <- runTest(GOdata, algorithm= "weight01" ,statistic= "fisher" )
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata
genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([11323, 338, 10, 419], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight= results,
orderBy= "weight" , topNodes= len (scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores' : scores_py, 'GO.ID' : score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID' , right_on = 'GO.ID' )
Code
results_table_py = results_table_py[results_table_py['Scores' ] < 0.05 ]
results_table_py = results_table_py[results_table_py['Annotated' ] < 200 ]
results_table_py = results_table_py[results_table_py['Annotated' ] > 15 ]
intTable(results_table_py, folder = folder, fileName = 'GO_CC_all.xlsx' , save = True )
Code
%% R
Res <- GenTable(GOdata, weight= results,
orderBy= "weight" , topNodes= length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID" , "Term" , "Annotated" , "Significant" , "Expected" , "Statistics" )
Res$ER <- Res$Significant / Res$Expected
image = bubbleplot(Res, Ont = 'CC' , fillCol = 'forestgreen' )
ggsave(file = paste0(folder, "TopGO_results_CC.pdf" ), plot= image, width= 12 , height= 4 )
bubbleplot(Res, Ont = 'CC' , fillCol = 'forestgreen' )
Code
%% R - i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 12 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
ggsave(file = paste0(folder, "Genes:_in_Term_results_CC.pdf" ), plot= image, width= 12 , height= 4 )
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 12 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
Code
%% R - i markers - i folder
saveGenesInTerm(Res, GOdata, nterms = 20 , path = paste0(folder,'GO_CC_genesInTerm_all.xlsx' ), SE = markers)
Reactome
Code
curated = msigdb[msigdb['collection' ].isin(['reactome_pathways' ])]
curated = curated[~ curated.duplicated(['geneset' , 'genesymbol' ])]
aggregated = curated[["geneset" , "genesymbol" ]].groupby("geneset" ).count().rename(columns= {"genesymbol" : "gene_count" })
curated = curated[~ curated.geneset.isin(aggregated[aggregated.gene_count > 200 ].index.tolist())].copy()
curated = curated[~ curated.geneset.isin(aggregated[aggregated.gene_count < 15 ].index.tolist())].copy()
Code
rank = pd.DataFrame(markers['-log10(FDR)' ])
rank_copy = rank.copy()
rank_copy['pval' ] = markers.loc[rank.index].FDR
Code
-log10(FDR)
pval
ANGPTL4
319.310712
0.000000e+00
ATP6V1C2
319.310712
0.000000e+00
BAHCC1
319.310712
0.000000e+00
CAPN13
319.310712
0.000000e+00
CD38
319.310712
0.000000e+00
...
...
...
HR
96.131445
7.388483e-97
LPAR5
95.916499
1.211995e-96
LIPT2
94.529698
2.953262e-95
KCNMB1
94.269258
5.379503e-95
NANOS1
93.031565
9.298978e-94
372 rows × 2 columns
Code
results_table_py = run_ora_catchErrors(mat= rank.T, net= curated, source= 'geneset' , target= 'genesymbol' , verbose= False , n_up= len (rank), n_bottom= 0 )
len (results_table_py)
Code
intTable(results_table_py, folder = folder, fileName = 'Reactome_all.xlsx' , save = True )
Code
if len (results_table_py) > 0 :
results_table_py = getAnnGenes(results_table_py, GO2gene['reactome_pathways' ], rank_copy)
_, df = plotGenesInTerm(results = results_table_py, GO2gene = GO2gene['reactome_pathways' ], DEGs = rank_copy, n_top_terms = 10 , cmap = cmap_all)
Code
if len (results_table_py) > 0 :
intTable(df, folder = folder, fileName = 'genesInTerm_Reactome_all.xlsx' , save = True )
KEGG
Code
curated = msigdb[msigdb['collection' ].isin(['kegg_pathways' ])]
curated = curated[~ curated.duplicated(['geneset' , 'genesymbol' ])]
aggregated = curated[["geneset" , "genesymbol" ]].groupby("geneset" ).count().rename(columns= {"genesymbol" : "gene_count" })
curated = curated[~ curated.geneset.isin(aggregated[aggregated.gene_count > 200 ].index.tolist())].copy()
curated = curated[~ curated.geneset.isin(aggregated[aggregated.gene_count < 15 ].index.tolist())].copy()
Code
results_table_py = run_ora_catchErrors(mat= rank.T, net= curated, source= 'geneset' , target= 'genesymbol' , verbose= False , n_up= len (rank), n_bottom= 0 )
Code
intTable(results_table_py, folder = folder, fileName = 'KEGG_all.xlsx' , save = True )
Code
if len (results_table_py) > 0 :
results_table_py = getAnnGenes(results_table_py, GO2gene['kegg_pathways' ], rank_copy)
_, df = plotGenesInTerm(results_table_py, GO2gene['kegg_pathways' ], rank_copy, n_top_terms = 10 , n_top_genes = 15 , cmap = cmap_all)
Code
if len (results_table_py) > 0 :
intTable(df, folder = folder, fileName = 'genesInTerm_KEGG_all.xlsx' , save = True )