Load environment
Code
import warnings
warnings.filterwarnings("ignore" )
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import pandas as pd
import numpy as np
import random
import itertools
from tqdm import tqdm
import decoupler as dc
import sys
sys.setrecursionlimit(20000 )
sys.path.append("./../../../../utilities_folder" )
from utilities import load_object, intTable, plotGenesInTerm, getAnnGenes, run_ora_catchErrors
Set R environment with rpy2:
Code
import rpy2.rinterface_lib.callbacks
import anndata2ri
import logging
from rpy2.robjects import pandas2ri
import rpy2.robjects as ro
sc.settings.verbosity = 0
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
pandas2ri.activate()
anndata2ri.activate()
% load_ext rpy2.ipython
Set up of graphical parameters for Python plots:
Code
% matplotlib inline
sc.set_figure_params(dpi = 300 , fontsize = 20 )
plt.rcParams['svg.fonttype' ] = 'none'
cmap_up = sns.light_palette("red" , as_cmap= True )
cmap_down = sns.light_palette("blue" , as_cmap= True )
cmap_all = sns.light_palette("seagreen" , as_cmap= True )
Set up of graphical parameters for R plots:
Code
default_units = 'in'
default_res = 300
default_width = 10
default_height = 9
import rpy2
old_setup_graphics = rpy2.ipython.rmagic.RMagics.setup_graphics
def new_setup_graphics(self , args):
if getattr (args, 'units' ) is not None :
if args.units != default_units:
return old_setup_graphics(self , args)
args.units = default_units
if getattr (args, 'res' ) is None :
args.res = default_res
if getattr (args, 'width' ) is None :
args.width = default_width
if getattr (args, 'height' ) is None :
args.height = default_height
return old_setup_graphics(self , args)
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics
Here the cell were we inject the parameters using Quarto renderer:
Code
# Injected Parameters
N = 3
Code
# Injected Parameters
N = 8
Import R libraries:
Code
%% R
source('./../../../../utilities_folder/GO_helper.r' )
loc <- './../../../../R_loc' # pointing to the renv environment
.libPaths(loc)
library('topGO' )
library('org.Hs.eg.db' )
library(dplyr)
library(ggplot2)
Set output folders:
Code
output_folder = './'
folder = './tables/cluster_' + str (N) + '/'
Load data
Here we load the dataframe:
Code
markers = pd.read_excel(folder + 'genes_in_cluster_' + str (N) + '.xlsx' , index_col = 0 )
markers
logFC.celltypes_leveledhEGCLC
logFC.celltypes_leveledhPGCLC
logFC.celltypes_levelediMeLC
logCPM
LR
PValue
FDR
clusters
DLL1
-11.228872
3.547954
-11.228872
4.575054
1533.970731
0.000000e+00
0.000000e+00
8
ALOX5
-11.178824
4.410238
-11.178824
5.439678
2245.284054
0.000000e+00
0.000000e+00
8
VSTM2B
-11.122162
4.797431
-11.122162
5.792537
2511.051934
0.000000e+00
0.000000e+00
8
FGF16
-10.984056
3.985368
-10.984056
4.747048
1708.509557
0.000000e+00
0.000000e+00
8
PSTPIP2
-11.105241
3.087326
-11.105241
4.054911
1143.210227
1.536158e-247
8.738628e-246
8
EGFR
-11.738471
2.489754
-11.738471
4.048274
1108.064238
6.479206e-240
3.534996e-238
8
RAI1
-11.129680
2.797734
-11.129680
3.668371
960.524308
6.583281e-208
3.015997e-206
8
C19orf85
-11.388111
2.127670
-11.388111
3.255547
793.364089
1.189597e-171
3.933534e-170
8
MAFB
-12.466067
0.498430
-12.466067
3.313987
738.617005
8.873436e-160
2.610487e-158
8
CCDC163
-10.857963
2.563922
-10.857963
2.945123
735.019095
5.349467e-159
1.550958e-157
8
PDE10A
-11.175752
2.150874
-11.175752
3.017974
727.061220
2.844354e-157
8.167652e-156
8
LDHD
-11.960379
1.040000
-11.960379
2.997689
675.456270
4.404685e-146
1.101447e-144
8
NCALD
-10.934065
2.173099
-10.934065
2.769786
665.571840
6.124912e-144
1.494255e-142
8
GRASP
-11.670363
1.321981
-11.670363
2.916703
661.788511
4.049534e-143
9.819507e-142
8
GPR157
-11.215989
1.261274
-11.215989
2.453822
556.254469
3.063602e-120
5.423660e-119
8
CNTNAP1
-12.219804
-0.561166
-12.219804
2.444274
534.517747
1.576439e-115
2.602751e-114
8
CEACAM19
-11.120045
1.210802
-11.120045
2.218539
515.532181
2.053606e-111
3.180572e-110
8
NKAIN4
-10.870906
1.448048
-10.870906
2.126111
505.760666
2.692995e-109
4.035458e-108
8
NCR3LG1
-12.075949
-0.763917
-12.075949
2.292377
501.060931
2.810288e-108
4.154173e-107
8
SLC22A5
-11.578276
0.170129
-11.578276
2.132203
474.607993
1.517762e-102
2.042342e-101
8
MYO15B
-11.396701
0.537205
-11.396701
2.076268
474.538440
1.571358e-102
2.112095e-101
8
NTRK3
-11.368088
0.347978
-11.368088
1.999594
457.076437
9.549802e-99
1.200275e-97
8
FAM53A
-11.332611
0.395974
-11.332611
1.960959
454.251702
3.908746e-98
4.866875e-97
8
HEATR5A
-11.094525
0.736528
-11.094525
1.902362
452.504752
9.344365e-98
1.152728e-96
8
DUSP9
-11.376535
0.265029
-11.376535
1.962851
450.926894
2.053157e-97
2.517267e-96
8
ICAM1
-11.814895
-1.465547
-11.814895
1.841416
439.487508
6.178718e-95
7.256669e-94
8
ZNF433
-11.228434
0.396925
-11.228434
1.841026
439.185301
7.184112e-95
8.412771e-94
8
PUDP
-10.807817
0.955786
-10.807817
1.727613
434.315764
8.153979e-94
9.365762e-93
8
AKR1E2
-10.936276
0.746199
-10.936276
1.712573
428.947245
1.186971e-92
1.336512e-91
8
SMOC2
-11.450998
-0.291171
-11.450998
1.800132
424.803003
9.381073e-92
1.044536e-90
8
IKZF2
-11.050744
0.516850
-11.050744
1.702677
423.554737
1.748534e-91
1.936130e-90
8
CRYL1
-11.076752
0.406209
-11.076752
1.753451
423.107069
2.186026e-91
2.416103e-90
8
BATF3
-11.071490
0.431127
-11.071490
1.736548
422.716472
2.656274e-91
2.927755e-90
8
ADAM33
-11.528694
-0.870812
-11.528694
1.703740
412.838404
3.665708e-89
3.886881e-88
8
CATSPERG
-10.898724
0.530687
-10.898724
1.521126
402.098026
7.775084e-87
7.908842e-86
8
STX1A
-11.018889
0.158636
-11.018889
1.527679
393.569782
5.469630e-85
5.354973e-84
8
LYNX1
-11.172865
-0.186924
-11.172865
1.548000
393.068396
7.023559e-85
6.865128e-84
8
HES4
-11.378759
-1.021150
-11.378759
1.538287
390.663797
2.330148e-84
2.246487e-83
8
ANKRD37
-11.149876
-0.330301
-11.149876
1.487399
383.743503
7.349283e-83
6.875561e-82
8
RIPPLY3
-10.987020
0.139181
-10.987020
1.374208
381.036912
2.834293e-82
2.627028e-81
8
CUL9
-10.986253
-0.066411
-10.986253
1.386002
374.538964
7.240039e-81
6.548771e-80
8
CDK3
-10.801797
0.293696
-10.801797
1.329396
372.648605
1.858379e-80
1.667124e-79
8
MUC1
-10.961591
-0.143128
-10.961591
1.344446
367.871679
2.012074e-79
1.781042e-78
8
ARIH2OS
-10.922176
-0.081686
-10.922176
1.322430
366.152519
4.741798e-79
4.163555e-78
8
ERMAP
-10.898001
-0.397724
-10.898001
1.211818
351.049294
8.840368e-76
7.307915e-75
8
TRIM72
-10.698973
-0.353230
-10.698973
1.014147
331.780927
1.313276e-71
9.976739e-71
8
Code
allGenes_series = pd.read_csv('./tables/all_bkg_genes.csv' )
allGenes = allGenes_series['0' ].tolist()
Here we load the dictionary that associates to each GO term its genes:
Code
GO2gene = load_object('./../../../../data/GO2gene_complete.pickle' )
Markers of cluster
We filter genes for the cluster under investigation based on the p-value adjusted that we then convert in -log(p-value adjusted):
Code
markers = markers[markers.FDR < 0.01 ]
markers['-log10(FDR)' ] = - np.log10(markers.FDR)
markers = markers.replace(np.inf, markers[markers['-log10(FDR)' ] != np.inf]['-log10(FDR)' ].max ())
markers
logFC.celltypes_leveledhEGCLC
logFC.celltypes_leveledhPGCLC
logFC.celltypes_levelediMeLC
logCPM
LR
PValue
FDR
clusters
-log10(FDR)
DLL1
-11.228872
3.547954
-11.228872
4.575054
1533.970731
0.000000e+00
0.000000e+00
8
245.058557
ALOX5
-11.178824
4.410238
-11.178824
5.439678
2245.284054
0.000000e+00
0.000000e+00
8
245.058557
VSTM2B
-11.122162
4.797431
-11.122162
5.792537
2511.051934
0.000000e+00
0.000000e+00
8
245.058557
FGF16
-10.984056
3.985368
-10.984056
4.747048
1708.509557
0.000000e+00
0.000000e+00
8
245.058557
PSTPIP2
-11.105241
3.087326
-11.105241
4.054911
1143.210227
1.536158e-247
8.738628e-246
8
245.058557
EGFR
-11.738471
2.489754
-11.738471
4.048274
1108.064238
6.479206e-240
3.534996e-238
8
237.451611
RAI1
-11.129680
2.797734
-11.129680
3.668371
960.524308
6.583281e-208
3.015997e-206
8
205.520569
C19orf85
-11.388111
2.127670
-11.388111
3.255547
793.364089
1.189597e-171
3.933534e-170
8
169.405217
MAFB
-12.466067
0.498430
-12.466067
3.313987
738.617005
8.873436e-160
2.610487e-158
8
157.583279
CCDC163
-10.857963
2.563922
-10.857963
2.945123
735.019095
5.349467e-159
1.550958e-157
8
156.809400
PDE10A
-11.175752
2.150874
-11.175752
3.017974
727.061220
2.844354e-157
8.167652e-156
8
155.087903
LDHD
-11.960379
1.040000
-11.960379
2.997689
675.456270
4.404685e-146
1.101447e-144
8
143.958037
NCALD
-10.934065
2.173099
-10.934065
2.769786
665.571840
6.124912e-144
1.494255e-142
8
141.825575
GRASP
-11.670363
1.321981
-11.670363
2.916703
661.788511
4.049534e-143
9.819507e-142
8
141.007910
GPR157
-11.215989
1.261274
-11.215989
2.453822
556.254469
3.063602e-120
5.423660e-119
8
118.265708
CNTNAP1
-12.219804
-0.561166
-12.219804
2.444274
534.517747
1.576439e-115
2.602751e-114
8
113.584567
CEACAM19
-11.120045
1.210802
-11.120045
2.218539
515.532181
2.053606e-111
3.180572e-110
8
109.497495
NKAIN4
-10.870906
1.448048
-10.870906
2.126111
505.760666
2.692995e-109
4.035458e-108
8
107.394107
NCR3LG1
-12.075949
-0.763917
-12.075949
2.292377
501.060931
2.810288e-108
4.154173e-107
8
106.381515
SLC22A5
-11.578276
0.170129
-11.578276
2.132203
474.607993
1.517762e-102
2.042342e-101
8
100.689871
MYO15B
-11.396701
0.537205
-11.396701
2.076268
474.538440
1.571358e-102
2.112095e-101
8
100.675286
NTRK3
-11.368088
0.347978
-11.368088
1.999594
457.076437
9.549802e-99
1.200275e-97
8
96.920719
FAM53A
-11.332611
0.395974
-11.332611
1.960959
454.251702
3.908746e-98
4.866875e-97
8
96.312750
HEATR5A
-11.094525
0.736528
-11.094525
1.902362
452.504752
9.344365e-98
1.152728e-96
8
95.938273
DUSP9
-11.376535
0.265029
-11.376535
1.962851
450.926894
2.053157e-97
2.517267e-96
8
95.599071
ICAM1
-11.814895
-1.465547
-11.814895
1.841416
439.487508
6.178718e-95
7.256669e-94
8
93.139263
ZNF433
-11.228434
0.396925
-11.228434
1.841026
439.185301
7.184112e-95
8.412771e-94
8
93.075061
PUDP
-10.807817
0.955786
-10.807817
1.727613
434.315764
8.153979e-94
9.365762e-93
8
92.028457
AKR1E2
-10.936276
0.746199
-10.936276
1.712573
428.947245
1.186971e-92
1.336512e-91
8
90.874027
SMOC2
-11.450998
-0.291171
-11.450998
1.800132
424.803003
9.381073e-92
1.044536e-90
8
89.981076
IKZF2
-11.050744
0.516850
-11.050744
1.702677
423.554737
1.748534e-91
1.936130e-90
8
89.713065
CRYL1
-11.076752
0.406209
-11.076752
1.753451
423.107069
2.186026e-91
2.416103e-90
8
89.616885
BATF3
-11.071490
0.431127
-11.071490
1.736548
422.716472
2.656274e-91
2.927755e-90
8
89.533465
ADAM33
-11.528694
-0.870812
-11.528694
1.703740
412.838404
3.665708e-89
3.886881e-88
8
87.410399
CATSPERG
-10.898724
0.530687
-10.898724
1.521126
402.098026
7.775084e-87
7.908842e-86
8
85.101887
STX1A
-11.018889
0.158636
-11.018889
1.527679
393.569782
5.469630e-85
5.354973e-84
8
83.271243
LYNX1
-11.172865
-0.186924
-11.172865
1.548000
393.068396
7.023559e-85
6.865128e-84
8
83.163351
HES4
-11.378759
-1.021150
-11.378759
1.538287
390.663797
2.330148e-84
2.246487e-83
8
82.648496
ANKRD37
-11.149876
-0.330301
-11.149876
1.487399
383.743503
7.349283e-83
6.875561e-82
8
81.162692
RIPPLY3
-10.987020
0.139181
-10.987020
1.374208
381.036912
2.834293e-82
2.627028e-81
8
80.580535
CUL9
-10.986253
-0.066411
-10.986253
1.386002
374.538964
7.240039e-81
6.548771e-80
8
79.183840
CDK3
-10.801797
0.293696
-10.801797
1.329396
372.648605
1.858379e-80
1.667124e-79
8
78.778032
MUC1
-10.961591
-0.143128
-10.961591
1.344446
367.871679
2.012074e-79
1.781042e-78
8
77.749326
ARIH2OS
-10.922176
-0.081686
-10.922176
1.322430
366.152519
4.741798e-79
4.163555e-78
8
77.380536
ERMAP
-10.898001
-0.397724
-10.898001
1.211818
351.049294
8.840368e-76
7.307915e-75
8
74.136206
TRIM72
-10.698973
-0.353230
-10.698973
1.014147
331.780927
1.313276e-71
9.976739e-71
8
70.001011
All regulated
Code
all_sign = markers.index.tolist()
allSelected = allGenes_series['0' ].isin(all_sign).astype('int' ).tolist()
topGO
All significant
Code
%% R - i allSelected - i allGenes
allGenes_v <- c(allSelected)
#print(allGenes_v)
names(allGenes_v) <- allGenes
allGenes_v <- unlist(allGenes_v)
geneNames <- c(allGenes)
ann_org_BP <- topGO::annFUN.org(whichOnto= 'BP' , feasibleGenes= names(allGenes_v),
mapping= 'org.Hs.eg' , ID= 'symbol' )
ann_org_MF <- topGO::annFUN.org(whichOnto= 'MF' , feasibleGenes= names(allGenes_v),
mapping= 'org.Hs.eg' , ID= 'symbol' )
ann_org_CC <- topGO::annFUN.org(whichOnto= 'CC' , feasibleGenes= names(allGenes_v),
mapping= 'org.Hs.eg' , ID= 'symbol' )
selection <- function(allScores){return (as .logical(allScores))}
Code
%% R
#print(lapply(ann_org_BP, count_genes))
GOdata <- new("topGOdata" ,
ontology= "BP" ,
allGenes= allGenes_v,
annot= annFUN.GO2genes,
GO2genes= ann_org_BP,
geneSel = selection,
nodeSize= 10 )
Code
%% R - o results
results <- runTest(GOdata, algorithm= "weight01" ,statistic= "fisher" )
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata
genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([10903, 38, 10, 1350], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight= results,
orderBy= "weight" , topNodes= len (scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores' : scores_py, 'GO.ID' : score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID' , right_on = 'GO.ID' )
results_table_py
GO.ID
Term
Annotated
Significant
Expected
weight
Scores
0
GO:0048665
neuron fate specification
13
2
0.05
0.0009
0.000901
1
GO:0061042
vascular wound healing
13
2
0.05
0.0009
0.000901
2
GO:1900746
regulation of vascular endothelial growt...
15
2
0.05
0.0012
0.001207
3
GO:2001028
positive regulation of endothelial cell ...
15
2
0.05
0.0012
0.001207
4
GO:0051154
negative regulation of striated muscle c...
24
2
0.08
0.0031
0.003110
...
...
...
...
...
...
...
...
5697
GO:2001256
regulation of store-operated calcium ent...
11
0
0.04
1.0000
1.000000
5698
GO:2001257
regulation of cation channel activity
104
0
0.36
1.0000
1.000000
5699
GO:2001258
negative regulation of cation channel ac...
23
0
0.08
1.0000
1.000000
5700
GO:2001259
positive regulation of cation channel ac...
43
0
0.15
1.0000
1.000000
5701
GO:2001267
regulation of cysteine-type endopeptidas...
13
0
0.05
1.0000
1.000000
5702 rows × 7 columns
Code
results_table_py = results_table_py[results_table_py['Scores' ] < 0.05 ]
results_table_py = results_table_py[results_table_py['Annotated' ] < 200 ]
results_table_py = results_table_py[results_table_py['Annotated' ] > 15 ]
Code
results_table_py['-log10(pvalue)' ] = - np.log10(results_table_py.Scores)
results_table_py['Significant/Annotated' ] = results_table_py['Significant' ] / results_table_py['Annotated' ]
Code
intTable(results_table_py, folder = folder, fileName = 'GO_BP_all.xlsx' , save = True )
Code
%% R - i folder
Res <- GenTable(GOdata, weight= results,
orderBy= "weight" , topNodes= length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID" , "Term" , "Annotated" , "Significant" , "Expected" , "Statistics" )
Res$ER <- Res$Significant / Res$Expected
image = bubbleplot(Res, Ont = 'BP' , fillCol = 'forestgreen' )
ggsave(file = paste0(folder, "TopGO_results_BP.pdf" ), plot= image, width= 12 , height= 4 )
bubbleplot(Res, Ont = 'BP' , fillCol = 'forestgreen' )
Code
%% R - i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 15 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
ggsave(file = paste0(folder, "Genes_in_Term_results_BP.pdf" ), plot= image, width= 12 , height= 4 )
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 15 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
Code
%% R - i markers - i folder
saveGenesInTerm(Res, GOdata, nterms = 20 , path = paste0(folder,'GO_BP_genesInTerm_all.xlsx' ), SE = markers)
Code
%% R
GOdata <- new("topGOdata" ,
ontology= "MF" ,
allGenes= allGenes_v,
annot= annFUN.GO2genes,
GO2genes= ann_org_MF,
geneSel = selection,
nodeSize= 10 )
Code
%% R - o results
results <- runTest(GOdata, algorithm= "weight01" ,statistic= "fisher" )
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata
genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([11203, 37, 10, 189], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight= results,
orderBy= "weight" , topNodes= len (scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores' : scores_py, 'GO.ID' : score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID' , right_on = 'GO.ID' )
results_table_py = results_table_py[results_table_py['Scores' ] < 0.05 ]
results_table_py = results_table_py[results_table_py['Annotated' ] < 200 ]
results_table_py = results_table_py[results_table_py['Annotated' ] > 15 ]
intTable(results_table_py, folder = folder, fileName = 'GO_MF_all.xlsx' , save = True )
Code
%% R
Res <- GenTable(GOdata, weight= results,
orderBy= "weight" , topNodes= length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID" , "Term" , "Annotated" , "Significant" , "Expected" , "Statistics" )
Res$ER <- Res$Significant / Res$Expected
image = bubbleplot(Res, Ont = 'MF' , fillCol = 'forestgreen' )
ggsave(file = paste0(folder, "TopGO_results_MF.pdf" ), plot= image, width= 12 , height= 4 )
bubbleplot(Res, Ont = 'MF' , fillCol = 'forestgreen' )
Code
%% R - i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 15 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
ggsave(file = paste0(folder, "Genes_in_Term_results_MF.pdf" ), plot= image, width= 12 , height= 4 )
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 15 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
Code
%% R - i markers - i folder
saveGenesInTerm(Res, GOdata, nterms = 20 , path = paste0(folder,'GO_MF_genesInTerm_all.xlsx' ), SE = markers)
Code
%% R
GOdata <- new("topGOdata" ,
ontology= "CC" ,
allGenes= allGenes_v,
annot= annFUN.GO2genes,
GO2genes= ann_org_CC,
geneSel = selection,
nodeSize= 10 )
Code
%% R - o results
results <- runTest(GOdata, algorithm= "weight01" ,statistic= "fisher" )
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata
genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([11323, 43, 10, 202], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight= results,
orderBy= "weight" , topNodes= len (scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores' : scores_py, 'GO.ID' : score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID' , right_on = 'GO.ID' )
Code
results_table_py = results_table_py[results_table_py['Scores' ] < 0.05 ]
results_table_py = results_table_py[results_table_py['Annotated' ] < 200 ]
results_table_py = results_table_py[results_table_py['Annotated' ] > 15 ]
intTable(results_table_py, folder = folder, fileName = 'GO_CC_all.xlsx' , save = True )
Code
%% R
Res <- GenTable(GOdata, weight= results,
orderBy= "weight" , topNodes= length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID" , "Term" , "Annotated" , "Significant" , "Expected" , "Statistics" )
Res$ER <- Res$Significant / Res$Expected
image = bubbleplot(Res, Ont = 'CC' , fillCol = 'forestgreen' )
ggsave(file = paste0(folder, "TopGO_results_CC.pdf" ), plot= image, width= 12 , height= 4 )
bubbleplot(Res, Ont = 'CC' , fillCol = 'forestgreen' )
Code
%% R - i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 12 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
ggsave(file = paste0(folder, "Genes:_in_Term_results_CC.pdf" ), plot= image, width= 12 , height= 4 )
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 12 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
Code
%% R - i markers - i folder
saveGenesInTerm(Res, GOdata, nterms = 20 , path = paste0(folder,'GO_CC_genesInTerm_all.xlsx' ), SE = markers)
Reactome
Code
curated = msigdb[msigdb['collection' ].isin(['reactome_pathways' ])]
curated = curated[~ curated.duplicated(['geneset' , 'genesymbol' ])]
aggregated = curated[["geneset" , "genesymbol" ]].groupby("geneset" ).count().rename(columns= {"genesymbol" : "gene_count" })
curated = curated[~ curated.geneset.isin(aggregated[aggregated.gene_count > 200 ].index.tolist())].copy()
curated = curated[~ curated.geneset.isin(aggregated[aggregated.gene_count < 15 ].index.tolist())].copy()
Code
rank = pd.DataFrame(markers['-log10(FDR)' ])
rank_copy = rank.copy()
rank_copy['pval' ] = markers.loc[rank.index].FDR
Code
-log10(FDR)
pval
DLL1
245.058557
0.000000e+00
ALOX5
245.058557
0.000000e+00
VSTM2B
245.058557
0.000000e+00
FGF16
245.058557
0.000000e+00
PSTPIP2
245.058557
8.738628e-246
EGFR
237.451611
3.534996e-238
RAI1
205.520569
3.015997e-206
C19orf85
169.405217
3.933534e-170
MAFB
157.583279
2.610487e-158
CCDC163
156.809400
1.550958e-157
PDE10A
155.087903
8.167652e-156
LDHD
143.958037
1.101447e-144
NCALD
141.825575
1.494255e-142
GRASP
141.007910
9.819507e-142
GPR157
118.265708
5.423660e-119
CNTNAP1
113.584567
2.602751e-114
CEACAM19
109.497495
3.180572e-110
NKAIN4
107.394107
4.035458e-108
NCR3LG1
106.381515
4.154173e-107
SLC22A5
100.689871
2.042342e-101
MYO15B
100.675286
2.112095e-101
NTRK3
96.920719
1.200275e-97
FAM53A
96.312750
4.866875e-97
HEATR5A
95.938273
1.152728e-96
DUSP9
95.599071
2.517267e-96
ICAM1
93.139263
7.256669e-94
ZNF433
93.075061
8.412771e-94
PUDP
92.028457
9.365762e-93
AKR1E2
90.874027
1.336512e-91
SMOC2
89.981076
1.044536e-90
IKZF2
89.713065
1.936130e-90
CRYL1
89.616885
2.416103e-90
BATF3
89.533465
2.927755e-90
ADAM33
87.410399
3.886881e-88
CATSPERG
85.101887
7.908842e-86
STX1A
83.271243
5.354973e-84
LYNX1
83.163351
6.865128e-84
HES4
82.648496
2.246487e-83
ANKRD37
81.162692
6.875561e-82
RIPPLY3
80.580535
2.627028e-81
CUL9
79.183840
6.548771e-80
CDK3
78.778032
1.667124e-79
MUC1
77.749326
1.781042e-78
ARIH2OS
77.380536
4.163555e-78
ERMAP
74.136206
7.307915e-75
TRIM72
70.001011
9.976739e-71
Code
results_table_py = run_ora_catchErrors(mat= rank.T, net= curated, source= 'geneset' , target= 'genesymbol' , verbose= False , n_up= len (rank), n_bottom= 0 )
len (results_table_py)
No significant term was found
Code
intTable(results_table_py, folder = folder, fileName = 'Reactome_all.xlsx' , save = True )
Code
if len (results_table_py) > 0 :
results_table_py = getAnnGenes(results_table_py, GO2gene['reactome_pathways' ], rank_copy)
_, df = plotGenesInTerm(results = results_table_py, GO2gene = GO2gene['reactome_pathways' ], DEGs = rank_copy, n_top_terms = 10 , cmap = cmap_all)
Code
if len (results_table_py) > 0 :
intTable(df, folder = folder, fileName = 'genesInTerm_Reactome_all.xlsx' , save = True )
KEGG
Code
curated = msigdb[msigdb['collection' ].isin(['kegg_pathways' ])]
curated = curated[~ curated.duplicated(['geneset' , 'genesymbol' ])]
aggregated = curated[["geneset" , "genesymbol" ]].groupby("geneset" ).count().rename(columns= {"genesymbol" : "gene_count" })
curated = curated[~ curated.geneset.isin(aggregated[aggregated.gene_count > 200 ].index.tolist())].copy()
curated = curated[~ curated.geneset.isin(aggregated[aggregated.gene_count < 15 ].index.tolist())].copy()
Code
results_table_py = run_ora_catchErrors(mat= rank.T, net= curated, source= 'geneset' , target= 'genesymbol' , verbose= False , n_up= len (rank), n_bottom= 0 )
No significant term was found
Code
intTable(results_table_py, folder = folder, fileName = 'KEGG_all.xlsx' , save = True )
Code
if len (results_table_py) > 0 :
results_table_py = getAnnGenes(results_table_py, GO2gene['kegg_pathways' ], rank_copy)
_, df = plotGenesInTerm(results_table_py, GO2gene['kegg_pathways' ], rank_copy, n_top_terms = 10 , n_top_genes = 15 , cmap = cmap_all)
Code
if len (results_table_py) > 0 :
intTable(df, folder = folder, fileName = 'genesInTerm_KEGG_all.xlsx' , save = True )