Load environment
Code
import warnings
warnings.filterwarnings("ignore" )
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import pandas as pd
import numpy as np
import random
import itertools
from tqdm import tqdm
import decoupler as dc
import sys
sys.setrecursionlimit(20000 )
sys.path.append("./../../../../utilities_folder" )
from utilities import load_object, intTable, plotGenesInTerm, getAnnGenes, run_ora_catchErrors
Set R environment with rpy2:
Code
import rpy2.rinterface_lib.callbacks
import anndata2ri
import logging
from rpy2.robjects import pandas2ri
import rpy2.robjects as ro
sc.settings.verbosity = 0
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
pandas2ri.activate()
anndata2ri.activate()
% load_ext rpy2.ipython
Set up of graphical parameters for Python plots:
Code
% matplotlib inline
sc.set_figure_params(dpi = 300 , fontsize = 20 )
plt.rcParams['svg.fonttype' ] = 'none'
cmap_up = sns.light_palette("red" , as_cmap= True )
cmap_down = sns.light_palette("blue" , as_cmap= True )
cmap_all = sns.light_palette("seagreen" , as_cmap= True )
Set up of graphical parameters for R plots:
Code
default_units = 'in'
default_res = 300
default_width = 10
default_height = 9
import rpy2
old_setup_graphics = rpy2.ipython.rmagic.RMagics.setup_graphics
def new_setup_graphics(self , args):
if getattr (args, 'units' ) is not None :
if args.units != default_units:
return old_setup_graphics(self , args)
args.units = default_units
if getattr (args, 'res' ) is None :
args.res = default_res
if getattr (args, 'width' ) is None :
args.width = default_width
if getattr (args, 'height' ) is None :
args.height = default_height
return old_setup_graphics(self , args)
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics
Here the cell were we inject the parameters using Quarto renderer:
Code
# Injected Parameters
N = 3
Code
# Injected Parameters
N = 0
Import R libraries:
Code
%% R
source('./../../../../utilities_folder/GO_helper.r' )
loc <- './../../../../R_loc' # pointing to the renv environment
.libPaths(loc)
library('topGO' )
library('org.Hs.eg.db' )
library(dplyr)
library(ggplot2)
Set output folders:
Code
output_folder = './'
folder = './tables/cluster_' + str (N) + '/'
Load data
Here we load the dataframe:
Code
markers = pd.read_excel(folder + 'genes_in_cluster_' + str (N) + '.xlsx' , index_col = 0 )
markers
logFC.celltypes_leveledhEGCLC
logFC.celltypes_leveledhPGCLC
logFC.celltypes_levelediMeLC
logCPM
LR
PValue
FDR
clusters
CXCR4
-11.172324
7.322004
0.804604
8.483506
3951.022844
0.000000e+00
0.000000e+00
0
FGFR3
-11.002481
4.398754
2.306831
5.388332
1732.310638
0.000000e+00
0.000000e+00
0
ZNF703
-11.000985
6.292293
2.813550
7.291843
3595.651687
0.000000e+00
0.000000e+00
0
MORC4
-10.933221
4.944114
0.058590
5.774889
2161.147921
0.000000e+00
0.000000e+00
0
KLF4
-10.795117
6.570967
2.325890
7.348871
3722.692849
0.000000e+00
0.000000e+00
0
ZYG11A
-11.832004
3.313038
0.512455
5.101194
1489.567078
1.086944e-322
1.165007e-320
0
VENTX
-11.812635
3.107866
-0.353142
4.823694
1344.925545
2.628696e-291
2.206450e-289
0
WLS
-11.822818
1.642495
3.544481
4.750616
1214.091220
6.424869e-263
4.191180e-261
0
PLAAT5
-11.705724
2.655509
2.072497
4.575988
1072.617287
3.174551e-232
1.686024e-230
0
FOSB
-10.759300
2.303687
3.482677
3.882798
786.119387
4.431795e-170
1.445512e-168
0
SMAD6
-11.133028
2.567099
0.787656
3.688101
711.711762
6.059761e-154
1.668241e-152
0
SFMBT2
-11.301186
2.386058
0.675743
3.661201
694.787689
2.833088e-150
7.424793e-149
0
CPEB2
-10.992624
2.551735
1.528002
3.619749
680.196740
4.130942e-147
1.048281e-145
0
MYB
-11.219598
2.138289
0.235243
3.345501
607.049020
2.987073e-131
6.181695e-130
0
HTR1D
-11.940274
-0.000371
1.866980
3.475240
597.634960
3.281710e-129
6.543250e-128
0
FLRT2
-10.661898
-10.661898
2.594260
2.418750
577.838936
6.418928e-125
1.217163e-123
0
POMC
-11.098515
1.907786
0.257493
3.080348
531.334862
7.718626e-115
1.255375e-113
0
TGFBR3L
-12.285498
-12.285498
-1.135665
2.311232
516.979118
9.975280e-112
1.559027e-110
0
CD1D
-11.036460
1.641919
1.741398
3.076538
514.638767
3.207336e-111
4.941932e-110
0
PLEKHA7
-11.399815
1.461997
0.103141
3.049715
504.889676
4.159066e-109
6.224597e-108
0
IRF9
-11.055282
1.867515
0.647282
2.966910
504.741364
4.478554e-109
6.686079e-108
0
CUZD1
-11.654184
-11.654184
0.713973
2.272964
502.745032
1.212771e-108
1.797146e-107
0
CR1L
-11.365167
-11.365167
1.158795
2.204925
494.362252
7.951324e-107
1.145735e-105
0
CEBPD
-11.145091
1.602761
-0.036129
2.902001
482.454019
3.026958e-104
4.185780e-103
0
SP8
-11.180913
-11.180913
1.308208
2.113101
477.366432
3.832386e-103
5.209528e-102
0
CPNE8
-11.789643
0.809946
-0.775524
2.867771
461.385381
1.112622e-99
1.417707e-98
0
SOX8
-11.762601
-11.762601
-0.079794
2.081350
460.900518
1.417121e-99
1.798066e-98
0
EGR3
-10.838320
1.267103
1.796737
2.715622
441.429992
2.344481e-95
2.769764e-94
0
TNMD
-11.424248
-11.424248
0.458838
1.950109
441.172775
2.665475e-95
3.142799e-94
0
SPON1
-11.067239
-11.067239
0.987076
1.848919
429.450846
9.232904e-93
1.041565e-91
0
SLC27A2
-11.491278
-11.491278
0.009423
1.858953
425.456535
6.771301e-92
7.581709e-91
0
SLCO5A1
-11.594968
-0.200657
0.874684
2.660573
409.654839
1.793855e-88
1.887085e-87
0
VIPR1
-11.305458
-11.305458
0.147051
1.731361
406.139609
1.035733e-87
1.072640e-86
0
PLIN2
-10.820458
1.107431
1.439851
2.531766
403.988732
3.028009e-87
3.109084e-86
0
PARD3B
-11.203304
-11.203304
0.218544
1.663814
395.953948
1.665512e-85
1.646717e-84
0
BEND6
-11.161487
-11.161487
0.314171
1.652496
395.183341
2.446031e-85
2.414450e-84
0
ERAP1
-11.363301
-11.363301
-0.299452
1.650015
394.521479
3.402681e-85
3.350483e-84
0
GRIA3
-11.241155
-11.241155
0.045862
1.638949
392.317321
1.021499e-84
9.911924e-84
0
CNTFR
-11.318841
0.235529
0.836421
2.524169
386.071293
2.301856e-83
2.172106e-82
0
SLC10A4
-11.028984
-11.028984
0.385695
1.560395
381.607164
2.132746e-82
1.979842e-81
0
CYP27A1
-11.610612
0.093257
-0.242183
2.447008
375.567416
4.335184e-81
3.933123e-80
0
CSF1
-11.009662
-11.009662
0.261565
1.495500
372.099518
2.443703e-80
2.187305e-79
0
EXOG
-10.969232
-10.969232
-0.029743
1.361055
353.100917
3.178516e-76
2.643917e-75
0
NUP62CL
-11.422266
0.052553
-0.540530
2.194061
342.222699
7.205153e-74
5.742593e-73
0
PCDHGC3
-11.148685
-0.327086
0.620974
2.102311
328.821400
5.742157e-71
4.315787e-70
0
EGR2
-10.869229
0.639334
0.303449
1.997976
324.396475
5.212082e-70
3.826337e-69
0
IL17RB
-11.093504
0.061843
0.012494
1.967727
313.503087
1.188802e-67
8.408477e-67
0
NEURL2
-10.862253
0.329810
0.390332
1.930991
310.946811
4.250413e-67
2.985237e-66
0
IRF1
-11.011890
0.122855
-0.082426
1.890139
306.187842
4.555127e-66
3.145868e-65
0
TNFRSF25
-10.973480
0.008988
0.141392
1.822972
300.989375
6.076299e-65
4.132227e-64
0
Code
allGenes_series = pd.read_csv('./tables/all_bkg_genes.csv' )
allGenes = allGenes_series['0' ].tolist()
Here we load the dictionary that associates to each GO term its genes:
Code
GO2gene = load_object('./../../../../data/GO2gene_complete.pickle' )
Markers of cluster
We filter genes for the cluster under investigation based on the p-value adjusted that we then convert in -log(p-value adjusted):
Code
markers = markers[markers.FDR < 0.01 ]
markers['-log10(FDR)' ] = - np.log10(markers.FDR)
markers = markers.replace(np.inf, markers[markers['-log10(FDR)' ] != np.inf]['-log10(FDR)' ].max ())
markers
logFC.celltypes_leveledhEGCLC
logFC.celltypes_leveledhPGCLC
logFC.celltypes_levelediMeLC
logCPM
LR
PValue
FDR
clusters
-log10(FDR)
CXCR4
-11.172324
7.322004
0.804604
8.483506
3951.022844
0.000000e+00
0.000000e+00
0
319.933672
FGFR3
-11.002481
4.398754
2.306831
5.388332
1732.310638
0.000000e+00
0.000000e+00
0
319.933672
ZNF703
-11.000985
6.292293
2.813550
7.291843
3595.651687
0.000000e+00
0.000000e+00
0
319.933672
MORC4
-10.933221
4.944114
0.058590
5.774889
2161.147921
0.000000e+00
0.000000e+00
0
319.933672
KLF4
-10.795117
6.570967
2.325890
7.348871
3722.692849
0.000000e+00
0.000000e+00
0
319.933672
ZYG11A
-11.832004
3.313038
0.512455
5.101194
1489.567078
1.086944e-322
1.165007e-320
0
319.933672
VENTX
-11.812635
3.107866
-0.353142
4.823694
1344.925545
2.628696e-291
2.206450e-289
0
288.656306
WLS
-11.822818
1.642495
3.544481
4.750616
1214.091220
6.424869e-263
4.191180e-261
0
260.377664
PLAAT5
-11.705724
2.655509
2.072497
4.575988
1072.617287
3.174551e-232
1.686024e-230
0
229.773136
FOSB
-10.759300
2.303687
3.482677
3.882798
786.119387
4.431795e-170
1.445512e-168
0
167.839978
SMAD6
-11.133028
2.567099
0.787656
3.688101
711.711762
6.059761e-154
1.668241e-152
0
151.777741
SFMBT2
-11.301186
2.386058
0.675743
3.661201
694.787689
2.833088e-150
7.424793e-149
0
148.129316
CPEB2
-10.992624
2.551735
1.528002
3.619749
680.196740
4.130942e-147
1.048281e-145
0
144.979522
MYB
-11.219598
2.138289
0.235243
3.345501
607.049020
2.987073e-131
6.181695e-130
0
129.208892
HTR1D
-11.940274
-0.000371
1.866980
3.475240
597.634960
3.281710e-129
6.543250e-128
0
127.184207
FLRT2
-10.661898
-10.661898
2.594260
2.418750
577.838936
6.418928e-125
1.217163e-123
0
122.914651
POMC
-11.098515
1.907786
0.257493
3.080348
531.334862
7.718626e-115
1.255375e-113
0
112.901227
TGFBR3L
-12.285498
-12.285498
-1.135665
2.311232
516.979118
9.975280e-112
1.559027e-110
0
109.807146
CD1D
-11.036460
1.641919
1.741398
3.076538
514.638767
3.207336e-111
4.941932e-110
0
109.306103
PLEKHA7
-11.399815
1.461997
0.103141
3.049715
504.889676
4.159066e-109
6.224597e-108
0
107.205889
IRF9
-11.055282
1.867515
0.647282
2.966910
504.741364
4.478554e-109
6.686079e-108
0
107.174828
CUZD1
-11.654184
-11.654184
0.713973
2.272964
502.745032
1.212771e-108
1.797146e-107
0
106.745417
CR1L
-11.365167
-11.365167
1.158795
2.204925
494.362252
7.951324e-107
1.145735e-105
0
104.940916
CEBPD
-11.145091
1.602761
-0.036129
2.902001
482.454019
3.026958e-104
4.185780e-103
0
102.378224
SP8
-11.180913
-11.180913
1.308208
2.113101
477.366432
3.832386e-103
5.209528e-102
0
101.283202
CPNE8
-11.789643
0.809946
-0.775524
2.867771
461.385381
1.112622e-99
1.417707e-98
0
97.848413
SOX8
-11.762601
-11.762601
-0.079794
2.081350
460.900518
1.417121e-99
1.798066e-98
0
97.745194
EGR3
-10.838320
1.267103
1.796737
2.715622
441.429992
2.344481e-95
2.769764e-94
0
93.557557
TNMD
-11.424248
-11.424248
0.458838
1.950109
441.172775
2.665475e-95
3.142799e-94
0
93.502683
SPON1
-11.067239
-11.067239
0.987076
1.848919
429.450846
9.232904e-93
1.041565e-91
0
90.982313
SLC27A2
-11.491278
-11.491278
0.009423
1.858953
425.456535
6.771301e-92
7.581709e-91
0
90.120233
SLCO5A1
-11.594968
-0.200657
0.874684
2.660573
409.654839
1.793855e-88
1.887085e-87
0
86.724209
VIPR1
-11.305458
-11.305458
0.147051
1.731361
406.139609
1.035733e-87
1.072640e-86
0
85.969546
PLIN2
-10.820458
1.107431
1.439851
2.531766
403.988732
3.028009e-87
3.109084e-86
0
85.507367
PARD3B
-11.203304
-11.203304
0.218544
1.663814
395.953948
1.665512e-85
1.646717e-84
0
83.783381
BEND6
-11.161487
-11.161487
0.314171
1.652496
395.183341
2.446031e-85
2.414450e-84
0
83.617182
ERAP1
-11.363301
-11.363301
-0.299452
1.650015
394.521479
3.402681e-85
3.350483e-84
0
83.474893
GRIA3
-11.241155
-11.241155
0.045862
1.638949
392.317321
1.021499e-84
9.911924e-84
0
83.003842
CNTFR
-11.318841
0.235529
0.836421
2.524169
386.071293
2.301856e-83
2.172106e-82
0
81.663119
SLC10A4
-11.028984
-11.028984
0.385695
1.560395
381.607164
2.132746e-82
1.979842e-81
0
80.703370
CYP27A1
-11.610612
0.093257
-0.242183
2.447008
375.567416
4.335184e-81
3.933123e-80
0
79.405262
CSF1
-11.009662
-11.009662
0.261565
1.495500
372.099518
2.443703e-80
2.187305e-79
0
78.660091
EXOG
-10.969232
-10.969232
-0.029743
1.361055
353.100917
3.178516e-76
2.643917e-75
0
74.577752
NUP62CL
-11.422266
0.052553
-0.540530
2.194061
342.222699
7.205153e-74
5.742593e-73
0
72.240892
PCDHGC3
-11.148685
-0.327086
0.620974
2.102311
328.821400
5.742157e-71
4.315787e-70
0
69.364940
EGR2
-10.869229
0.639334
0.303449
1.997976
324.396475
5.212082e-70
3.826337e-69
0
68.417217
IL17RB
-11.093504
0.061843
0.012494
1.967727
313.503087
1.188802e-67
8.408477e-67
0
66.075283
NEURL2
-10.862253
0.329810
0.390332
1.930991
310.946811
4.250413e-67
2.985237e-66
0
65.525021
IRF1
-11.011890
0.122855
-0.082426
1.890139
306.187842
4.555127e-66
3.145868e-65
0
64.502260
TNFRSF25
-10.973480
0.008988
0.141392
1.822972
300.989375
6.076299e-65
4.132227e-64
0
63.383816
All regulated
Code
all_sign = markers.index.tolist()
allSelected = allGenes_series['0' ].isin(all_sign).astype('int' ).tolist()
topGO
All significant
Code
%% R - i allSelected - i allGenes
allGenes_v <- c(allSelected)
#print(allGenes_v)
names(allGenes_v) <- allGenes
allGenes_v <- unlist(allGenes_v)
geneNames <- c(allGenes)
ann_org_BP <- topGO::annFUN.org(whichOnto= 'BP' , feasibleGenes= names(allGenes_v),
mapping= 'org.Hs.eg' , ID= 'symbol' )
ann_org_MF <- topGO::annFUN.org(whichOnto= 'MF' , feasibleGenes= names(allGenes_v),
mapping= 'org.Hs.eg' , ID= 'symbol' )
ann_org_CC <- topGO::annFUN.org(whichOnto= 'CC' , feasibleGenes= names(allGenes_v),
mapping= 'org.Hs.eg' , ID= 'symbol' )
selection <- function(allScores){return (as .logical(allScores))}
Code
%% R
#print(lapply(ann_org_BP, count_genes))
GOdata <- new("topGOdata" ,
ontology= "BP" ,
allGenes= allGenes_v,
annot= annFUN.GO2genes,
GO2genes= ann_org_BP,
geneSel = selection,
nodeSize= 10 )
Code
%% R - o results
results <- runTest(GOdata, algorithm= "weight01" ,statistic= "fisher" )
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata
genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([10903, 48, 10, 1519], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight= results,
orderBy= "weight" , topNodes= len (scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores' : scores_py, 'GO.ID' : score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID' , right_on = 'GO.ID' )
results_table_py
GO.ID
Term
Annotated
Significant
Expected
weight
Scores
0
GO:0045444
fat cell differentiation
170
6
0.75
9.4e-05
0.000094
1
GO:1902895
positive regulation of miRNA transcripti...
39
3
0.17
0.00065
0.000655
2
GO:0007155
cell adhesion
913
15
4.02
0.00074
0.000737
3
GO:0019883
antigen processing and presentation of e...
10
2
0.04
0.00084
0.000835
4
GO:0045944
positive regulation of transcription by ...
873
11
3.84
0.00117
0.001172
...
...
...
...
...
...
...
...
5697
GO:2001256
regulation of store-operated calcium ent...
11
0
0.05
1.00000
1.000000
5698
GO:2001257
regulation of cation channel activity
104
0
0.46
1.00000
1.000000
5699
GO:2001258
negative regulation of cation channel ac...
23
0
0.10
1.00000
1.000000
5700
GO:2001259
positive regulation of cation channel ac...
43
0
0.19
1.00000
1.000000
5701
GO:2001267
regulation of cysteine-type endopeptidas...
13
0
0.06
1.00000
1.000000
5702 rows × 7 columns
Code
results_table_py = results_table_py[results_table_py['Scores' ] < 0.05 ]
results_table_py = results_table_py[results_table_py['Annotated' ] < 200 ]
results_table_py = results_table_py[results_table_py['Annotated' ] > 15 ]
Code
results_table_py['-log10(pvalue)' ] = - np.log10(results_table_py.Scores)
results_table_py['Significant/Annotated' ] = results_table_py['Significant' ] / results_table_py['Annotated' ]
Code
intTable(results_table_py, folder = folder, fileName = 'GO_BP_all.xlsx' , save = True )
Code
%% R - i folder
Res <- GenTable(GOdata, weight= results,
orderBy= "weight" , topNodes= length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID" , "Term" , "Annotated" , "Significant" , "Expected" , "Statistics" )
Res$ER <- Res$Significant / Res$Expected
image = bubbleplot(Res, Ont = 'BP' , fillCol = 'forestgreen' )
ggsave(file = paste0(folder, "TopGO_results_BP.pdf" ), plot= image, width= 12 , height= 4 )
bubbleplot(Res, Ont = 'BP' , fillCol = 'forestgreen' )
Code
%% R - i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 15 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
ggsave(file = paste0(folder, "Genes_in_Term_results_BP.pdf" ), plot= image, width= 12 , height= 4 )
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 15 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
Code
%% R - i markers - i folder
saveGenesInTerm(Res, GOdata, nterms = 20 , path = paste0(folder,'GO_BP_genesInTerm_all.xlsx' ), SE = markers)
Code
%% R
GOdata <- new("topGOdata" ,
ontology= "MF" ,
allGenes= allGenes_v,
annot= annFUN.GO2genes,
GO2genes= ann_org_MF,
geneSel = selection,
nodeSize= 10 )
Code
%% R - o results
results <- runTest(GOdata, algorithm= "weight01" ,statistic= "fisher" )
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata
genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([11203, 48, 10, 246], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight= results,
orderBy= "weight" , topNodes= len (scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores' : scores_py, 'GO.ID' : score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID' , right_on = 'GO.ID' )
results_table_py = results_table_py[results_table_py['Scores' ] < 0.05 ]
results_table_py = results_table_py[results_table_py['Annotated' ] < 200 ]
results_table_py = results_table_py[results_table_py['Annotated' ] > 15 ]
intTable(results_table_py, folder = folder, fileName = 'GO_MF_all.xlsx' , save = True )
Code
%% R
Res <- GenTable(GOdata, weight= results,
orderBy= "weight" , topNodes= length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID" , "Term" , "Annotated" , "Significant" , "Expected" , "Statistics" )
Res$ER <- Res$Significant / Res$Expected
image = bubbleplot(Res, Ont = 'MF' , fillCol = 'forestgreen' )
ggsave(file = paste0(folder, "TopGO_results_MF.pdf" ), plot= image, width= 12 , height= 4 )
bubbleplot(Res, Ont = 'MF' , fillCol = 'forestgreen' )
Code
%% R - i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 15 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
ggsave(file = paste0(folder, "Genes_in_Term_results_MF.pdf" ), plot= image, width= 12 , height= 4 )
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 15 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
Code
%% R - i markers - i folder
saveGenesInTerm(Res, GOdata, nterms = 20 , path = paste0(folder,'GO_MF_genesInTerm_all.xlsx' ), SE = markers)
Code
%% R
GOdata <- new("topGOdata" ,
ontology= "CC" ,
allGenes= allGenes_v,
annot= annFUN.GO2genes,
GO2genes= ann_org_CC,
geneSel = selection,
nodeSize= 10 )
Code
%% R - o results
results <- runTest(GOdata, algorithm= "weight01" ,statistic= "fisher" )
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata
genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([11323, 50, 10, 167], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight= results,
orderBy= "weight" , topNodes= len (scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores' : scores_py, 'GO.ID' : score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID' , right_on = 'GO.ID' )
Code
results_table_py = results_table_py[results_table_py['Scores' ] < 0.05 ]
results_table_py = results_table_py[results_table_py['Annotated' ] < 200 ]
results_table_py = results_table_py[results_table_py['Annotated' ] > 15 ]
intTable(results_table_py, folder = folder, fileName = 'GO_CC_all.xlsx' , save = True )
Code
%% R
Res <- GenTable(GOdata, weight= results,
orderBy= "weight" , topNodes= length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID" , "Term" , "Annotated" , "Significant" , "Expected" , "Statistics" )
Res$ER <- Res$Significant / Res$Expected
image = bubbleplot(Res, Ont = 'CC' , fillCol = 'forestgreen' )
ggsave(file = paste0(folder, "TopGO_results_CC.pdf" ), plot= image, width= 12 , height= 4 )
bubbleplot(Res, Ont = 'CC' , fillCol = 'forestgreen' )
Code
%% R - i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 12 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
ggsave(file = paste0(folder, "Genes:_in_Term_results_CC.pdf" ), plot= image, width= 12 , height= 4 )
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 12 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
Code
%% R - i markers - i folder
saveGenesInTerm(Res, GOdata, nterms = 20 , path = paste0(folder,'GO_CC_genesInTerm_all.xlsx' ), SE = markers)
Reactome
Code
curated = msigdb[msigdb['collection' ].isin(['reactome_pathways' ])]
curated = curated[~ curated.duplicated(['geneset' , 'genesymbol' ])]
aggregated = curated[["geneset" , "genesymbol" ]].groupby("geneset" ).count().rename(columns= {"genesymbol" : "gene_count" })
curated = curated[~ curated.geneset.isin(aggregated[aggregated.gene_count > 200 ].index.tolist())].copy()
curated = curated[~ curated.geneset.isin(aggregated[aggregated.gene_count < 15 ].index.tolist())].copy()
Code
rank = pd.DataFrame(markers['-log10(FDR)' ])
rank_copy = rank.copy()
rank_copy['pval' ] = markers.loc[rank.index].FDR
Code
-log10(FDR)
pval
CXCR4
319.933672
0.000000e+00
FGFR3
319.933672
0.000000e+00
ZNF703
319.933672
0.000000e+00
MORC4
319.933672
0.000000e+00
KLF4
319.933672
0.000000e+00
ZYG11A
319.933672
1.165007e-320
VENTX
288.656306
2.206450e-289
WLS
260.377664
4.191180e-261
PLAAT5
229.773136
1.686024e-230
FOSB
167.839978
1.445512e-168
SMAD6
151.777741
1.668241e-152
SFMBT2
148.129316
7.424793e-149
CPEB2
144.979522
1.048281e-145
MYB
129.208892
6.181695e-130
HTR1D
127.184207
6.543250e-128
FLRT2
122.914651
1.217163e-123
POMC
112.901227
1.255375e-113
TGFBR3L
109.807146
1.559027e-110
CD1D
109.306103
4.941932e-110
PLEKHA7
107.205889
6.224597e-108
IRF9
107.174828
6.686079e-108
CUZD1
106.745417
1.797146e-107
CR1L
104.940916
1.145735e-105
CEBPD
102.378224
4.185780e-103
SP8
101.283202
5.209528e-102
CPNE8
97.848413
1.417707e-98
SOX8
97.745194
1.798066e-98
EGR3
93.557557
2.769764e-94
TNMD
93.502683
3.142799e-94
SPON1
90.982313
1.041565e-91
SLC27A2
90.120233
7.581709e-91
SLCO5A1
86.724209
1.887085e-87
VIPR1
85.969546
1.072640e-86
PLIN2
85.507367
3.109084e-86
PARD3B
83.783381
1.646717e-84
BEND6
83.617182
2.414450e-84
ERAP1
83.474893
3.350483e-84
GRIA3
83.003842
9.911924e-84
CNTFR
81.663119
2.172106e-82
SLC10A4
80.703370
1.979842e-81
CYP27A1
79.405262
3.933123e-80
CSF1
78.660091
2.187305e-79
EXOG
74.577752
2.643917e-75
NUP62CL
72.240892
5.742593e-73
PCDHGC3
69.364940
4.315787e-70
EGR2
68.417217
3.826337e-69
IL17RB
66.075283
8.408477e-67
NEURL2
65.525021
2.985237e-66
IRF1
64.502260
3.145868e-65
TNFRSF25
63.383816
4.132227e-64
Code
results_table_py = run_ora_catchErrors(mat= rank.T, net= curated, source= 'geneset' , target= 'genesymbol' , verbose= False , n_up= len (rank), n_bottom= 0 )
len (results_table_py)
No significant term was found
Code
intTable(results_table_py, folder = folder, fileName = 'Reactome_all.xlsx' , save = True )
Code
if len (results_table_py) > 0 :
results_table_py = getAnnGenes(results_table_py, GO2gene['reactome_pathways' ], rank_copy)
_, df = plotGenesInTerm(results = results_table_py, GO2gene = GO2gene['reactome_pathways' ], DEGs = rank_copy, n_top_terms = 10 , cmap = cmap_all)
Code
if len (results_table_py) > 0 :
intTable(df, folder = folder, fileName = 'genesInTerm_Reactome_all.xlsx' , save = True )
KEGG
Code
curated = msigdb[msigdb['collection' ].isin(['kegg_pathways' ])]
curated = curated[~ curated.duplicated(['geneset' , 'genesymbol' ])]
aggregated = curated[["geneset" , "genesymbol" ]].groupby("geneset" ).count().rename(columns= {"genesymbol" : "gene_count" })
curated = curated[~ curated.geneset.isin(aggregated[aggregated.gene_count > 200 ].index.tolist())].copy()
curated = curated[~ curated.geneset.isin(aggregated[aggregated.gene_count < 15 ].index.tolist())].copy()
Code
results_table_py = run_ora_catchErrors(mat= rank.T, net= curated, source= 'geneset' , target= 'genesymbol' , verbose= False , n_up= len (rank), n_bottom= 0 )
No significant term was found
Code
intTable(results_table_py, folder = folder, fileName = 'KEGG_all.xlsx' , save = True )
Code
if len (results_table_py) > 0 :
results_table_py = getAnnGenes(results_table_py, GO2gene['kegg_pathways' ], rank_copy)
_, df = plotGenesInTerm(results_table_py, GO2gene['kegg_pathways' ], rank_copy, n_top_terms = 10 , n_top_genes = 15 , cmap = cmap_all)
Code
if len (results_table_py) > 0 :
intTable(df, folder = folder, fileName = 'genesInTerm_KEGG_all.xlsx' , save = True )