Load environment
Code
import warnings
warnings.filterwarnings("ignore" )
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import pandas as pd
import numpy as np
import random
import itertools
from tqdm import tqdm
import decoupler as dc
import sys
sys.setrecursionlimit(20000 )
sys.path.append("./../../../../utilities_folder" )
from utilities import load_object, intTable, plotGenesInTerm, getAnnGenes, run_ora_catchErrors
Set R environment with rpy2:
Code
import rpy2.rinterface_lib.callbacks
import anndata2ri
import logging
from rpy2.robjects import pandas2ri
import rpy2.robjects as ro
sc.settings.verbosity = 0
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
pandas2ri.activate()
anndata2ri.activate()
% load_ext rpy2.ipython
Set up of graphical parameters for Python plots:
Code
% matplotlib inline
sc.set_figure_params(dpi = 300 , fontsize = 20 )
plt.rcParams['svg.fonttype' ] = 'none'
cmap_up = sns.light_palette("red" , as_cmap= True )
cmap_down = sns.light_palette("blue" , as_cmap= True )
cmap_all = sns.light_palette("seagreen" , as_cmap= True )
Set up of graphical parameters for R plots:
Code
default_units = 'in'
default_res = 300
default_width = 10
default_height = 9
import rpy2
old_setup_graphics = rpy2.ipython.rmagic.RMagics.setup_graphics
def new_setup_graphics(self , args):
if getattr (args, 'units' ) is not None :
if args.units != default_units:
return old_setup_graphics(self , args)
args.units = default_units
if getattr (args, 'res' ) is None :
args.res = default_res
if getattr (args, 'width' ) is None :
args.width = default_width
if getattr (args, 'height' ) is None :
args.height = default_height
return old_setup_graphics(self , args)
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics
Here the cell were we inject the parameters using Quarto renderer:
Code
# Injected Parameters
N = 3
Code
# Injected Parameters
N = 14
Import R libraries:
Code
%% R
source('./../../../../utilities_folder/GO_helper.r' )
loc <- './../../../../R_loc' # pointing to the renv environment
.libPaths(loc)
library('topGO' )
library('org.Hs.eg.db' )
library(dplyr)
library(ggplot2)
Set output folders:
Code
output_folder = './'
folder = './tables/cluster_' + str (N) + '/'
Load data
Here we load the dataframe:
Code
markers = pd.read_excel(folder + 'genes_in_cluster_' + str (N) + '.xlsx' , index_col = 0 )
markers
logFC.celltypes_leveledhEGCLC
logFC.celltypes_leveledhPGCLC
logFC.celltypes_levelediMeLC
logCPM
LR
PValue
FDR
clusters
KRTDAP
-12.016842
-12.016842
-12.016842
1.785090
554.411579
7.685864e-120
1.352689e-118
14
PVALB
-11.661893
-11.661893
-11.661893
1.446665
492.726668
1.798396e-106
2.575912e-105
14
CYGB
-11.590367
-11.590367
-11.590367
1.388522
481.503036
4.865001e-104
6.712024e-103
14
BCHE
-11.552081
-11.552081
-11.552081
1.338340
474.309595
1.761424e-102
2.362276e-101
14
NFIA
-11.543503
-11.543503
-11.543503
1.338488
473.452072
2.701975e-102
3.599535e-101
14
NPY5R
-11.522267
-11.522267
-11.522267
1.332806
470.944021
9.443850e-102
1.244287e-100
14
NPW
-11.516360
-11.516360
-11.516360
1.313345
469.078150
2.395850e-101
3.136029e-100
14
GNMT
-11.505465
-11.505465
-11.505465
1.299169
467.082914
6.483327e-101
8.431134e-100
14
FIBCD1
-11.444936
-11.444936
-11.444936
1.242440
458.622018
4.416779e-99
5.568760e-98
14
CCDC78
-11.407576
-11.407576
-11.407576
1.215570
453.788973
4.923766e-98
6.105368e-97
14
AURKC
-11.389224
-11.389224
-11.389224
1.180732
450.515009
2.521530e-97
3.085212e-96
14
BHLHE41
-11.355354
-11.355354
-11.355354
1.165658
446.563530
1.810559e-96
2.181941e-95
14
SLC16A12
-11.336495
-11.336495
-11.336495
1.159348
444.463544
5.161781e-96
6.164862e-95
14
FAM3B
-11.253109
-11.253109
-11.253109
1.081410
433.167816
1.445677e-93
1.652615e-92
14
SLC30A3
-11.240689
-11.240689
-11.240689
1.059236
431.063008
4.131157e-93
4.700121e-92
14
CA7
-11.213118
-11.213118
-11.213118
1.024725
427.018972
3.105887e-92
3.490633e-91
14
C1QTNF6
-11.175776
-11.175776
-11.175776
0.989791
422.055403
3.693889e-91
4.056519e-90
14
DDX60
-11.165692
-11.165692
-11.165692
0.990941
421.148868
5.805916e-91
6.341075e-90
14
MCOLN2
-11.165931
-11.165931
-11.165931
0.983494
420.869763
6.673202e-91
7.275063e-90
14
KCNQ4
-11.157494
-11.157494
-11.157494
0.975465
419.749288
1.166987e-90
1.271084e-89
14
CST6
-11.144914
-11.144914
-11.144914
0.968273
418.274329
2.435504e-90
2.640772e-89
14
ALOX12
-11.132350
-11.132350
-11.132350
0.948100
416.282664
6.577212e-90
7.074039e-89
14
REXO5
-11.126199
-11.126199
-11.126199
0.942958
415.499184
9.722216e-90
1.042858e-88
14
HRH3
-11.115619
-11.115619
-11.115619
0.952246
414.882838
1.322155e-89
1.414423e-88
14
ARMC4
-11.105176
-11.105176
-11.105176
0.931578
413.075683
3.256548e-89
3.459146e-88
14
HRC
-11.092823
-11.092823
-11.092823
0.911069
411.104743
8.703784e-89
9.196437e-88
14
DHDH
-11.091171
-11.091171
-11.091171
0.909455
410.886547
9.704519e-89
1.024480e-87
14
RASGRP1
-11.083438
-11.083438
-11.083438
0.917490
410.487358
1.184260e-88
1.247996e-87
14
SHISA6
-11.054033
-11.054033
-11.054033
0.888521
406.606404
8.206021e-88
8.513126e-87
14
MCTP2
-11.056345
-11.056345
-11.056345
0.872171
406.173908
1.018164e-87
1.055356e-86
14
CLIC6
-11.041824
-11.041824
-11.041824
0.879920
405.137714
1.707153e-87
1.758881e-86
14
CFP
-11.040070
-11.040070
-11.040070
0.846821
403.674026
3.542634e-87
3.634379e-86
14
DPP4
-11.026833
-11.026833
-11.026833
0.861772
403.037524
4.866312e-87
4.971093e-86
14
CDKL2
-11.028050
-11.028050
-11.028050
0.849949
402.683933
5.804861e-87
5.914749e-86
14
HCLS1
-11.023099
-11.023099
-11.023099
0.819798
401.057680
1.306332e-86
1.323199e-85
14
HMX2
-11.003578
-11.003578
-11.003578
0.865132
401.044914
1.314676e-86
1.330528e-85
14
GRIN1
-11.013697
-11.013697
-11.013697
0.837658
400.880668
1.426908e-86
1.442896e-85
14
FBXL2
-11.009009
-11.009009
-11.009009
0.848432
400.874333
1.431424e-86
1.446244e-85
14
ALS2CL
-11.013210
-11.013210
-11.013210
0.834624
400.717361
1.547995e-86
1.560083e-85
14
MICB
-11.008766
-11.008766
-11.008766
0.831651
400.192631
2.011084e-86
2.023390e-85
14
KRBOX4
-11.008902
-11.008902
-11.008902
0.826818
400.016898
2.195307e-86
2.206890e-85
14
GUCA1B
-10.986123
-10.986123
-10.986123
0.805985
397.116645
9.326200e-86
9.259089e-85
14
AZU1
-10.955768
-10.955768
-10.955768
0.765337
392.777428
8.120450e-85
7.917933e-84
14
ADAMTS3
-10.945374
-10.945374
-10.945374
0.789280
392.756470
8.205772e-85
7.994633e-84
14
ADORA1
-10.920705
-10.920705
-10.920705
0.773319
389.907541
3.397690e-84
3.249600e-83
14
PLA2G4A
-10.924555
-10.924555
-10.924555
0.751729
389.418145
4.336939e-84
4.134733e-83
14
TMEM249
-10.830970
-10.830970
-10.830970
0.663328
377.661781
1.525539e-81
1.398857e-80
14
AL135905.2
-10.811029
-10.811029
-10.811029
0.637471
374.921455
5.982800e-81
5.419739e-80
14
CCT6B
-10.729469
-10.729469
-10.729469
0.560234
364.884183
8.925023e-79
7.808094e-78
14
NPAS4
-10.669784
-10.669784
-10.669784
0.486103
357.006658
4.533972e-77
3.835185e-76
14
IL20RB
-10.613298
-10.613298
-10.613298
0.439829
350.493393
1.166381e-75
9.622046e-75
14
KCNJ14
-10.584749
-10.584749
-10.584749
0.409276
346.975703
6.737820e-75
5.482987e-74
14
FUT5
-10.539339
-10.539339
-10.539339
0.366784
341.637204
9.647441e-74
7.673839e-73
14
SPINK1
-10.454414
-10.454414
-10.454414
0.276592
331.417874
1.573822e-71
1.194851e-70
14
Code
allGenes_series = pd.read_csv('./tables/all_bkg_genes.csv' )
allGenes = allGenes_series['0' ].tolist()
Here we load the dictionary that associates to each GO term its genes:
Code
GO2gene = load_object('./../../../../data/GO2gene_complete.pickle' )
Markers of cluster
We filter genes for the cluster under investigation based on the p-value adjusted that we then convert in -log(p-value adjusted):
Code
markers = markers[markers.FDR < 0.01 ]
markers['-log10(FDR)' ] = - np.log10(markers.FDR)
markers = markers.replace(np.inf, markers[markers['-log10(FDR)' ] != np.inf]['-log10(FDR)' ].max ())
markers
logFC.celltypes_leveledhEGCLC
logFC.celltypes_leveledhPGCLC
logFC.celltypes_levelediMeLC
logCPM
LR
PValue
FDR
clusters
-log10(FDR)
KRTDAP
-12.016842
-12.016842
-12.016842
1.785090
554.411579
7.685864e-120
1.352689e-118
14
117.868802
PVALB
-11.661893
-11.661893
-11.661893
1.446665
492.726668
1.798396e-106
2.575912e-105
14
104.589069
CYGB
-11.590367
-11.590367
-11.590367
1.388522
481.503036
4.865001e-104
6.712024e-103
14
102.173146
BCHE
-11.552081
-11.552081
-11.552081
1.338340
474.309595
1.761424e-102
2.362276e-101
14
100.626669
NFIA
-11.543503
-11.543503
-11.543503
1.338488
473.452072
2.701975e-102
3.599535e-101
14
100.443754
NPY5R
-11.522267
-11.522267
-11.522267
1.332806
470.944021
9.443850e-102
1.244287e-100
14
99.905080
NPW
-11.516360
-11.516360
-11.516360
1.313345
469.078150
2.395850e-101
3.136029e-100
14
99.503620
GNMT
-11.505465
-11.505465
-11.505465
1.299169
467.082914
6.483327e-101
8.431134e-100
14
99.074114
FIBCD1
-11.444936
-11.444936
-11.444936
1.242440
458.622018
4.416779e-99
5.568760e-98
14
97.254242
CCDC78
-11.407576
-11.407576
-11.407576
1.215570
453.788973
4.923766e-98
6.105368e-97
14
96.214288
AURKC
-11.389224
-11.389224
-11.389224
1.180732
450.515009
2.521530e-97
3.085212e-96
14
95.510715
BHLHE41
-11.355354
-11.355354
-11.355354
1.165658
446.563530
1.810559e-96
2.181941e-95
14
94.661157
SLC16A12
-11.336495
-11.336495
-11.336495
1.159348
444.463544
5.161781e-96
6.164862e-95
14
94.210077
FAM3B
-11.253109
-11.253109
-11.253109
1.081410
433.167816
1.445677e-93
1.652615e-92
14
91.781828
SLC30A3
-11.240689
-11.240689
-11.240689
1.059236
431.063008
4.131157e-93
4.700121e-92
14
91.327891
CA7
-11.213118
-11.213118
-11.213118
1.024725
427.018972
3.105887e-92
3.490633e-91
14
90.457096
C1QTNF6
-11.175776
-11.175776
-11.175776
0.989791
422.055403
3.693889e-91
4.056519e-90
14
89.391846
DDX60
-11.165692
-11.165692
-11.165692
0.990941
421.148868
5.805916e-91
6.341075e-90
14
89.197837
MCOLN2
-11.165931
-11.165931
-11.165931
0.983494
420.869763
6.673202e-91
7.275063e-90
14
89.138163
KCNQ4
-11.157494
-11.157494
-11.157494
0.975465
419.749288
1.166987e-90
1.271084e-89
14
88.895826
CST6
-11.144914
-11.144914
-11.144914
0.968273
418.274329
2.435504e-90
2.640772e-89
14
88.578269
ALOX12
-11.132350
-11.132350
-11.132350
0.948100
416.282664
6.577212e-90
7.074039e-89
14
88.150333
REXO5
-11.126199
-11.126199
-11.126199
0.942958
415.499184
9.722216e-90
1.042858e-88
14
87.981775
HRH3
-11.115619
-11.115619
-11.115619
0.952246
414.882838
1.322155e-89
1.414423e-88
14
87.849421
ARMC4
-11.105176
-11.105176
-11.105176
0.931578
413.075683
3.256548e-89
3.459146e-88
14
87.461031
HRC
-11.092823
-11.092823
-11.092823
0.911069
411.104743
8.703784e-89
9.196437e-88
14
87.036380
DHDH
-11.091171
-11.091171
-11.091171
0.909455
410.886547
9.704519e-89
1.024480e-87
14
86.989497
RASGRP1
-11.083438
-11.083438
-11.083438
0.917490
410.487358
1.184260e-88
1.247996e-87
14
86.903787
SHISA6
-11.054033
-11.054033
-11.054033
0.888521
406.606404
8.206021e-88
8.513126e-87
14
86.069911
MCTP2
-11.056345
-11.056345
-11.056345
0.872171
406.173908
1.018164e-87
1.055356e-86
14
85.976601
CLIC6
-11.041824
-11.041824
-11.041824
0.879920
405.137714
1.707153e-87
1.758881e-86
14
85.754764
CFP
-11.040070
-11.040070
-11.040070
0.846821
403.674026
3.542634e-87
3.634379e-86
14
85.439570
DPP4
-11.026833
-11.026833
-11.026833
0.861772
403.037524
4.866312e-87
4.971093e-86
14
85.303548
CDKL2
-11.028050
-11.028050
-11.028050
0.849949
402.683933
5.804861e-87
5.914749e-86
14
85.228064
HCLS1
-11.023099
-11.023099
-11.023099
0.819798
401.057680
1.306332e-86
1.323199e-85
14
84.878375
HMX2
-11.003578
-11.003578
-11.003578
0.865132
401.044914
1.314676e-86
1.330528e-85
14
84.875976
GRIN1
-11.013697
-11.013697
-11.013697
0.837658
400.880668
1.426908e-86
1.442896e-85
14
84.840765
FBXL2
-11.009009
-11.009009
-11.009009
0.848432
400.874333
1.431424e-86
1.446244e-85
14
84.839758
ALS2CL
-11.013210
-11.013210
-11.013210
0.834624
400.717361
1.547995e-86
1.560083e-85
14
84.806852
MICB
-11.008766
-11.008766
-11.008766
0.831651
400.192631
2.011084e-86
2.023390e-85
14
84.693920
KRBOX4
-11.008902
-11.008902
-11.008902
0.826818
400.016898
2.195307e-86
2.206890e-85
14
84.656219
GUCA1B
-10.986123
-10.986123
-10.986123
0.805985
397.116645
9.326200e-86
9.259089e-85
14
84.033432
AZU1
-10.955768
-10.955768
-10.955768
0.765337
392.777428
8.120450e-85
7.917933e-84
14
83.101388
ADAMTS3
-10.945374
-10.945374
-10.945374
0.789280
392.756470
8.205772e-85
7.994633e-84
14
83.097201
ADORA1
-10.920705
-10.920705
-10.920705
0.773319
389.907541
3.397690e-84
3.249600e-83
14
82.488170
PLA2G4A
-10.924555
-10.924555
-10.924555
0.751729
389.418145
4.336939e-84
4.134733e-83
14
82.383553
TMEM249
-10.830970
-10.830970
-10.830970
0.663328
377.661781
1.525539e-81
1.398857e-80
14
79.854227
AL135905.2
-10.811029
-10.811029
-10.811029
0.637471
374.921455
5.982800e-81
5.419739e-80
14
79.266022
CCT6B
-10.729469
-10.729469
-10.729469
0.560234
364.884183
8.925023e-79
7.808094e-78
14
77.107455
NPAS4
-10.669784
-10.669784
-10.669784
0.486103
357.006658
4.533972e-77
3.835185e-76
14
75.416214
IL20RB
-10.613298
-10.613298
-10.613298
0.439829
350.493393
1.166381e-75
9.622046e-75
14
74.016733
KCNJ14
-10.584749
-10.584749
-10.584749
0.409276
346.975703
6.737820e-75
5.482987e-74
14
73.260983
FUT5
-10.539339
-10.539339
-10.539339
0.366784
341.637204
9.647441e-74
7.673839e-73
14
72.114987
SPINK1
-10.454414
-10.454414
-10.454414
0.276592
331.417874
1.573822e-71
1.194851e-70
14
69.922686
All regulated
Code
all_sign = markers.index.tolist()
allSelected = allGenes_series['0' ].isin(all_sign).astype('int' ).tolist()
topGO
All significant
Code
%% R - i allSelected - i allGenes
allGenes_v <- c(allSelected)
#print(allGenes_v)
names(allGenes_v) <- allGenes
allGenes_v <- unlist(allGenes_v)
geneNames <- c(allGenes)
ann_org_BP <- topGO::annFUN.org(whichOnto= 'BP' , feasibleGenes= names(allGenes_v),
mapping= 'org.Hs.eg' , ID= 'symbol' )
ann_org_MF <- topGO::annFUN.org(whichOnto= 'MF' , feasibleGenes= names(allGenes_v),
mapping= 'org.Hs.eg' , ID= 'symbol' )
ann_org_CC <- topGO::annFUN.org(whichOnto= 'CC' , feasibleGenes= names(allGenes_v),
mapping= 'org.Hs.eg' , ID= 'symbol' )
selection <- function(allScores){return (as .logical(allScores))}
Code
%% R
#print(lapply(ann_org_BP, count_genes))
GOdata <- new("topGOdata" ,
ontology= "BP" ,
allGenes= allGenes_v,
annot= annFUN.GO2genes,
GO2genes= ann_org_BP,
geneSel = selection,
nodeSize= 10 )
Code
%% R - o results
results <- runTest(GOdata, algorithm= "weight01" ,statistic= "fisher" )
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata
genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([10903, 50, 10, 1556], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight= results,
orderBy= "weight" , topNodes= len (scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores' : scores_py, 'GO.ID' : score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID' , right_on = 'GO.ID' )
results_table_py
GO.ID
Term
Annotated
Significant
Expected
weight
Scores
0
GO:0051956
negative regulation of amino acid transp...
10
3
0.05
1.1e-05
0.000011
1
GO:0014048
regulation of glutamate secretion
13
3
0.06
2.5e-05
0.000025
2
GO:1903792
negative regulation of anion transport
14
3
0.06
3.2e-05
0.000032
3
GO:0032228
regulation of synaptic transmission, GAB...
25
4
0.11
3.7e-05
0.000037
4
GO:0002673
regulation of acute inflammatory respons...
18
3
0.08
7.1e-05
0.000071
...
...
...
...
...
...
...
...
5697
GO:2001251
negative regulation of chromosome organi...
88
0
0.40
1.00000
1.000000
5698
GO:2001252
positive regulation of chromosome organi...
93
0
0.43
1.00000
1.000000
5699
GO:2001258
negative regulation of cation channel ac...
23
0
0.11
1.00000
1.000000
5700
GO:2001259
positive regulation of cation channel ac...
43
0
0.20
1.00000
1.000000
5701
GO:2001267
regulation of cysteine-type endopeptidas...
13
0
0.06
1.00000
1.000000
5702 rows × 7 columns
Code
results_table_py = results_table_py[results_table_py['Scores' ] < 0.05 ]
results_table_py = results_table_py[results_table_py['Annotated' ] < 200 ]
results_table_py = results_table_py[results_table_py['Annotated' ] > 15 ]
Code
results_table_py['-log10(pvalue)' ] = - np.log10(results_table_py.Scores)
results_table_py['Significant/Annotated' ] = results_table_py['Significant' ] / results_table_py['Annotated' ]
Code
intTable(results_table_py, folder = folder, fileName = 'GO_BP_all.xlsx' , save = True )
Code
%% R - i folder
Res <- GenTable(GOdata, weight= results,
orderBy= "weight" , topNodes= length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID" , "Term" , "Annotated" , "Significant" , "Expected" , "Statistics" )
Res$ER <- Res$Significant / Res$Expected
image = bubbleplot(Res, Ont = 'BP' , fillCol = 'forestgreen' )
ggsave(file = paste0(folder, "TopGO_results_BP.pdf" ), plot= image, width= 12 , height= 4 )
bubbleplot(Res, Ont = 'BP' , fillCol = 'forestgreen' )
Code
%% R - i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 15 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
ggsave(file = paste0(folder, "Genes_in_Term_results_BP.pdf" ), plot= image, width= 12 , height= 4 )
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 15 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
Code
%% R - i markers - i folder
saveGenesInTerm(Res, GOdata, nterms = 20 , path = paste0(folder,'GO_BP_genesInTerm_all.xlsx' ), SE = markers)
Code
%% R
GOdata <- new("topGOdata" ,
ontology= "MF" ,
allGenes= allGenes_v,
annot= annFUN.GO2genes,
GO2genes= ann_org_MF,
geneSel = selection,
nodeSize= 10 )
Code
%% R - o results
results <- runTest(GOdata, algorithm= "weight01" ,statistic= "fisher" )
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata
genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([11203, 49, 10, 260], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight= results,
orderBy= "weight" , topNodes= len (scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores' : scores_py, 'GO.ID' : score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID' , right_on = 'GO.ID' )
results_table_py = results_table_py[results_table_py['Scores' ] < 0.05 ]
results_table_py = results_table_py[results_table_py['Annotated' ] < 200 ]
results_table_py = results_table_py[results_table_py['Annotated' ] > 15 ]
intTable(results_table_py, folder = folder, fileName = 'GO_MF_all.xlsx' , save = True )
Code
%% R
Res <- GenTable(GOdata, weight= results,
orderBy= "weight" , topNodes= length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID" , "Term" , "Annotated" , "Significant" , "Expected" , "Statistics" )
Res$ER <- Res$Significant / Res$Expected
image = bubbleplot(Res, Ont = 'MF' , fillCol = 'forestgreen' )
ggsave(file = paste0(folder, "TopGO_results_MF.pdf" ), plot= image, width= 12 , height= 4 )
bubbleplot(Res, Ont = 'MF' , fillCol = 'forestgreen' )
Code
%% R - i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 15 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
ggsave(file = paste0(folder, "Genes_in_Term_results_MF.pdf" ), plot= image, width= 12 , height= 4 )
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 15 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
Code
%% R - i markers - i folder
saveGenesInTerm(Res, GOdata, nterms = 20 , path = paste0(folder,'GO_MF_genesInTerm_all.xlsx' ), SE = markers)
Code
%% R
GOdata <- new("topGOdata" ,
ontology= "CC" ,
allGenes= allGenes_v,
annot= annFUN.GO2genes,
GO2genes= ann_org_CC,
geneSel = selection,
nodeSize= 10 )
Code
%% R - o results
results <- runTest(GOdata, algorithm= "weight01" ,statistic= "fisher" )
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata
genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([11323, 50, 10, 250], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight= results,
orderBy= "weight" , topNodes= len (scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores' : scores_py, 'GO.ID' : score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID' , right_on = 'GO.ID' )
Code
results_table_py = results_table_py[results_table_py['Scores' ] < 0.05 ]
results_table_py = results_table_py[results_table_py['Annotated' ] < 200 ]
results_table_py = results_table_py[results_table_py['Annotated' ] > 15 ]
intTable(results_table_py, folder = folder, fileName = 'GO_CC_all.xlsx' , save = True )
Code
%% R
Res <- GenTable(GOdata, weight= results,
orderBy= "weight" , topNodes= length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID" , "Term" , "Annotated" , "Significant" , "Expected" , "Statistics" )
Res$ER <- Res$Significant / Res$Expected
image = bubbleplot(Res, Ont = 'CC' , fillCol = 'forestgreen' )
ggsave(file = paste0(folder, "TopGO_results_CC.pdf" ), plot= image, width= 12 , height= 4 )
bubbleplot(Res, Ont = 'CC' , fillCol = 'forestgreen' )
Code
%% R - i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 12 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
ggsave(file = paste0(folder, "Genes:_in_Term_results_CC.pdf" ), plot= image, width= 12 , height= 4 )
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms= 12 , ngenes= 12 ,
fillCol= 'forestgreen' , log = TRUE)
Code
%% R - i markers - i folder
saveGenesInTerm(Res, GOdata, nterms = 20 , path = paste0(folder,'GO_CC_genesInTerm_all.xlsx' ), SE = markers)
Reactome
Code
curated = msigdb[msigdb['collection' ].isin(['reactome_pathways' ])]
curated = curated[~ curated.duplicated(['geneset' , 'genesymbol' ])]
aggregated = curated[["geneset" , "genesymbol" ]].groupby("geneset" ).count().rename(columns= {"genesymbol" : "gene_count" })
curated = curated[~ curated.geneset.isin(aggregated[aggregated.gene_count > 200 ].index.tolist())].copy()
curated = curated[~ curated.geneset.isin(aggregated[aggregated.gene_count < 15 ].index.tolist())].copy()
Code
rank = pd.DataFrame(markers['-log10(FDR)' ])
rank_copy = rank.copy()
rank_copy['pval' ] = markers.loc[rank.index].FDR
Code
-log10(FDR)
pval
KRTDAP
117.868802
1.352689e-118
PVALB
104.589069
2.575912e-105
CYGB
102.173146
6.712024e-103
BCHE
100.626669
2.362276e-101
NFIA
100.443754
3.599535e-101
NPY5R
99.905080
1.244287e-100
NPW
99.503620
3.136029e-100
GNMT
99.074114
8.431134e-100
FIBCD1
97.254242
5.568760e-98
CCDC78
96.214288
6.105368e-97
AURKC
95.510715
3.085212e-96
BHLHE41
94.661157
2.181941e-95
SLC16A12
94.210077
6.164862e-95
FAM3B
91.781828
1.652615e-92
SLC30A3
91.327891
4.700121e-92
CA7
90.457096
3.490633e-91
C1QTNF6
89.391846
4.056519e-90
DDX60
89.197837
6.341075e-90
MCOLN2
89.138163
7.275063e-90
KCNQ4
88.895826
1.271084e-89
CST6
88.578269
2.640772e-89
ALOX12
88.150333
7.074039e-89
REXO5
87.981775
1.042858e-88
HRH3
87.849421
1.414423e-88
ARMC4
87.461031
3.459146e-88
HRC
87.036380
9.196437e-88
DHDH
86.989497
1.024480e-87
RASGRP1
86.903787
1.247996e-87
SHISA6
86.069911
8.513126e-87
MCTP2
85.976601
1.055356e-86
CLIC6
85.754764
1.758881e-86
CFP
85.439570
3.634379e-86
DPP4
85.303548
4.971093e-86
CDKL2
85.228064
5.914749e-86
HCLS1
84.878375
1.323199e-85
HMX2
84.875976
1.330528e-85
GRIN1
84.840765
1.442896e-85
FBXL2
84.839758
1.446244e-85
ALS2CL
84.806852
1.560083e-85
MICB
84.693920
2.023390e-85
KRBOX4
84.656219
2.206890e-85
GUCA1B
84.033432
9.259089e-85
AZU1
83.101388
7.917933e-84
ADAMTS3
83.097201
7.994633e-84
ADORA1
82.488170
3.249600e-83
PLA2G4A
82.383553
4.134733e-83
TMEM249
79.854227
1.398857e-80
AL135905.2
79.266022
5.419739e-80
CCT6B
77.107455
7.808094e-78
NPAS4
75.416214
3.835185e-76
IL20RB
74.016733
9.622046e-75
KCNJ14
73.260983
5.482987e-74
FUT5
72.114987
7.673839e-73
SPINK1
69.922686
1.194851e-70
Code
results_table_py = run_ora_catchErrors(mat= rank.T, net= curated, source= 'geneset' , target= 'genesymbol' , verbose= False , n_up= len (rank), n_bottom= 0 )
len (results_table_py)
No significant term was found
Code
intTable(results_table_py, folder = folder, fileName = 'Reactome_all.xlsx' , save = True )
Code
if len (results_table_py) > 0 :
results_table_py = getAnnGenes(results_table_py, GO2gene['reactome_pathways' ], rank_copy)
_, df = plotGenesInTerm(results = results_table_py, GO2gene = GO2gene['reactome_pathways' ], DEGs = rank_copy, n_top_terms = 10 , cmap = cmap_all)
Code
if len (results_table_py) > 0 :
intTable(df, folder = folder, fileName = 'genesInTerm_Reactome_all.xlsx' , save = True )
KEGG
Code
curated = msigdb[msigdb['collection' ].isin(['kegg_pathways' ])]
curated = curated[~ curated.duplicated(['geneset' , 'genesymbol' ])]
aggregated = curated[["geneset" , "genesymbol" ]].groupby("geneset" ).count().rename(columns= {"genesymbol" : "gene_count" })
curated = curated[~ curated.geneset.isin(aggregated[aggregated.gene_count > 200 ].index.tolist())].copy()
curated = curated[~ curated.geneset.isin(aggregated[aggregated.gene_count < 15 ].index.tolist())].copy()
Code
results_table_py = run_ora_catchErrors(mat= rank.T, net= curated, source= 'geneset' , target= 'genesymbol' , verbose= False , n_up= len (rank), n_bottom= 0 )
No significant term was found
Code
intTable(results_table_py, folder = folder, fileName = 'KEGG_all.xlsx' , save = True )
Code
if len (results_table_py) > 0 :
results_table_py = getAnnGenes(results_table_py, GO2gene['kegg_pathways' ], rank_copy)
_, df = plotGenesInTerm(results_table_py, GO2gene['kegg_pathways' ], rank_copy, n_top_terms = 10 , n_top_genes = 15 , cmap = cmap_all)
Code
if len (results_table_py) > 0 :
intTable(df, folder = folder, fileName = 'genesInTerm_KEGG_all.xlsx' , save = True )