1 Load environment

Code
import warnings

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import pandas as pd
import numpy as np
import random
import itertools

from tqdm import tqdm

import decoupler as dc
import sys

sys.setrecursionlimit(20000)

sys.path.append("./../../../../utilities_folder")
from utilities import load_object, intTable, plotGenesInTerm, getAnnGenes, run_ora_catchErrors

Set R environment with rpy2:

Code
import rpy2.rinterface_lib.callbacks
import anndata2ri
import logging

from rpy2.robjects import pandas2ri
import rpy2.robjects as ro

sc.settings.verbosity = 0
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

pandas2ri.activate()
anndata2ri.activate()

%load_ext rpy2.ipython

Set up of graphical parameters for Python plots:

Code
%matplotlib inline
sc.set_figure_params(dpi = 300, fontsize = 20)

plt.rcParams['svg.fonttype'] = 'none'

cmap_up = sns.light_palette("red", as_cmap=True)
cmap_down = sns.light_palette("blue", as_cmap=True)
cmap_all = sns.light_palette("seagreen", as_cmap=True)

Set up of graphical parameters for R plots:

Code
default_units = 'in' 
default_res = 300
default_width = 10
default_height = 9

import rpy2
old_setup_graphics = rpy2.ipython.rmagic.RMagics.setup_graphics

def new_setup_graphics(self, args):
    if getattr(args, 'units') is not None:
        if args.units != default_units:
            return old_setup_graphics(self, args)
    args.units = default_units
    if getattr(args, 'res') is None:
        args.res = default_res
    if getattr(args, 'width') is None:
        args.width = default_width
    if getattr(args, 'height') is None:
        args.height = default_height        
    return old_setup_graphics(self, args)


rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics

Here the cell were we inject the parameters using Quarto renderer:

Code
# Injected Parameters
N = 3
Code
# Injected Parameters
N = 8

Import R libraries:

Code
%%R
source('./../../../../utilities_folder/GO_helper.r')
loc <- './../../../../R_loc' # pointing to the renv environment

.libPaths(loc)

library('topGO')
library('org.Hs.eg.db')
library(dplyr)
library(ggplot2)

Set output folders:

Code
output_folder = './'
folder = './tables/cluster_' + str(N) + '/'

2 Load data

Here we load the dataframe:

Code
markers = pd.read_excel(folder + 'genes_in_cluster_' + str(N) + '.xlsx', index_col = 0)
markers
logFC.celltypes_leveledhEGCLC logFC.celltypes_leveledhPGCLC logFC.celltypes_levelediMeLC logCPM LR PValue FDR clusters
DLL1 -11.228872 3.547954 -11.228872 4.575054 1533.970731 0.000000e+00 0.000000e+00 8
ALOX5 -11.178824 4.410238 -11.178824 5.439678 2245.284054 0.000000e+00 0.000000e+00 8
VSTM2B -11.122162 4.797431 -11.122162 5.792537 2511.051934 0.000000e+00 0.000000e+00 8
FGF16 -10.984056 3.985368 -10.984056 4.747048 1708.509557 0.000000e+00 0.000000e+00 8
PSTPIP2 -11.105241 3.087326 -11.105241 4.054911 1143.210227 1.536158e-247 8.738628e-246 8
EGFR -11.738471 2.489754 -11.738471 4.048274 1108.064238 6.479206e-240 3.534996e-238 8
RAI1 -11.129680 2.797734 -11.129680 3.668371 960.524308 6.583281e-208 3.015997e-206 8
C19orf85 -11.388111 2.127670 -11.388111 3.255547 793.364089 1.189597e-171 3.933534e-170 8
MAFB -12.466067 0.498430 -12.466067 3.313987 738.617005 8.873436e-160 2.610487e-158 8
CCDC163 -10.857963 2.563922 -10.857963 2.945123 735.019095 5.349467e-159 1.550958e-157 8
PDE10A -11.175752 2.150874 -11.175752 3.017974 727.061220 2.844354e-157 8.167652e-156 8
LDHD -11.960379 1.040000 -11.960379 2.997689 675.456270 4.404685e-146 1.101447e-144 8
NCALD -10.934065 2.173099 -10.934065 2.769786 665.571840 6.124912e-144 1.494255e-142 8
GRASP -11.670363 1.321981 -11.670363 2.916703 661.788511 4.049534e-143 9.819507e-142 8
GPR157 -11.215989 1.261274 -11.215989 2.453822 556.254469 3.063602e-120 5.423660e-119 8
CNTNAP1 -12.219804 -0.561166 -12.219804 2.444274 534.517747 1.576439e-115 2.602751e-114 8
CEACAM19 -11.120045 1.210802 -11.120045 2.218539 515.532181 2.053606e-111 3.180572e-110 8
NKAIN4 -10.870906 1.448048 -10.870906 2.126111 505.760666 2.692995e-109 4.035458e-108 8
NCR3LG1 -12.075949 -0.763917 -12.075949 2.292377 501.060931 2.810288e-108 4.154173e-107 8
SLC22A5 -11.578276 0.170129 -11.578276 2.132203 474.607993 1.517762e-102 2.042342e-101 8
MYO15B -11.396701 0.537205 -11.396701 2.076268 474.538440 1.571358e-102 2.112095e-101 8
NTRK3 -11.368088 0.347978 -11.368088 1.999594 457.076437 9.549802e-99 1.200275e-97 8
FAM53A -11.332611 0.395974 -11.332611 1.960959 454.251702 3.908746e-98 4.866875e-97 8
HEATR5A -11.094525 0.736528 -11.094525 1.902362 452.504752 9.344365e-98 1.152728e-96 8
DUSP9 -11.376535 0.265029 -11.376535 1.962851 450.926894 2.053157e-97 2.517267e-96 8
ICAM1 -11.814895 -1.465547 -11.814895 1.841416 439.487508 6.178718e-95 7.256669e-94 8
ZNF433 -11.228434 0.396925 -11.228434 1.841026 439.185301 7.184112e-95 8.412771e-94 8
PUDP -10.807817 0.955786 -10.807817 1.727613 434.315764 8.153979e-94 9.365762e-93 8
AKR1E2 -10.936276 0.746199 -10.936276 1.712573 428.947245 1.186971e-92 1.336512e-91 8
SMOC2 -11.450998 -0.291171 -11.450998 1.800132 424.803003 9.381073e-92 1.044536e-90 8
IKZF2 -11.050744 0.516850 -11.050744 1.702677 423.554737 1.748534e-91 1.936130e-90 8
CRYL1 -11.076752 0.406209 -11.076752 1.753451 423.107069 2.186026e-91 2.416103e-90 8
BATF3 -11.071490 0.431127 -11.071490 1.736548 422.716472 2.656274e-91 2.927755e-90 8
ADAM33 -11.528694 -0.870812 -11.528694 1.703740 412.838404 3.665708e-89 3.886881e-88 8
CATSPERG -10.898724 0.530687 -10.898724 1.521126 402.098026 7.775084e-87 7.908842e-86 8
STX1A -11.018889 0.158636 -11.018889 1.527679 393.569782 5.469630e-85 5.354973e-84 8
LYNX1 -11.172865 -0.186924 -11.172865 1.548000 393.068396 7.023559e-85 6.865128e-84 8
HES4 -11.378759 -1.021150 -11.378759 1.538287 390.663797 2.330148e-84 2.246487e-83 8
ANKRD37 -11.149876 -0.330301 -11.149876 1.487399 383.743503 7.349283e-83 6.875561e-82 8
RIPPLY3 -10.987020 0.139181 -10.987020 1.374208 381.036912 2.834293e-82 2.627028e-81 8
CUL9 -10.986253 -0.066411 -10.986253 1.386002 374.538964 7.240039e-81 6.548771e-80 8
CDK3 -10.801797 0.293696 -10.801797 1.329396 372.648605 1.858379e-80 1.667124e-79 8
MUC1 -10.961591 -0.143128 -10.961591 1.344446 367.871679 2.012074e-79 1.781042e-78 8
ARIH2OS -10.922176 -0.081686 -10.922176 1.322430 366.152519 4.741798e-79 4.163555e-78 8
ERMAP -10.898001 -0.397724 -10.898001 1.211818 351.049294 8.840368e-76 7.307915e-75 8
TRIM72 -10.698973 -0.353230 -10.698973 1.014147 331.780927 1.313276e-71 9.976739e-71 8
Code
allGenes_series = pd.read_csv('./tables/all_bkg_genes.csv')
allGenes = allGenes_series['0'].tolist()

Here we load the dictionary that associates to each GO term its genes:

Code
GO2gene = load_object('./../../../../data/GO2gene_complete.pickle')

3 Markers of cluster

We filter genes for the cluster under investigation based on the p-value adjusted that we then convert in -log(p-value adjusted):

Code

markers = markers[markers.FDR < 0.01]
markers['-log10(FDR)'] = -np.log10(markers.FDR)
markers = markers.replace(np.inf, markers[markers['-log10(FDR)'] != np.inf]['-log10(FDR)'].max())
markers
logFC.celltypes_leveledhEGCLC logFC.celltypes_leveledhPGCLC logFC.celltypes_levelediMeLC logCPM LR PValue FDR clusters -log10(FDR)
DLL1 -11.228872 3.547954 -11.228872 4.575054 1533.970731 0.000000e+00 0.000000e+00 8 245.058557
ALOX5 -11.178824 4.410238 -11.178824 5.439678 2245.284054 0.000000e+00 0.000000e+00 8 245.058557
VSTM2B -11.122162 4.797431 -11.122162 5.792537 2511.051934 0.000000e+00 0.000000e+00 8 245.058557
FGF16 -10.984056 3.985368 -10.984056 4.747048 1708.509557 0.000000e+00 0.000000e+00 8 245.058557
PSTPIP2 -11.105241 3.087326 -11.105241 4.054911 1143.210227 1.536158e-247 8.738628e-246 8 245.058557
EGFR -11.738471 2.489754 -11.738471 4.048274 1108.064238 6.479206e-240 3.534996e-238 8 237.451611
RAI1 -11.129680 2.797734 -11.129680 3.668371 960.524308 6.583281e-208 3.015997e-206 8 205.520569
C19orf85 -11.388111 2.127670 -11.388111 3.255547 793.364089 1.189597e-171 3.933534e-170 8 169.405217
MAFB -12.466067 0.498430 -12.466067 3.313987 738.617005 8.873436e-160 2.610487e-158 8 157.583279
CCDC163 -10.857963 2.563922 -10.857963 2.945123 735.019095 5.349467e-159 1.550958e-157 8 156.809400
PDE10A -11.175752 2.150874 -11.175752 3.017974 727.061220 2.844354e-157 8.167652e-156 8 155.087903
LDHD -11.960379 1.040000 -11.960379 2.997689 675.456270 4.404685e-146 1.101447e-144 8 143.958037
NCALD -10.934065 2.173099 -10.934065 2.769786 665.571840 6.124912e-144 1.494255e-142 8 141.825575
GRASP -11.670363 1.321981 -11.670363 2.916703 661.788511 4.049534e-143 9.819507e-142 8 141.007910
GPR157 -11.215989 1.261274 -11.215989 2.453822 556.254469 3.063602e-120 5.423660e-119 8 118.265708
CNTNAP1 -12.219804 -0.561166 -12.219804 2.444274 534.517747 1.576439e-115 2.602751e-114 8 113.584567
CEACAM19 -11.120045 1.210802 -11.120045 2.218539 515.532181 2.053606e-111 3.180572e-110 8 109.497495
NKAIN4 -10.870906 1.448048 -10.870906 2.126111 505.760666 2.692995e-109 4.035458e-108 8 107.394107
NCR3LG1 -12.075949 -0.763917 -12.075949 2.292377 501.060931 2.810288e-108 4.154173e-107 8 106.381515
SLC22A5 -11.578276 0.170129 -11.578276 2.132203 474.607993 1.517762e-102 2.042342e-101 8 100.689871
MYO15B -11.396701 0.537205 -11.396701 2.076268 474.538440 1.571358e-102 2.112095e-101 8 100.675286
NTRK3 -11.368088 0.347978 -11.368088 1.999594 457.076437 9.549802e-99 1.200275e-97 8 96.920719
FAM53A -11.332611 0.395974 -11.332611 1.960959 454.251702 3.908746e-98 4.866875e-97 8 96.312750
HEATR5A -11.094525 0.736528 -11.094525 1.902362 452.504752 9.344365e-98 1.152728e-96 8 95.938273
DUSP9 -11.376535 0.265029 -11.376535 1.962851 450.926894 2.053157e-97 2.517267e-96 8 95.599071
ICAM1 -11.814895 -1.465547 -11.814895 1.841416 439.487508 6.178718e-95 7.256669e-94 8 93.139263
ZNF433 -11.228434 0.396925 -11.228434 1.841026 439.185301 7.184112e-95 8.412771e-94 8 93.075061
PUDP -10.807817 0.955786 -10.807817 1.727613 434.315764 8.153979e-94 9.365762e-93 8 92.028457
AKR1E2 -10.936276 0.746199 -10.936276 1.712573 428.947245 1.186971e-92 1.336512e-91 8 90.874027
SMOC2 -11.450998 -0.291171 -11.450998 1.800132 424.803003 9.381073e-92 1.044536e-90 8 89.981076
IKZF2 -11.050744 0.516850 -11.050744 1.702677 423.554737 1.748534e-91 1.936130e-90 8 89.713065
CRYL1 -11.076752 0.406209 -11.076752 1.753451 423.107069 2.186026e-91 2.416103e-90 8 89.616885
BATF3 -11.071490 0.431127 -11.071490 1.736548 422.716472 2.656274e-91 2.927755e-90 8 89.533465
ADAM33 -11.528694 -0.870812 -11.528694 1.703740 412.838404 3.665708e-89 3.886881e-88 8 87.410399
CATSPERG -10.898724 0.530687 -10.898724 1.521126 402.098026 7.775084e-87 7.908842e-86 8 85.101887
STX1A -11.018889 0.158636 -11.018889 1.527679 393.569782 5.469630e-85 5.354973e-84 8 83.271243
LYNX1 -11.172865 -0.186924 -11.172865 1.548000 393.068396 7.023559e-85 6.865128e-84 8 83.163351
HES4 -11.378759 -1.021150 -11.378759 1.538287 390.663797 2.330148e-84 2.246487e-83 8 82.648496
ANKRD37 -11.149876 -0.330301 -11.149876 1.487399 383.743503 7.349283e-83 6.875561e-82 8 81.162692
RIPPLY3 -10.987020 0.139181 -10.987020 1.374208 381.036912 2.834293e-82 2.627028e-81 8 80.580535
CUL9 -10.986253 -0.066411 -10.986253 1.386002 374.538964 7.240039e-81 6.548771e-80 8 79.183840
CDK3 -10.801797 0.293696 -10.801797 1.329396 372.648605 1.858379e-80 1.667124e-79 8 78.778032
MUC1 -10.961591 -0.143128 -10.961591 1.344446 367.871679 2.012074e-79 1.781042e-78 8 77.749326
ARIH2OS -10.922176 -0.081686 -10.922176 1.322430 366.152519 4.741798e-79 4.163555e-78 8 77.380536
ERMAP -10.898001 -0.397724 -10.898001 1.211818 351.049294 8.840368e-76 7.307915e-75 8 74.136206
TRIM72 -10.698973 -0.353230 -10.698973 1.014147 331.780927 1.313276e-71 9.976739e-71 8 70.001011

3.0.1 All regulated

Code
all_sign = markers.index.tolist()
allSelected = allGenes_series['0'].isin(all_sign).astype('int').tolist()

4 topGO

4.1 All significant

Code
%%R -i allSelected -i allGenes

allGenes_v <- c(allSelected)
#print(allGenes_v)
names(allGenes_v) <- allGenes
allGenes_v <- unlist(allGenes_v)

geneNames <- c(allGenes)

ann_org_BP <- topGO::annFUN.org(whichOnto='BP', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

ann_org_MF <- topGO::annFUN.org(whichOnto='MF', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

ann_org_CC <- topGO::annFUN.org(whichOnto='CC', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

selection <- function(allScores){return (as.logical(allScores))}
Code
%%R
#print(lapply(ann_org_BP, count_genes))

GOdata <- new("topGOdata",
  ontology="BP",
  allGenes=allGenes_v,
  annot=annFUN.GO2genes,
  GO2genes=ann_org_BP,
  geneSel = selection,
  nodeSize=10)
Code
%%R -o results

results <- runTest(GOdata, algorithm="weight01",statistic="fisher")
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata

genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([10903,    38,    10,  1350], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight=results,
        orderBy="weight", topNodes=len(scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores': scores_py, 'GO.ID': score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID', right_on = 'GO.ID')
results_table_py
GO.ID Term Annotated Significant Expected weight Scores
0 GO:0048665 neuron fate specification 13 2 0.05 0.0009 0.000901
1 GO:0061042 vascular wound healing 13 2 0.05 0.0009 0.000901
2 GO:1900746 regulation of vascular endothelial growt... 15 2 0.05 0.0012 0.001207
3 GO:2001028 positive regulation of endothelial cell ... 15 2 0.05 0.0012 0.001207
4 GO:0051154 negative regulation of striated muscle c... 24 2 0.08 0.0031 0.003110
... ... ... ... ... ... ... ...
5697 GO:2001256 regulation of store-operated calcium ent... 11 0 0.04 1.0000 1.000000
5698 GO:2001257 regulation of cation channel activity 104 0 0.36 1.0000 1.000000
5699 GO:2001258 negative regulation of cation channel ac... 23 0 0.08 1.0000 1.000000
5700 GO:2001259 positive regulation of cation channel ac... 43 0 0.15 1.0000 1.000000
5701 GO:2001267 regulation of cysteine-type endopeptidas... 13 0 0.05 1.0000 1.000000

5702 rows × 7 columns

Code
results_table_py = results_table_py[results_table_py['Scores'] < 0.05]
results_table_py = results_table_py[results_table_py['Annotated'] < 200]
results_table_py = results_table_py[results_table_py['Annotated'] > 15]
Code
results_table_py['-log10(pvalue)'] = - np.log10(results_table_py.Scores)
results_table_py['Significant/Annotated'] = results_table_py['Significant'] / results_table_py['Annotated']
Code
intTable(results_table_py, folder = folder, fileName = 'GO_BP_all.xlsx', save = True)
Code
%%R -i folder
Res <- GenTable(GOdata, weight=results,
        orderBy="weight", topNodes=length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "Statistics")
Res$ER <- Res$Significant / Res$Expected

image = bubbleplot(Res, Ont = 'BP', fillCol = 'forestgreen')
ggsave(file=paste0(folder, "TopGO_results_BP.pdf"), plot=image, width=12, height=4)

bubbleplot(Res, Ont = 'BP', fillCol = 'forestgreen')

Code
%%R -i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

ggsave(file=paste0(folder, "Genes_in_Term_results_BP.pdf"), plot=image, width=12, height=4)

plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

Code
%%R -i markers -i folder
saveGenesInTerm(Res, GOdata, nterms = 20, path = paste0(folder,'GO_BP_genesInTerm_all.xlsx'), SE = markers)
Code
%%R

GOdata <- new("topGOdata",
  ontology="MF",
  allGenes=allGenes_v,
  annot=annFUN.GO2genes,
  GO2genes=ann_org_MF,
  geneSel = selection,
  nodeSize=10)
Code
%%R -o results

results <- runTest(GOdata, algorithm="weight01",statistic="fisher")
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata

genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([11203,    37,    10,   189], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight=results,
        orderBy="weight", topNodes=len(scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores': scores_py, 'GO.ID': score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID', right_on = 'GO.ID')
results_table_py = results_table_py[results_table_py['Scores'] < 0.05]
results_table_py = results_table_py[results_table_py['Annotated'] < 200]
results_table_py = results_table_py[results_table_py['Annotated'] > 15]

intTable(results_table_py, folder = folder, fileName = 'GO_MF_all.xlsx', save = True)
Code
%%R
Res <- GenTable(GOdata, weight=results,
        orderBy="weight", topNodes=length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "Statistics")
Res$ER <- Res$Significant / Res$Expected

image = bubbleplot(Res, Ont = 'MF', fillCol = 'forestgreen')

ggsave(file=paste0(folder, "TopGO_results_MF.pdf"), plot=image, width=12, height=4)

bubbleplot(Res, Ont = 'MF', fillCol = 'forestgreen')

Code
%%R -i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

ggsave(file=paste0(folder, "Genes_in_Term_results_MF.pdf"), plot=image, width=12, height=4)

plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

Code
%%R -i markers -i folder
saveGenesInTerm(Res, GOdata, nterms = 20, path = paste0(folder,'GO_MF_genesInTerm_all.xlsx'), SE = markers)
Code
%%R

GOdata <- new("topGOdata",
  ontology="CC",
  allGenes=allGenes_v,
  annot=annFUN.GO2genes,
  GO2genes=ann_org_CC,
  geneSel = selection,
  nodeSize=10)
Code
%%R -o results

results <- runTest(GOdata, algorithm="weight01",statistic="fisher")
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata

genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([11323,    43,    10,   202], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight=results,
        orderBy="weight", topNodes=len(scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores': scores_py, 'GO.ID': score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID', right_on = 'GO.ID')
Code
results_table_py = results_table_py[results_table_py['Scores'] < 0.05]
results_table_py = results_table_py[results_table_py['Annotated'] < 200]
results_table_py = results_table_py[results_table_py['Annotated'] > 15]

intTable(results_table_py, folder = folder, fileName = 'GO_CC_all.xlsx', save = True)
Code
%%R
Res <- GenTable(GOdata, weight=results,
        orderBy="weight", topNodes=length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "Statistics")
Res$ER <- Res$Significant / Res$Expected
image = bubbleplot(Res, Ont = 'CC', fillCol = 'forestgreen')

ggsave(file=paste0(folder, "TopGO_results_CC.pdf"), plot=image, width=12, height=4)

bubbleplot(Res, Ont = 'CC', fillCol = 'forestgreen')

Code
%%R -i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=12, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

ggsave(file=paste0(folder, "Genes:_in_Term_results_CC.pdf"), plot=image, width=12, height=4)

plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=12, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

Code
%%R -i markers -i folder
saveGenesInTerm(Res, GOdata, nterms = 20, path = paste0(folder,'GO_CC_genesInTerm_all.xlsx'), SE = markers)

4.1.0.1 Reactome

Code
curated = msigdb[msigdb['collection'].isin(['reactome_pathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]

aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 200].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()
Code
rank = pd.DataFrame(markers['-log10(FDR)'])

rank_copy = rank.copy()
rank_copy['pval'] = markers.loc[rank.index].FDR
Code
rank_copy
-log10(FDR) pval
DLL1 245.058557 0.000000e+00
ALOX5 245.058557 0.000000e+00
VSTM2B 245.058557 0.000000e+00
FGF16 245.058557 0.000000e+00
PSTPIP2 245.058557 8.738628e-246
EGFR 237.451611 3.534996e-238
RAI1 205.520569 3.015997e-206
C19orf85 169.405217 3.933534e-170
MAFB 157.583279 2.610487e-158
CCDC163 156.809400 1.550958e-157
PDE10A 155.087903 8.167652e-156
LDHD 143.958037 1.101447e-144
NCALD 141.825575 1.494255e-142
GRASP 141.007910 9.819507e-142
GPR157 118.265708 5.423660e-119
CNTNAP1 113.584567 2.602751e-114
CEACAM19 109.497495 3.180572e-110
NKAIN4 107.394107 4.035458e-108
NCR3LG1 106.381515 4.154173e-107
SLC22A5 100.689871 2.042342e-101
MYO15B 100.675286 2.112095e-101
NTRK3 96.920719 1.200275e-97
FAM53A 96.312750 4.866875e-97
HEATR5A 95.938273 1.152728e-96
DUSP9 95.599071 2.517267e-96
ICAM1 93.139263 7.256669e-94
ZNF433 93.075061 8.412771e-94
PUDP 92.028457 9.365762e-93
AKR1E2 90.874027 1.336512e-91
SMOC2 89.981076 1.044536e-90
IKZF2 89.713065 1.936130e-90
CRYL1 89.616885 2.416103e-90
BATF3 89.533465 2.927755e-90
ADAM33 87.410399 3.886881e-88
CATSPERG 85.101887 7.908842e-86
STX1A 83.271243 5.354973e-84
LYNX1 83.163351 6.865128e-84
HES4 82.648496 2.246487e-83
ANKRD37 81.162692 6.875561e-82
RIPPLY3 80.580535 2.627028e-81
CUL9 79.183840 6.548771e-80
CDK3 78.778032 1.667124e-79
MUC1 77.749326 1.781042e-78
ARIH2OS 77.380536 4.163555e-78
ERMAP 74.136206 7.307915e-75
TRIM72 70.001011 9.976739e-71
Code
results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
len(results_table_py)
No significant term was found
0
Code
intTable(results_table_py, folder = folder, fileName = 'Reactome_all.xlsx', save = True)
Code
if len(results_table_py) > 0:
    results_table_py = getAnnGenes(results_table_py, GO2gene['reactome_pathways'], rank_copy)
    _, df = plotGenesInTerm(results = results_table_py, GO2gene = GO2gene['reactome_pathways'], DEGs = rank_copy, n_top_terms = 10, cmap = cmap_all)
Code
if len(results_table_py) > 0:
    intTable(df, folder = folder, fileName = 'genesInTerm_Reactome_all.xlsx', save = True)

4.1.0.2 KEGG

Code
curated = msigdb[msigdb['collection'].isin(['kegg_pathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]

aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 200].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()
Code
results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
No significant term was found
Code
intTable(results_table_py, folder = folder, fileName = 'KEGG_all.xlsx', save = True)
Code
if len(results_table_py) > 0:
    results_table_py = getAnnGenes(results_table_py, GO2gene['kegg_pathways'], rank_copy)
    _, df = plotGenesInTerm(results_table_py, GO2gene['kegg_pathways'], rank_copy, n_top_terms = 10, n_top_genes = 15, cmap = cmap_all)
Code
if len(results_table_py) > 0:
    intTable(df, folder = folder, fileName = 'genesInTerm_KEGG_all.xlsx', save = True)