1 Load environment

Code

import warnings

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import pandas as pd
import numpy as np
import random
import itertools

from tqdm import tqdm

import decoupler as dc
import sys

sys.setrecursionlimit(20000)

sys.path.append("./../../../../utilities_folder")
from utilities import load_object, intTable, plotGenesInTerm, getAnnGenes, run_ora_catchErrors

Set R environment with rpy2:

Code

import rpy2.rinterface_lib.callbacks
import anndata2ri
import logging

from rpy2.robjects import pandas2ri
import rpy2.robjects as ro

sc.settings.verbosity = 0
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

pandas2ri.activate()
anndata2ri.activate()

%load_ext rpy2.ipython

Set up of graphical parameters for Python plots:

Code

%matplotlib inline
sc.set_figure_params(dpi = 300, fontsize = 20)

plt.rcParams['svg.fonttype'] = 'none'

cmap_up = sns.light_palette("red", as_cmap=True)
cmap_down = sns.light_palette("blue", as_cmap=True)
cmap_all = sns.light_palette("seagreen", as_cmap=True)

Set up of graphical parameters for R plots:

Code

default_units = 'in' 
default_res = 300
default_width = 10
default_height = 9

import rpy2
old_setup_graphics = rpy2.ipython.rmagic.RMagics.setup_graphics

def new_setup_graphics(self, args):
    if getattr(args, 'units') is not None:
        if args.units != default_units:
            return old_setup_graphics(self, args)
    args.units = default_units
    if getattr(args, 'res') is None:
        args.res = default_res
    if getattr(args, 'width') is None:
        args.width = default_width
    if getattr(args, 'height') is None:
        args.height = default_height        
    return old_setup_graphics(self, args)


rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics

Here the cell were we inject the parameters using Quarto renderer:

Code

# Injected Parameters
N = 3

Code

# Injected Parameters
N = 8

Import R libraries:

Code

%%R
source('./../../../../utilities_folder/GO_helper.r')
loc <- './../../../../R_loc' # pointing to the renv environment

.libPaths(loc)

library('topGO')
library('org.Hs.eg.db')
library(dplyr)
library(ggplot2)

Set output folders:

Code

output_folder = './'
folder = './tables/cluster_' + str(N) + '/'

2 Load data

Here we load the dataframe:

Code

markers = pd.read_excel(folder + 'genes_in_cluster_' + str(N) + '.xlsx', index_col = 0)
markers

	logFC.celltypes_leveledhEGCLC	logFC.celltypes_leveledhPGCLC	logFC.celltypes_levelediMeLC	logCPM	LR	PValue	FDR	clusters
DLL1	-11.228872	3.547954	-11.228872	4.575054	1533.970731	0.000000e+00	0.000000e+00	8
ALOX5	-11.178824	4.410238	-11.178824	5.439678	2245.284054	0.000000e+00	0.000000e+00	8
VSTM2B	-11.122162	4.797431	-11.122162	5.792537	2511.051934	0.000000e+00	0.000000e+00	8
FGF16	-10.984056	3.985368	-10.984056	4.747048	1708.509557	0.000000e+00	0.000000e+00	8
PSTPIP2	-11.105241	3.087326	-11.105241	4.054911	1143.210227	1.536158e-247	8.738628e-246	8
EGFR	-11.738471	2.489754	-11.738471	4.048274	1108.064238	6.479206e-240	3.534996e-238	8
RAI1	-11.129680	2.797734	-11.129680	3.668371	960.524308	6.583281e-208	3.015997e-206	8
C19orf85	-11.388111	2.127670	-11.388111	3.255547	793.364089	1.189597e-171	3.933534e-170	8
MAFB	-12.466067	0.498430	-12.466067	3.313987	738.617005	8.873436e-160	2.610487e-158	8
CCDC163	-10.857963	2.563922	-10.857963	2.945123	735.019095	5.349467e-159	1.550958e-157	8
PDE10A	-11.175752	2.150874	-11.175752	3.017974	727.061220	2.844354e-157	8.167652e-156	8
LDHD	-11.960379	1.040000	-11.960379	2.997689	675.456270	4.404685e-146	1.101447e-144	8
NCALD	-10.934065	2.173099	-10.934065	2.769786	665.571840	6.124912e-144	1.494255e-142	8
GRASP	-11.670363	1.321981	-11.670363	2.916703	661.788511	4.049534e-143	9.819507e-142	8
GPR157	-11.215989	1.261274	-11.215989	2.453822	556.254469	3.063602e-120	5.423660e-119	8
CNTNAP1	-12.219804	-0.561166	-12.219804	2.444274	534.517747	1.576439e-115	2.602751e-114	8
CEACAM19	-11.120045	1.210802	-11.120045	2.218539	515.532181	2.053606e-111	3.180572e-110	8
NKAIN4	-10.870906	1.448048	-10.870906	2.126111	505.760666	2.692995e-109	4.035458e-108	8
NCR3LG1	-12.075949	-0.763917	-12.075949	2.292377	501.060931	2.810288e-108	4.154173e-107	8
SLC22A5	-11.578276	0.170129	-11.578276	2.132203	474.607993	1.517762e-102	2.042342e-101	8
MYO15B	-11.396701	0.537205	-11.396701	2.076268	474.538440	1.571358e-102	2.112095e-101	8
NTRK3	-11.368088	0.347978	-11.368088	1.999594	457.076437	9.549802e-99	1.200275e-97	8
FAM53A	-11.332611	0.395974	-11.332611	1.960959	454.251702	3.908746e-98	4.866875e-97	8
HEATR5A	-11.094525	0.736528	-11.094525	1.902362	452.504752	9.344365e-98	1.152728e-96	8
DUSP9	-11.376535	0.265029	-11.376535	1.962851	450.926894	2.053157e-97	2.517267e-96	8
ICAM1	-11.814895	-1.465547	-11.814895	1.841416	439.487508	6.178718e-95	7.256669e-94	8
ZNF433	-11.228434	0.396925	-11.228434	1.841026	439.185301	7.184112e-95	8.412771e-94	8
PUDP	-10.807817	0.955786	-10.807817	1.727613	434.315764	8.153979e-94	9.365762e-93	8
AKR1E2	-10.936276	0.746199	-10.936276	1.712573	428.947245	1.186971e-92	1.336512e-91	8
SMOC2	-11.450998	-0.291171	-11.450998	1.800132	424.803003	9.381073e-92	1.044536e-90	8
IKZF2	-11.050744	0.516850	-11.050744	1.702677	423.554737	1.748534e-91	1.936130e-90	8
CRYL1	-11.076752	0.406209	-11.076752	1.753451	423.107069	2.186026e-91	2.416103e-90	8
BATF3	-11.071490	0.431127	-11.071490	1.736548	422.716472	2.656274e-91	2.927755e-90	8
ADAM33	-11.528694	-0.870812	-11.528694	1.703740	412.838404	3.665708e-89	3.886881e-88	8
CATSPERG	-10.898724	0.530687	-10.898724	1.521126	402.098026	7.775084e-87	7.908842e-86	8
STX1A	-11.018889	0.158636	-11.018889	1.527679	393.569782	5.469630e-85	5.354973e-84	8
LYNX1	-11.172865	-0.186924	-11.172865	1.548000	393.068396	7.023559e-85	6.865128e-84	8
HES4	-11.378759	-1.021150	-11.378759	1.538287	390.663797	2.330148e-84	2.246487e-83	8
ANKRD37	-11.149876	-0.330301	-11.149876	1.487399	383.743503	7.349283e-83	6.875561e-82	8
RIPPLY3	-10.987020	0.139181	-10.987020	1.374208	381.036912	2.834293e-82	2.627028e-81	8
CUL9	-10.986253	-0.066411	-10.986253	1.386002	374.538964	7.240039e-81	6.548771e-80	8
CDK3	-10.801797	0.293696	-10.801797	1.329396	372.648605	1.858379e-80	1.667124e-79	8
MUC1	-10.961591	-0.143128	-10.961591	1.344446	367.871679	2.012074e-79	1.781042e-78	8
ARIH2OS	-10.922176	-0.081686	-10.922176	1.322430	366.152519	4.741798e-79	4.163555e-78	8
ERMAP	-10.898001	-0.397724	-10.898001	1.211818	351.049294	8.840368e-76	7.307915e-75	8
TRIM72	-10.698973	-0.353230	-10.698973	1.014147	331.780927	1.313276e-71	9.976739e-71	8

Code

allGenes_series = pd.read_csv('./tables/all_bkg_genes.csv')
allGenes = allGenes_series['0'].tolist()

Here we load the dictionary that associates to each GO term its genes:

Code

GO2gene = load_object('./../../../../data/GO2gene_complete.pickle')

3 Markers of cluster

We filter genes for the cluster under investigation based on the p-value adjusted that we then convert in -log(p-value adjusted):

Code


markers = markers[markers.FDR < 0.01]
markers['-log10(FDR)'] = -np.log10(markers.FDR)
markers = markers.replace(np.inf, markers[markers['-log10(FDR)'] != np.inf]['-log10(FDR)'].max())
markers

	logFC.celltypes_leveledhEGCLC	logFC.celltypes_leveledhPGCLC	logFC.celltypes_levelediMeLC	logCPM	LR	PValue	FDR	clusters	-log10(FDR)
DLL1	-11.228872	3.547954	-11.228872	4.575054	1533.970731	0.000000e+00	0.000000e+00	8	245.058557
ALOX5	-11.178824	4.410238	-11.178824	5.439678	2245.284054	0.000000e+00	0.000000e+00	8	245.058557
VSTM2B	-11.122162	4.797431	-11.122162	5.792537	2511.051934	0.000000e+00	0.000000e+00	8	245.058557
FGF16	-10.984056	3.985368	-10.984056	4.747048	1708.509557	0.000000e+00	0.000000e+00	8	245.058557
PSTPIP2	-11.105241	3.087326	-11.105241	4.054911	1143.210227	1.536158e-247	8.738628e-246	8	245.058557
EGFR	-11.738471	2.489754	-11.738471	4.048274	1108.064238	6.479206e-240	3.534996e-238	8	237.451611
RAI1	-11.129680	2.797734	-11.129680	3.668371	960.524308	6.583281e-208	3.015997e-206	8	205.520569
C19orf85	-11.388111	2.127670	-11.388111	3.255547	793.364089	1.189597e-171	3.933534e-170	8	169.405217
MAFB	-12.466067	0.498430	-12.466067	3.313987	738.617005	8.873436e-160	2.610487e-158	8	157.583279
CCDC163	-10.857963	2.563922	-10.857963	2.945123	735.019095	5.349467e-159	1.550958e-157	8	156.809400
PDE10A	-11.175752	2.150874	-11.175752	3.017974	727.061220	2.844354e-157	8.167652e-156	8	155.087903
LDHD	-11.960379	1.040000	-11.960379	2.997689	675.456270	4.404685e-146	1.101447e-144	8	143.958037
NCALD	-10.934065	2.173099	-10.934065	2.769786	665.571840	6.124912e-144	1.494255e-142	8	141.825575
GRASP	-11.670363	1.321981	-11.670363	2.916703	661.788511	4.049534e-143	9.819507e-142	8	141.007910
GPR157	-11.215989	1.261274	-11.215989	2.453822	556.254469	3.063602e-120	5.423660e-119	8	118.265708
CNTNAP1	-12.219804	-0.561166	-12.219804	2.444274	534.517747	1.576439e-115	2.602751e-114	8	113.584567
CEACAM19	-11.120045	1.210802	-11.120045	2.218539	515.532181	2.053606e-111	3.180572e-110	8	109.497495
NKAIN4	-10.870906	1.448048	-10.870906	2.126111	505.760666	2.692995e-109	4.035458e-108	8	107.394107
NCR3LG1	-12.075949	-0.763917	-12.075949	2.292377	501.060931	2.810288e-108	4.154173e-107	8	106.381515
SLC22A5	-11.578276	0.170129	-11.578276	2.132203	474.607993	1.517762e-102	2.042342e-101	8	100.689871
MYO15B	-11.396701	0.537205	-11.396701	2.076268	474.538440	1.571358e-102	2.112095e-101	8	100.675286
NTRK3	-11.368088	0.347978	-11.368088	1.999594	457.076437	9.549802e-99	1.200275e-97	8	96.920719
FAM53A	-11.332611	0.395974	-11.332611	1.960959	454.251702	3.908746e-98	4.866875e-97	8	96.312750
HEATR5A	-11.094525	0.736528	-11.094525	1.902362	452.504752	9.344365e-98	1.152728e-96	8	95.938273
DUSP9	-11.376535	0.265029	-11.376535	1.962851	450.926894	2.053157e-97	2.517267e-96	8	95.599071
ICAM1	-11.814895	-1.465547	-11.814895	1.841416	439.487508	6.178718e-95	7.256669e-94	8	93.139263
ZNF433	-11.228434	0.396925	-11.228434	1.841026	439.185301	7.184112e-95	8.412771e-94	8	93.075061
PUDP	-10.807817	0.955786	-10.807817	1.727613	434.315764	8.153979e-94	9.365762e-93	8	92.028457
AKR1E2	-10.936276	0.746199	-10.936276	1.712573	428.947245	1.186971e-92	1.336512e-91	8	90.874027
SMOC2	-11.450998	-0.291171	-11.450998	1.800132	424.803003	9.381073e-92	1.044536e-90	8	89.981076
IKZF2	-11.050744	0.516850	-11.050744	1.702677	423.554737	1.748534e-91	1.936130e-90	8	89.713065
CRYL1	-11.076752	0.406209	-11.076752	1.753451	423.107069	2.186026e-91	2.416103e-90	8	89.616885
BATF3	-11.071490	0.431127	-11.071490	1.736548	422.716472	2.656274e-91	2.927755e-90	8	89.533465
ADAM33	-11.528694	-0.870812	-11.528694	1.703740	412.838404	3.665708e-89	3.886881e-88	8	87.410399
CATSPERG	-10.898724	0.530687	-10.898724	1.521126	402.098026	7.775084e-87	7.908842e-86	8	85.101887
STX1A	-11.018889	0.158636	-11.018889	1.527679	393.569782	5.469630e-85	5.354973e-84	8	83.271243
LYNX1	-11.172865	-0.186924	-11.172865	1.548000	393.068396	7.023559e-85	6.865128e-84	8	83.163351
HES4	-11.378759	-1.021150	-11.378759	1.538287	390.663797	2.330148e-84	2.246487e-83	8	82.648496
ANKRD37	-11.149876	-0.330301	-11.149876	1.487399	383.743503	7.349283e-83	6.875561e-82	8	81.162692
RIPPLY3	-10.987020	0.139181	-10.987020	1.374208	381.036912	2.834293e-82	2.627028e-81	8	80.580535
CUL9	-10.986253	-0.066411	-10.986253	1.386002	374.538964	7.240039e-81	6.548771e-80	8	79.183840
CDK3	-10.801797	0.293696	-10.801797	1.329396	372.648605	1.858379e-80	1.667124e-79	8	78.778032
MUC1	-10.961591	-0.143128	-10.961591	1.344446	367.871679	2.012074e-79	1.781042e-78	8	77.749326
ARIH2OS	-10.922176	-0.081686	-10.922176	1.322430	366.152519	4.741798e-79	4.163555e-78	8	77.380536
ERMAP	-10.898001	-0.397724	-10.898001	1.211818	351.049294	8.840368e-76	7.307915e-75	8	74.136206
TRIM72	-10.698973	-0.353230	-10.698973	1.014147	331.780927	1.313276e-71	9.976739e-71	8	70.001011

3.0.1 All regulated

Code

all_sign = markers.index.tolist()
allSelected = allGenes_series['0'].isin(all_sign).astype('int').tolist()

4 topGO

4.1 All significant

Code

%%R -i allSelected -i allGenes

allGenes_v <- c(allSelected)
#print(allGenes_v)
names(allGenes_v) <- allGenes
allGenes_v <- unlist(allGenes_v)

geneNames <- c(allGenes)

ann_org_BP <- topGO::annFUN.org(whichOnto='BP', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

ann_org_MF <- topGO::annFUN.org(whichOnto='MF', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

ann_org_CC <- topGO::annFUN.org(whichOnto='CC', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

selection <- function(allScores){return (as.logical(allScores))}

Code

%%R
#print(lapply(ann_org_BP, count_genes))

GOdata <- new("topGOdata",
  ontology="BP",
  allGenes=allGenes_v,
  annot=annFUN.GO2genes,
  GO2genes=ann_org_BP,
  geneSel = selection,
  nodeSize=10)

Code

%%R -o results

results <- runTest(GOdata, algorithm="weight01",statistic="fisher")

Code

scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata

genesData = ro.r(
'''
geneData(results)
'''
)
genesData

array([10903,    38,    10,  1350], dtype=int32)

Code

#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight=results,
        orderBy="weight", topNodes=len(scores))

Code

results_table_py = ro.conversion.rpy2py(results_table)

Code

scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]

Code

scores_df = pd.DataFrame({'Scores': scores_py, 'GO.ID': score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID', right_on = 'GO.ID')
results_table_py

	GO.ID	Term	Annotated	Significant	Expected	weight	Scores
0	GO:0048665	neuron fate specification	13	2	0.05	0.0009	0.000901
1	GO:0061042	vascular wound healing	13	2	0.05	0.0009	0.000901
2	GO:1900746	regulation of vascular endothelial growt...	15	2	0.05	0.0012	0.001207
3	GO:2001028	positive regulation of endothelial cell ...	15	2	0.05	0.0012	0.001207
4	GO:0051154	negative regulation of striated muscle c...	24	2	0.08	0.0031	0.003110
...	...	...	...	...	...	...	...
5697	GO:2001256	regulation of store-operated calcium ent...	11	0	0.04	1.0000	1.000000
5698	GO:2001257	regulation of cation channel activity	104	0	0.36	1.0000	1.000000
5699	GO:2001258	negative regulation of cation channel ac...	23	0	0.08	1.0000	1.000000
5700	GO:2001259	positive regulation of cation channel ac...	43	0	0.15	1.0000	1.000000
5701	GO:2001267	regulation of cysteine-type endopeptidas...	13	0	0.05	1.0000	1.000000

5702 rows × 7 columns

Code

results_table_py = results_table_py[results_table_py['Scores'] < 0.05]
results_table_py = results_table_py[results_table_py['Annotated'] < 200]
results_table_py = results_table_py[results_table_py['Annotated'] > 15]

Code

results_table_py['-log10(pvalue)'] = - np.log10(results_table_py.Scores)
results_table_py['Significant/Annotated'] = results_table_py['Significant'] / results_table_py['Annotated']

Code

intTable(results_table_py, folder = folder, fileName = 'GO_BP_all.xlsx', save = True)

Code

%%R -i folder
Res <- GenTable(GOdata, weight=results,
        orderBy="weight", topNodes=length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "Statistics")
Res$ER <- Res$Significant / Res$Expected

image = bubbleplot(Res, Ont = 'BP', fillCol = 'forestgreen')
ggsave(file=paste0(folder, "TopGO_results_BP.pdf"), plot=image, width=12, height=4)

bubbleplot(Res, Ont = 'BP', fillCol = 'forestgreen')

Code

%%R -i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

ggsave(file=paste0(folder, "Genes_in_Term_results_BP.pdf"), plot=image, width=12, height=4)

plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

Code

%%R -i markers -i folder
saveGenesInTerm(Res, GOdata, nterms = 20, path = paste0(folder,'GO_BP_genesInTerm_all.xlsx'), SE = markers)

Code

%%R

GOdata <- new("topGOdata",
  ontology="MF",
  allGenes=allGenes_v,
  annot=annFUN.GO2genes,
  GO2genes=ann_org_MF,
  geneSel = selection,
  nodeSize=10)

Code

%%R -o results

results <- runTest(GOdata, algorithm="weight01",statistic="fisher")

Code

scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata

genesData = ro.r(
'''
geneData(results)
'''
)
genesData

array([11203,    37,    10,   189], dtype=int32)

Code

#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight=results,
        orderBy="weight", topNodes=len(scores))

Code

results_table_py = ro.conversion.rpy2py(results_table)

Code

scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]

Code

scores_df = pd.DataFrame({'Scores': scores_py, 'GO.ID': score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID', right_on = 'GO.ID')
results_table_py = results_table_py[results_table_py['Scores'] < 0.05]
results_table_py = results_table_py[results_table_py['Annotated'] < 200]
results_table_py = results_table_py[results_table_py['Annotated'] > 15]

intTable(results_table_py, folder = folder, fileName = 'GO_MF_all.xlsx', save = True)

Code

%%R
Res <- GenTable(GOdata, weight=results,
        orderBy="weight", topNodes=length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "Statistics")
Res$ER <- Res$Significant / Res$Expected

image = bubbleplot(Res, Ont = 'MF', fillCol = 'forestgreen')

ggsave(file=paste0(folder, "TopGO_results_MF.pdf"), plot=image, width=12, height=4)

bubbleplot(Res, Ont = 'MF', fillCol = 'forestgreen')

Code

%%R -i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

ggsave(file=paste0(folder, "Genes_in_Term_results_MF.pdf"), plot=image, width=12, height=4)

plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

Code

%%R -i markers -i folder
saveGenesInTerm(Res, GOdata, nterms = 20, path = paste0(folder,'GO_MF_genesInTerm_all.xlsx'), SE = markers)

Code

%%R

GOdata <- new("topGOdata",
  ontology="CC",
  allGenes=allGenes_v,
  annot=annFUN.GO2genes,
  GO2genes=ann_org_CC,
  geneSel = selection,
  nodeSize=10)

Code

%%R -o results

results <- runTest(GOdata, algorithm="weight01",statistic="fisher")

Code

scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata

genesData = ro.r(
'''
geneData(results)
'''
)
genesData

array([11323,    43,    10,   202], dtype=int32)

Code

#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight=results,
        orderBy="weight", topNodes=len(scores))

Code

results_table_py = ro.conversion.rpy2py(results_table)

Code

scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]

Code

scores_df = pd.DataFrame({'Scores': scores_py, 'GO.ID': score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID', right_on = 'GO.ID')

Code

results_table_py = results_table_py[results_table_py['Scores'] < 0.05]
results_table_py = results_table_py[results_table_py['Annotated'] < 200]
results_table_py = results_table_py[results_table_py['Annotated'] > 15]

intTable(results_table_py, folder = folder, fileName = 'GO_CC_all.xlsx', save = True)

Code

%%R
Res <- GenTable(GOdata, weight=results,
        orderBy="weight", topNodes=length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "Statistics")
Res$ER <- Res$Significant / Res$Expected
image = bubbleplot(Res, Ont = 'CC', fillCol = 'forestgreen')

ggsave(file=paste0(folder, "TopGO_results_CC.pdf"), plot=image, width=12, height=4)

bubbleplot(Res, Ont = 'CC', fillCol = 'forestgreen')

Code

%%R -i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=12, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

ggsave(file=paste0(folder, "Genes:_in_Term_results_CC.pdf"), plot=image, width=12, height=4)

plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=12, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

Code

%%R -i markers -i folder
saveGenesInTerm(Res, GOdata, nterms = 20, path = paste0(folder,'GO_CC_genesInTerm_all.xlsx'), SE = markers)

4.1.0.1 Reactome

Code

curated = msigdb[msigdb['collection'].isin(['reactome_pathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]

aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 200].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()

Code

rank = pd.DataFrame(markers['-log10(FDR)'])

rank_copy = rank.copy()
rank_copy['pval'] = markers.loc[rank.index].FDR

Code

rank_copy

	-log10(FDR)	pval
DLL1	245.058557	0.000000e+00
ALOX5	245.058557	0.000000e+00
VSTM2B	245.058557	0.000000e+00
FGF16	245.058557	0.000000e+00
PSTPIP2	245.058557	8.738628e-246
EGFR	237.451611	3.534996e-238
RAI1	205.520569	3.015997e-206
C19orf85	169.405217	3.933534e-170
MAFB	157.583279	2.610487e-158
CCDC163	156.809400	1.550958e-157
PDE10A	155.087903	8.167652e-156
LDHD	143.958037	1.101447e-144
NCALD	141.825575	1.494255e-142
GRASP	141.007910	9.819507e-142
GPR157	118.265708	5.423660e-119
CNTNAP1	113.584567	2.602751e-114
CEACAM19	109.497495	3.180572e-110
NKAIN4	107.394107	4.035458e-108
NCR3LG1	106.381515	4.154173e-107
SLC22A5	100.689871	2.042342e-101
MYO15B	100.675286	2.112095e-101
NTRK3	96.920719	1.200275e-97
FAM53A	96.312750	4.866875e-97
HEATR5A	95.938273	1.152728e-96
DUSP9	95.599071	2.517267e-96
ICAM1	93.139263	7.256669e-94
ZNF433	93.075061	8.412771e-94
PUDP	92.028457	9.365762e-93
AKR1E2	90.874027	1.336512e-91
SMOC2	89.981076	1.044536e-90
IKZF2	89.713065	1.936130e-90
CRYL1	89.616885	2.416103e-90
BATF3	89.533465	2.927755e-90
ADAM33	87.410399	3.886881e-88
CATSPERG	85.101887	7.908842e-86
STX1A	83.271243	5.354973e-84
LYNX1	83.163351	6.865128e-84
HES4	82.648496	2.246487e-83
ANKRD37	81.162692	6.875561e-82
RIPPLY3	80.580535	2.627028e-81
CUL9	79.183840	6.548771e-80
CDK3	78.778032	1.667124e-79
MUC1	77.749326	1.781042e-78
ARIH2OS	77.380536	4.163555e-78
ERMAP	74.136206	7.307915e-75
TRIM72	70.001011	9.976739e-71

Code

results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
len(results_table_py)

No significant term was found

Code

intTable(results_table_py, folder = folder, fileName = 'Reactome_all.xlsx', save = True)

Code

if len(results_table_py) > 0:
    results_table_py = getAnnGenes(results_table_py, GO2gene['reactome_pathways'], rank_copy)
    _, df = plotGenesInTerm(results = results_table_py, GO2gene = GO2gene['reactome_pathways'], DEGs = rank_copy, n_top_terms = 10, cmap = cmap_all)

Code

if len(results_table_py) > 0:
    intTable(df, folder = folder, fileName = 'genesInTerm_Reactome_all.xlsx', save = True)

4.1.0.2 KEGG

Code

curated = msigdb[msigdb['collection'].isin(['kegg_pathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]

aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 200].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()

Code

results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)

No significant term was found

Code

intTable(results_table_py, folder = folder, fileName = 'KEGG_all.xlsx', save = True)

Code

if len(results_table_py) > 0:
    results_table_py = getAnnGenes(results_table_py, GO2gene['kegg_pathways'], rank_copy)
    _, df = plotGenesInTerm(results_table_py, GO2gene['kegg_pathways'], rank_copy, n_top_terms = 10, n_top_genes = 15, cmap = cmap_all)

Code

if len(results_table_py) > 0:
    intTable(df, folder = folder, fileName = 'genesInTerm_KEGG_all.xlsx', save = True)