1 Load environment

Code

import warnings

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import pandas as pd
import numpy as np
import random
import itertools

from tqdm import tqdm

import decoupler as dc
import sys

sys.setrecursionlimit(20000)

sys.path.append("./../../../../utilities_folder")
from utilities import load_object, intTable, plotGenesInTerm, getAnnGenes, run_ora_catchErrors

Set R environment with rpy2:

Code

import rpy2.rinterface_lib.callbacks
import anndata2ri
import logging

from rpy2.robjects import pandas2ri
import rpy2.robjects as ro

# Silence scanpy's own log output.
sc.settings.verbosity = 0
# Only let rpy2 callback messages of level ERROR or above through.
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# Turn on the pandas and AnnData converters for objects exchanged between Python and R.
pandas2ri.activate()
anndata2ri.activate()

# Load the rpy2 IPython extension (provides the %%R cell magic used in later cells).
%load_ext rpy2.ipython

Set up of graphical parameters for Python plots:

Code

# Render matplotlib figures inside the notebook output.
%matplotlib inline
sc.set_figure_params(dpi = 300, fontsize = 20)

# Keep text as text (not outlined paths) when figures are written as SVG.
plt.rcParams['svg.fonttype'] = 'none'

# Sequential colormaps; only cmap_all is used in the visible cells (passed to plotGenesInTerm).
cmap_up = sns.light_palette("red", as_cmap=True)
cmap_down = sns.light_palette("blue", as_cmap=True)
cmap_all = sns.light_palette("seagreen", as_cmap=True)

Set up of graphical parameters for R plots:

Code

# Fallback values that new_setup_graphics applies to %%R figures when the
# corresponding magic argument is left unset.
default_units = 'in' 
default_res = 300
default_width = 10
default_height = 9

import rpy2
# Keep a handle on the stock implementation so the patched version can delegate to it.
old_setup_graphics = rpy2.ipython.rmagic.RMagics.setup_graphics

def new_setup_graphics(self, args):
    """Replacement for ``RMagics.setup_graphics`` that fills in figure defaults.

    If ``args.units`` is set to something other than ``default_units``, the
    call is forwarded to the original implementation without touching
    ``args``. Otherwise ``args.units`` is set to ``default_units`` and any of
    ``res``, ``width`` and ``height`` that is ``None`` receives the matching
    module-level default before delegating.

    Returns whatever the original ``setup_graphics`` returns.
    """
    requested_units = args.units
    if requested_units is not None and requested_units != default_units:
        # Caller chose different units: the inch-based defaults would not apply.
        return old_setup_graphics(self, args)

    args.units = default_units
    fallbacks = (
        ('res', default_res),
        ('width', default_width),
        ('height', default_height),
    )
    for option, fallback in fallbacks:
        if getattr(args, option) is None:
            setattr(args, option, fallback)
    return old_setup_graphics(self, args)


# Monkey-patch the R magic so every %%R cell goes through new_setup_graphics.
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics

Here is the cell where the parameters are injected by the Quarto renderer:

Code

# Injected Parameters
N = 3

Code

# Injected Parameters
N = 9

Import R libraries:

Code

%%R
# Project helper script; presumably defines bubbleplot, plotGenesInTerm_v1 and
# saveGenesInTerm used in later cells -- confirm in GO_helper.r.
source('./../../../../utilities_folder/GO_helper.r')
loc <- './../../../../R_loc' # pointing to the renv environment

# Point R's library search path at that location before loading packages.
.libPaths(loc)

library('topGO')
library('org.Hs.eg.db')
library(dplyr)
library(ggplot2)

Set output folders:

Code

# Base output directory and the per-cluster table directory (N is the injected cluster id).
output_folder = './'
folder = f'./tables/cluster_{N}/'

2 Load data

Here we load the dataframe:

Code

# Table of genes assigned to cluster N; the first spreadsheet column becomes the index.
markers = pd.read_excel(f'{folder}genes_in_cluster_{N}.xlsx', index_col=0)
markers

	logFC.celltypes_leveledhEGCLC	logFC.celltypes_leveledhPGCLC	logFC.celltypes_levelediMeLC	logCPM	LR	PValue	FDR	clusters
AHNAK	12.038674	16.815994	12.683582	6.820741	2711.850761	0.000000e+00	0.000000e+00	9
CYP1B1	11.049560	15.539232	13.598500	5.564479	1513.799008	0.000000e+00	0.000000e+00	9
SOX15	11.772982	15.560193	11.991457	5.543767	1455.402578	2.799468e-315	2.847628e-313	9
ITGA9	11.380525	14.989802	11.192375	4.937401	1193.524131	1.863147e-258	1.164758e-256	9
CPT1A	10.664611	14.748879	13.905565	4.884844	1144.688380	7.340728e-248	4.195750e-246	9
...	...	...	...	...	...	...	...	...
ZNF442	10.679139	11.158147	10.895534	1.994474	213.368877	5.445828e-46	2.713419e-45	9
TCTE3	10.629624	10.788777	11.380745	2.005768	213.152961	6.063615e-46	3.017478e-45	9
MAP1A	10.747680	11.084400	10.864300	2.025312	212.943131	6.731066e-46	3.346851e-45	9
ZNF319	10.838910	10.798703	10.952749	1.998963	209.432434	3.862299e-45	1.902305e-44	9
ZNF79	10.835524	10.686426	10.871299	1.959691	206.477157	1.680813e-44	8.164628e-44	9

106 rows × 8 columns

Code

# Background gene table; the gene identifiers are read from the column labelled '0'.
allGenes_series = pd.read_csv('./tables/all_bkg_genes.csv')
allGenes = allGenes_series['0'].tolist()

Here we load the dictionary that associates to each GO term its genes:

Code

# Indexed later by collection name ('reactome_pathways', 'kegg_pathways').
# NOTE(review): load_object comes from the project utilities; the file is a pickle,
# so it should only ever be loaded from a trusted location.
GO2gene = load_object('./../../../../data/GO2gene_complete.pickle')

3 Markers of cluster

We filter the genes of the cluster under investigation by adjusted p-value (FDR < 0.01), and then convert the adjusted p-value to -log10(FDR):

Code


# Keep only genes with FDR < 0.01. The explicit copy makes the column
# assignment below act on an independent frame rather than on a slice.
markers = markers[markers.FDR < 0.01].copy()
markers['-log10(FDR)'] = -np.log10(markers.FDR)

# FDR == 0 yields +inf. Cap those entries at the largest finite score, and
# restrict the replacement to this column so no other column is rewritten.
finite_scores = markers.loc[np.isfinite(markers['-log10(FDR)']), '-log10(FDR)']
markers['-log10(FDR)'] = markers['-log10(FDR)'].replace(np.inf, finite_scores.max())
markers

	logFC.celltypes_leveledhEGCLC	logFC.celltypes_leveledhPGCLC	logFC.celltypes_levelediMeLC	logCPM	LR	PValue	FDR	clusters	-log10(FDR)
AHNAK	12.038674	16.815994	12.683582	6.820741	2711.850761	0.000000e+00	0.000000e+00	9	312.545517
CYP1B1	11.049560	15.539232	13.598500	5.564479	1513.799008	0.000000e+00	0.000000e+00	9	312.545517
SOX15	11.772982	15.560193	11.991457	5.543767	1455.402578	2.799468e-315	2.847628e-313	9	312.545517
ITGA9	11.380525	14.989802	11.192375	4.937401	1193.524131	1.863147e-258	1.164758e-256	9	255.933764
CPT1A	10.664611	14.748879	13.905565	4.884844	1144.688380	7.340728e-248	4.195750e-246	9	245.377190
...	...	...	...	...	...	...	...	...	...
ZNF442	10.679139	11.158147	10.895534	1.994474	213.368877	5.445828e-46	2.713419e-45	9	44.566483
TCTE3	10.629624	10.788777	11.380745	2.005768	213.152961	6.063615e-46	3.017478e-45	9	44.520356
MAP1A	10.747680	11.084400	10.864300	2.025312	212.943131	6.731066e-46	3.346851e-45	9	44.475364
ZNF319	10.838910	10.798703	10.952749	1.998963	209.432434	3.862299e-45	1.902305e-44	9	43.720720
ZNF79	10.835524	10.686426	10.871299	1.959691	206.477157	1.680813e-44	8.164628e-44	9	43.088064

106 rows × 9 columns

3.0.1 All regulated

Code

# Genes that survived the FDR filter (the index of `markers`).
all_sign = markers.index.tolist()
# One 0/1 flag per background gene, in the same order as allGenes: 1 if the gene is in all_sign.
allSelected = allGenes_series['0'].isin(all_sign).astype('int').tolist()

4 topGO

4.1 All significant

Code

%%R -i allSelected -i allGenes

# Build a named 0/1 vector: names are the background genes, values flag the selected ones.
allGenes_v <- c(allSelected)
#print(allGenes_v)
names(allGenes_v) <- allGenes
allGenes_v <- unlist(allGenes_v)

# NOTE(review): geneNames is not used in any of the visible cells.
geneNames <- c(allGenes)

# GO annotations from org.Hs.eg keyed by gene symbol, one mapping per ontology,
# restricted to the background genes via feasibleGenes.
ann_org_BP <- topGO::annFUN.org(whichOnto='BP', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

ann_org_MF <- topGO::annFUN.org(whichOnto='MF', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

ann_org_CC <- topGO::annFUN.org(whichOnto='CC', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

# Passed as geneSel to topGOdata: a gene counts as selected when its flag is non-zero.
selection <- function(allScores){return (as.logical(allScores))}

Code

%%R
#print(lapply(ann_org_BP, count_genes))

# topGO object for Biological Process, built from the BP annotation and the 0/1 gene vector.
GOdata <- new("topGOdata",
  ontology="BP",
  allGenes=allGenes_v,
  annot=annFUN.GO2genes,
  GO2genes=ann_org_BP,
  geneSel = selection,
  nodeSize=10)

Code

%%R -o results

# Enrichment test with the weight01 algorithm and the Fisher statistic; the result
# is also exported to Python as `results` (-o).
results <- runTest(GOdata, algorithm="weight01",statistic="fisher")

Code

# Per-term scores of the test result, fetched through R's score().
scores = ro.r.score(results)
# GO identifiers attached to those scores, read from the R-side `results` object.
score_names = ro.r(
'''
names(results@score)
'''
)
# Handle to the topGOdata object currently bound to `GOdata` in the R session.
go_data = ro.r.GOdata

# Summary counts reported by geneData(); presumably annotated genes, significant
# genes, node size and significant terms -- confirm against the topGO documentation.
genesData = ro.r(
'''
geneData(results)
'''
)
genesData

array([10903,    87,    10,  2071], dtype=int32)

Code

#num_summarize = min(100, len(score_names))
# Summary table ordered by the weight01 result; topNodes=len(scores) requests every scored term.
results_table = ro.r.GenTable(go_data, weight=results,
        orderBy="weight", topNodes=len(scores))

Code

# Convert the R table into its Python counterpart using the active rpy2 converters.
results_table_py = ro.conversion.rpy2py(results_table)

Code

# Bring the score vector over to Python and materialise the GO identifiers as a plain list.
scores_py = ro.conversion.rpy2py(scores)
score_names = list(score_names)

Code

# Attach the unrounded scores to the summary table, matching rows on the GO identifier.
scores_df = pd.DataFrame({'Scores': scores_py, 'GO.ID': score_names})
results_table_py = results_table_py.merge(scores_df, on='GO.ID')
results_table_py

	GO.ID	Term	Annotated	Significant	Expected	weight	Scores
0	GO:0006357	regulation of transcription by RNA polym...	1820	28	14.52	0.00035	0.000347
1	GO:0007614	short-term memory	11	2	0.09	0.00330	0.003304
2	GO:1901862	negative regulation of muscle tissue dev...	11	2	0.09	0.00330	0.003304
3	GO:1903799	negative regulation of miRNA maturation	13	2	0.10	0.00464	0.004637
4	GO:0016202	regulation of striated muscle tissue dev...	14	2	0.11	0.00538	0.005382
...	...	...	...	...	...	...	...
5697	GO:2001241	positive regulation of extrinsic apoptot...	10	0	0.08	1.00000	1.000000
5698	GO:2001244	positive regulation of intrinsic apoptot...	42	0	0.34	1.00000	1.000000
5699	GO:2001251	negative regulation of chromosome organi...	88	0	0.70	1.00000	1.000000
5700	GO:2001256	regulation of store-operated calcium ent...	11	0	0.09	1.00000	1.000000
5701	GO:2001267	regulation of cysteine-type endopeptidas...	13	0	0.10	1.00000	1.000000

5702 rows × 7 columns

Code

# Keep terms with a weight01/Fisher score below 0.05 that annotate more than 15
# and fewer than 200 genes. The explicit copy makes the result an independent
# frame, so the columns added in the next cell are not written onto a slice.
is_significant = results_table_py['Scores'] < 0.05
is_mid_sized = (results_table_py['Annotated'] > 15) & (results_table_py['Annotated'] < 200)
results_table_py = results_table_py[is_significant & is_mid_sized].copy()

Code

# Derived columns: score on a -log10 scale, and the fraction of each term's
# annotated genes that are in the selected set.
results_table_py['-log10(pvalue)'] = - np.log10(results_table_py.Scores)
results_table_py['Significant/Annotated'] = results_table_py['Significant'] / results_table_py['Annotated']

Code

# Project helper from utilities; presumably displays the table and, with save=True,
# writes it to folder/fileName -- confirm in utilities.
intTable(results_table_py, folder = folder, fileName = 'GO_BP_all.xlsx', save = True)

Code

%%R -i folder
# Full result table for the BP test, ordered by the weight01 result.
Res <- GenTable(GOdata, weight=results,
        orderBy="weight", topNodes=length(score(results)))
#print(Res[0:10,])
# Rename the columns (the last one becomes "Statistics") and add the enrichment
# ratio ER = Significant / Expected.
colnames(Res) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "Statistics")
Res$ER <- Res$Significant / Res$Expected

# bubbleplot is a project helper (not defined in this notebook); its result is saved as PDF.
image = bubbleplot(Res, Ont = 'BP', fillCol = 'forestgreen')
ggsave(file=paste0(folder, "TopGO_results_BP.pdf"), plot=image, width=12, height=4)

# Called again without assignment, presumably so the figure is shown in the cell output.
bubbleplot(Res, Ont = 'BP', fillCol = 'forestgreen')

Code

%%R -i markers
# plotGenesInTerm_v1 is a project helper (not defined in this notebook); `markers`
# is imported from Python and passed as SE. The result is saved as PDF.
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

ggsave(file=paste0(folder, "Genes_in_Term_results_BP.pdf"), plot=image, width=12, height=4)

# Called again without assignment, presumably so the figure is shown in the cell output.
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

Code

%%R -i markers -i folder
# saveGenesInTerm is a project helper (not defined in this notebook); it is given
# the BP results, 20 terms and the Excel path to write.
saveGenesInTerm(Res, GOdata, nterms = 20, path = paste0(folder,'GO_BP_genesInTerm_all.xlsx'), SE = markers)

Code

%%R

# Rebind GOdata to a topGO object for Molecular Function (replaces the BP object in the R session).
GOdata <- new("topGOdata",
  ontology="MF",
  allGenes=allGenes_v,
  annot=annFUN.GO2genes,
  GO2genes=ann_org_MF,
  geneSel = selection,
  nodeSize=10)

Code

%%R -o results

# weight01 algorithm with the Fisher statistic on the MF object; exported to Python as `results`.
results <- runTest(GOdata, algorithm="weight01",statistic="fisher")

Code

# Per-term scores of the MF test result, fetched through R's score().
scores = ro.r.score(results)
# GO identifiers attached to those scores, read from the R-side `results` object.
score_names = ro.r(
'''
names(results@score)
'''
)
# Handle to the topGOdata object currently bound to `GOdata` in the R session.
go_data = ro.r.GOdata

# Summary counts reported by geneData() for this test.
genesData = ro.r(
'''
geneData(results)
'''
)
genesData

array([11203,    94,    10,   292], dtype=int32)

Code

#num_summarize = min(100, len(score_names))
# Summary table ordered by the weight01 result; topNodes=len(scores) requests every scored term.
results_table = ro.r.GenTable(go_data, weight=results,
        orderBy="weight", topNodes=len(scores))

Code

results_table_py = ro.conversion.rpy2py(results_table)

Code

# Bring the score vector over to Python and materialise the GO identifiers as a plain list.
scores_py = ro.conversion.rpy2py(scores)
score_names = list(score_names)

Code

# Attach the unrounded scores to the MF summary table, matching rows on the GO identifier.
scores_df = pd.DataFrame({'Scores': scores_py, 'GO.ID': score_names})
merged = results_table_py.merge(scores_df, on='GO.ID')

# Keep terms scoring below 0.05 that annotate more than 15 and fewer than 200 genes.
keep = (merged['Scores'] < 0.05) & (merged['Annotated'] < 200) & (merged['Annotated'] > 15)
results_table_py = merged[keep]

intTable(results_table_py, folder=folder, fileName='GO_MF_all.xlsx', save=True)

Code

%%R
# Full result table for the MF test, ordered by the weight01 result.
Res <- GenTable(GOdata, weight=results,
        orderBy="weight", topNodes=length(score(results)))
#print(Res[0:10,])
# Rename the columns and add the enrichment ratio ER = Significant / Expected.
colnames(Res) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "Statistics")
Res$ER <- Res$Significant / Res$Expected

image = bubbleplot(Res, Ont = 'MF', fillCol = 'forestgreen')

# `folder` is not imported in this cell; it relies on the value passed to R by an earlier cell.
ggsave(file=paste0(folder, "TopGO_results_MF.pdf"), plot=image, width=12, height=4)

# Called again without assignment, presumably so the figure is shown in the cell output.
bubbleplot(Res, Ont = 'MF', fillCol = 'forestgreen')

Code

%%R -i markers
# Genes-in-term figure for the MF results (project helper), saved as PDF.
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

ggsave(file=paste0(folder, "Genes_in_Term_results_MF.pdf"), plot=image, width=12, height=4)

# Called again without assignment, presumably so the figure is shown in the cell output.
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

Code

%%R -i markers -i folder
# Project helper; given the MF results, 20 terms and the Excel path to write.
saveGenesInTerm(Res, GOdata, nterms = 20, path = paste0(folder,'GO_MF_genesInTerm_all.xlsx'), SE = markers)

Code

%%R

# Rebind GOdata to a topGO object for Cellular Component (replaces the MF object in the R session).
GOdata <- new("topGOdata",
  ontology="CC",
  allGenes=allGenes_v,
  annot=annFUN.GO2genes,
  GO2genes=ann_org_CC,
  geneSel = selection,
  nodeSize=10)

Code

%%R -o results

# weight01 algorithm with the Fisher statistic on the CC object; exported to Python as `results`.
results <- runTest(GOdata, algorithm="weight01",statistic="fisher")

Code

# Per-term scores of the CC test result, fetched through R's score().
scores = ro.r.score(results)
# GO identifiers attached to those scores, read from the R-side `results` object.
score_names = ro.r(
'''
names(results@score)
'''
)
# Handle to the topGOdata object currently bound to `GOdata` in the R session.
go_data = ro.r.GOdata

# Summary counts reported by geneData() for this test.
genesData = ro.r(
'''
geneData(results)
'''
)
genesData

array([11323,    96,    10,   298], dtype=int32)

Code

#num_summarize = min(100, len(score_names))
# Summary table ordered by the weight01 result; topNodes=len(scores) requests every scored term.
results_table = ro.r.GenTable(go_data, weight=results,
        orderBy="weight", topNodes=len(scores))

Code

results_table_py = ro.conversion.rpy2py(results_table)

Code

# Bring the score vector over to Python and materialise the GO identifiers as a plain list.
scores_py = ro.conversion.rpy2py(scores)
score_names = list(score_names)

Code

# Attach the unrounded scores to the CC summary table, matching rows on the GO identifier.
scores_df = pd.DataFrame({'Scores': scores_py, 'GO.ID': score_names})
results_table_py = results_table_py.merge(scores_df, on='GO.ID')

Code

# Keep terms scoring below 0.05 that annotate more than 15 and fewer than 200 genes.
keep = (
    (results_table_py['Scores'] < 0.05)
    & (results_table_py['Annotated'] < 200)
    & (results_table_py['Annotated'] > 15)
)
results_table_py = results_table_py[keep]

intTable(results_table_py, folder=folder, fileName='GO_CC_all.xlsx', save=True)

Code

%%R
# Full result table for the CC test, ordered by the weight01 result.
Res <- GenTable(GOdata, weight=results,
        orderBy="weight", topNodes=length(score(results)))
#print(Res[0:10,])
# Rename the columns and add the enrichment ratio ER = Significant / Expected.
colnames(Res) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "Statistics")
Res$ER <- Res$Significant / Res$Expected
image = bubbleplot(Res, Ont = 'CC', fillCol = 'forestgreen')

# `folder` is not imported in this cell; it relies on the value passed to R by an earlier cell.
ggsave(file=paste0(folder, "TopGO_results_CC.pdf"), plot=image, width=12, height=4)

# Called again without assignment, presumably so the figure is shown in the cell output.
bubbleplot(Res, Ont = 'CC', fillCol = 'forestgreen')

Code

%%R -i markers
# Genes-in-term figure for the CC results (project helper), saved as PDF.
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=12, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

# File name follows the same "Genes_in_Term_results_<ontology>.pdf" pattern as the
# BP and MF outputs; it must not contain ':' (not a valid file-name character on
# every filesystem).
ggsave(file=paste0(folder, "Genes_in_Term_results_CC.pdf"), plot=image, width=12, height=4)

# Called again without assignment, presumably so the figure is shown in the cell output.
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=12, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

Code

%%R -i markers -i folder
# Project helper; given the CC results, 20 terms and the Excel path to write.
saveGenesInTerm(Res, GOdata, nterms = 20, path = paste0(folder,'GO_CC_genesInTerm_all.xlsx'), SE = markers)

4.1.0.1 Reactome

Code

# NOTE(review): `msigdb` is not defined in any visible cell -- confirm where it is loaded.
# Restrict it to the Reactome collection, keeping one row per (gene set, gene) pair.
in_collection = msigdb['collection'].isin(['reactome_pathways'])
curated = msigdb[in_collection]
repeated_pair = curated.duplicated(['geneset', 'genesymbol'])
curated = curated[~repeated_pair]

# Number of genes per gene set.
aggregated = (
    curated[["geneset", "genesymbol"]]
    .groupby("geneset")
    .count()
    .rename(columns={"genesymbol": "gene_count"})
)
# Drop gene sets with more than 200 or fewer than 15 genes.
out_of_range = aggregated[(aggregated.gene_count > 200) | (aggregated.gene_count < 15)].index.tolist()
curated = curated[~curated.geneset.isin(out_of_range)].copy()

Code

# One-column frame of -log10(FDR) scores indexed by gene (transposed later for the ORA call).
rank = markers['-log10(FDR)'].to_frame()

# Same scores plus the FDR itself under the name 'pval'.
rank_copy = rank.assign(pval=markers.loc[rank.index, 'FDR'])

Code

rank_copy

	-log10(FDR)	pval
AHNAK	312.545517	0.000000e+00
CYP1B1	312.545517	0.000000e+00
SOX15	312.545517	2.847628e-313
ITGA9	255.933764	1.164758e-256
CPT1A	245.377190	4.195750e-246
...	...	...
ZNF442	44.566483	2.713419e-45
TCTE3	44.520356	3.017478e-45
MAP1A	44.475364	3.346851e-45
ZNF319	43.720720	1.902305e-44
ZNF79	43.088064	8.164628e-44

106 rows × 2 columns

Code

# ORA of the cluster genes against the filtered Reactome gene sets, via the project
# wrapper run_ora_catchErrors. n_up=len(rank) with n_bottom=0 presumably selects
# every gene in `rank` as the query set -- confirm in utilities.
results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
len(results_table_py)

No significant term was found

Code

# NOTE(review): unlike the cells below, this call is not guarded by
# len(results_table_py) > 0 -- confirm intTable copes with an empty result.
intTable(results_table_py, folder = folder, fileName = 'Reactome_all.xlsx', save = True)

Code

# Only annotate and plot when the ORA returned at least one term.
if len(results_table_py) > 0:
    results_table_py = getAnnGenes(results_table_py, GO2gene['reactome_pathways'], rank_copy)
    # `df` is only defined inside this branch; the next cell uses the same guard before reading it.
    _, df = plotGenesInTerm(results = results_table_py, GO2gene = GO2gene['reactome_pathways'], DEGs = rank_copy, n_top_terms = 10, cmap = cmap_all)

Code

# Same guard as the plotting cell, because `df` only exists when terms were found.
if len(results_table_py) > 0:
    intTable(df, folder = folder, fileName = 'genesInTerm_Reactome_all.xlsx', save = True)

4.1.0.2 KEGG

Code

# NOTE(review): `msigdb` is not defined in any visible cell -- confirm where it is loaded.
# Restrict it to the KEGG collection, keeping one row per (gene set, gene) pair.
in_collection = msigdb['collection'].isin(['kegg_pathways'])
curated = msigdb[in_collection]
repeated_pair = curated.duplicated(['geneset', 'genesymbol'])
curated = curated[~repeated_pair]

# Number of genes per gene set.
aggregated = (
    curated[["geneset", "genesymbol"]]
    .groupby("geneset")
    .count()
    .rename(columns={"genesymbol": "gene_count"})
)
# Drop gene sets with more than 200 or fewer than 15 genes.
out_of_range = aggregated[(aggregated.gene_count > 200) | (aggregated.gene_count < 15)].index.tolist()
curated = curated[~curated.geneset.isin(out_of_range)].copy()

Code

# ORA of the cluster genes against the filtered KEGG gene sets; reuses `rank` built
# in the Reactome section.
results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)

No significant term was found

Code

# NOTE(review): unlike the cells below, this call is not guarded by
# len(results_table_py) > 0 -- confirm intTable copes with an empty result.
intTable(results_table_py, folder = folder, fileName = 'KEGG_all.xlsx', save = True)

Code

# Only annotate and plot when the ORA returned at least one term.
if len(results_table_py) > 0:
    results_table_py = getAnnGenes(results_table_py, GO2gene['kegg_pathways'], rank_copy)
    # NOTE(review): n_top_genes = 15 is passed here but not in the Reactome call -- confirm this is intended.
    _, df = plotGenesInTerm(results_table_py, GO2gene['kegg_pathways'], rank_copy, n_top_terms = 10, n_top_genes = 15, cmap = cmap_all)

Code

# Same guard as the plotting cell, because `df` only exists when terms were found.
if len(results_table_py) > 0:
    intTable(df, folder = folder, fileName = 'genesInTerm_KEGG_all.xlsx', save = True)