1 Load environment

Code

import warnings

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import pandas as pd
import numpy as np
import random
import itertools

from tqdm import tqdm

import decoupler as dc
import sys

sys.setrecursionlimit(20000)

sys.path.append("./../../../../utilities_folder")
from utilities import load_object, intTable, plotGenesInTerm, getAnnGenes, run_ora_catchErrors

Set R environment with rpy2:

Code

import rpy2.rinterface_lib.callbacks
import anndata2ri
import logging

from rpy2.robjects import pandas2ri
import rpy2.robjects as ro

# Set scanpy's verbosity to its lowest level.
sc.settings.verbosity = 0
# Restrict the rpy2 callbacks logger to ERROR and above.
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# Activate the pandas2ri and anndata2ri converters for objects passed between Python and R.
pandas2ri.activate()
anndata2ri.activate()

# Load the rpy2 IPython extension; the %%R cells below rely on it.
%load_ext rpy2.ipython

Set up of graphical parameters for Python plots:

Code

# Show matplotlib figures inline in the notebook output.
%matplotlib inline
sc.set_figure_params(dpi = 300, fontsize = 20)

# 'none' keeps text in exported SVGs as text instead of converting it to paths.
plt.rcParams['svg.fonttype'] = 'none'

# Sequential light-to-colour palettes; of the three, only cmap_all is used in the cells visible here.
cmap_up = sns.light_palette("red", as_cmap=True)
cmap_down = sns.light_palette("blue", as_cmap=True)
cmap_all = sns.light_palette("seagreen", as_cmap=True)

Set up of graphical parameters for R plots:

Code

# Fallback figure settings for %%R cells that do not pass their own values.
default_units = 'in' 
default_res = 300
default_width = 10
default_height = 9

import rpy2
# Keep a reference to the stock implementation so new_setup_graphics can delegate to it.
old_setup_graphics = rpy2.ipython.rmagic.RMagics.setup_graphics

def new_setup_graphics(self, args):
    """Fill in notebook-wide figure defaults, then delegate to the original setup.

    When ``args.units`` is set to something other than ``default_units`` the
    arguments are forwarded untouched. Otherwise ``units`` is forced to
    ``default_units`` and any of ``res``, ``width`` and ``height`` that are
    ``None`` receive their module-level default.

    Returns whatever ``old_setup_graphics(self, args)`` returns.
    """
    requested_units = getattr(args, 'units')
    if requested_units is not None and requested_units != default_units:
        # Caller chose different units: the defaults would not be meaningful.
        return old_setup_graphics(self, args)

    args.units = default_units
    fallbacks = (
        ('res', default_res),
        ('width', default_width),
        ('height', default_height),
    )
    for option, fallback in fallbacks:
        if getattr(args, option) is None:
            setattr(args, option, fallback)
    return old_setup_graphics(self, args)


# Replace the RMagics method with the defaults-aware wrapper (monkey-patch on the class).
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics

Here is the cell where the parameters are injected by the Quarto renderer:

Code

# Injected Parameters
N = 3

Code

# Injected Parameters
N = 7

Import R libraries:

Code

%%R
# Project helper script; presumably the source of bubbleplot, plotGenesInTerm_v1 and saveGenesInTerm used below (TODO confirm).
source('./../../../../utilities_folder/GO_helper.r')
loc <- './../../../../R_loc' # pointing to the renv environment

# Put that library on R's package search path before attaching packages.
.libPaths(loc)

library('topGO')
library('org.Hs.eg.db')
library(dplyr)
library(ggplot2)

Set output folders:

Code

# Per-cluster directory used for every table and figure written by this notebook.
output_folder = './'
folder = f'./tables/cluster_{N}/'

2 Load data

Here we load the dataframe:

Code

# Marker table of cluster N; the first spreadsheet column (gene symbols) becomes the index.
markers_path = f'{folder}genes_in_cluster_{N}.xlsx'
markers = pd.read_excel(markers_path, index_col=0)
markers

	logFC.celltypes_leveledhEGCLC	logFC.celltypes_leveledhPGCLC	logFC.celltypes_levelediMeLC	logCPM	LR	PValue	FDR	clusters
TCL1A	11.389884	17.660164	0	7.639515	3914.736661	0.000000e+00	0.000000e+00	7
ARID5B	11.235232	15.714693	0	5.605302	2032.295508	0.000000e+00	0.000000e+00	7
BARX1	11.049682	16.213512	0	6.131787	2558.952569	0.000000e+00	0.000000e+00	7
LAMA4	10.852847	17.611657	0	7.580034	4066.761890	0.000000e+00	0.000000e+00	7
MSC	10.866905	14.894956	0	4.732514	1452.945397	9.556065e-315	9.638777e-313	7
...	...	...	...	...	...	...	...	...
SIRT4	10.583195	10.861387	0	1.429105	285.778724	1.189612e-61	7.660363e-61	7
KSR2	10.640697	10.669002	0	1.473940	284.419490	2.341700e-61	1.497465e-60	7
NFIC	10.429091	10.910541	0	1.395141	282.712857	5.480549e-61	3.482426e-60	7
ZNF658	10.604303	10.636442	0	1.427180	280.633694	1.544248e-60	9.735088e-60	7
HCN4	10.637863	10.413336	0	1.376229	275.691602	1.811540e-59	1.124297e-58	7

87 rows × 8 columns

Code

# Background gene table; the symbols are read from the column named '0'.
allGenes_series = pd.read_csv('./tables/all_bkg_genes.csv')
allGenes = allGenes_series['0'].tolist()

Here we load the dictionary that associates to each GO term its genes:

Code

# NOTE(review): load_object is a project helper, presumably unpickling this file — only load pickles from a trusted source.
GO2gene = load_object('./../../../../data/GO2gene_complete.pickle')

3 Markers of cluster

We keep the genes of the cluster under investigation whose adjusted p-value (FDR) is below 0.01, and then convert the adjusted p-value to -log10(adjusted p-value):

Code


# Keep only genes significant at FDR < 0.01. The explicit copy makes the
# filtered frame independent, so adding a column below never writes into a
# slice of the table that was loaded from disk.
markers = markers[markers.FDR < 0.01].copy()

# FDR values that underflow to 0 give -log10(0) = +inf. Cap those at the
# largest finite score. The replacement is restricted to the derived column:
# a frame-wide replace would also overwrite an inf in any other column
# (e.g. a logFC) with a -log10(FDR) value.
neg_log_fdr = -np.log10(markers.FDR)
finite_max = neg_log_fdr[np.isfinite(neg_log_fdr)].max()
markers['-log10(FDR)'] = neg_log_fdr.replace(np.inf, finite_max)
markers

	logFC.celltypes_leveledhEGCLC	logFC.celltypes_leveledhPGCLC	logFC.celltypes_levelediMeLC	logCPM	LR	PValue	FDR	clusters	-log10(FDR)
TCL1A	11.389884	17.660164	0	7.639515	3914.736661	0.000000e+00	0.000000e+00	7	312.015978
ARID5B	11.235232	15.714693	0	5.605302	2032.295508	0.000000e+00	0.000000e+00	7	312.015978
BARX1	11.049682	16.213512	0	6.131787	2558.952569	0.000000e+00	0.000000e+00	7	312.015978
LAMA4	10.852847	17.611657	0	7.580034	4066.761890	0.000000e+00	0.000000e+00	7	312.015978
MSC	10.866905	14.894956	0	4.732514	1452.945397	9.556065e-315	9.638777e-313	7	312.015978
...	...	...	...	...	...	...	...	...	...
SIRT4	10.583195	10.861387	0	1.429105	285.778724	1.189612e-61	7.660363e-61	7	60.115751
KSR2	10.640697	10.669002	0	1.473940	284.419490	2.341700e-61	1.497465e-60	7	59.824643
NFIC	10.429091	10.910541	0	1.395141	282.712857	5.480549e-61	3.482426e-60	7	59.458118
ZNF658	10.604303	10.636442	0	1.427180	280.633694	1.544248e-60	9.735088e-60	7	59.011660
HCN4	10.637863	10.413336	0	1.376229	275.691602	1.811540e-59	1.124297e-58	7	57.949119

87 rows × 9 columns

3.0.1 All regulated

Code

# 0/1 flag per background gene: 1 when the gene is one of the cluster's significant markers.
all_sign = markers.index.tolist()
is_selected = allGenes_series['0'].isin(all_sign)
allSelected = is_selected.astype('int').tolist()

4 topGO

4.1 All significant

Code

%%R -i allSelected -i allGenes

# Named vector over the background genes: values are the 0/1 selection flags, names are the gene symbols.
allGenes_v <- c(allSelected)
#print(allGenes_v)
names(allGenes_v) <- allGenes
allGenes_v <- unlist(allGenes_v)

# NOTE(review): geneNames is not used by any cell visible in this notebook.
geneNames <- c(allGenes)

# GO annotation for each ontology from org.Hs.eg, keyed by gene symbol and limited to the background genes.
ann_org_BP <- topGO::annFUN.org(whichOnto='BP', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

ann_org_MF <- topGO::annFUN.org(whichOnto='MF', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

ann_org_CC <- topGO::annFUN.org(whichOnto='CC', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

# Gene selection function handed to topGOdata: a flag of 1 becomes TRUE, 0 becomes FALSE.
selection <- function(allScores){return (as.logical(allScores))}

Code

%%R
#print(lapply(ann_org_BP, count_genes))

# topGO object for Biological Process, built from the precomputed GO-to-genes map.
# nodeSize=10 prunes GO terms with fewer than 10 annotated genes.
GOdata <- new("topGOdata",
  ontology="BP",
  allGenes=allGenes_v,
  annot=annFUN.GO2genes,
  GO2genes=ann_org_BP,
  geneSel = selection,
  nodeSize=10)

Code

%%R -o results

# Fisher test with topGO's weight01 algorithm; -o also exports `results` to the Python session.
results <- runTest(GOdata, algorithm="weight01",statistic="fisher")

Code

# `results` here is the Python-side object exported by the previous %%R cell;
# the R snippets below read the `results` / `GOdata` objects living in R.
scores = ro.r.score(results)
score_names = ro.r("\nnames(results@score)\n")
go_data = ro.r.GOdata

# Summary vector stored on the test result, displayed as the cell output.
genesData = ro.r("\ngeneData(results)\n")
genesData

array([10903,    80,    10,  1816], dtype=int32)

Code

# Request one GenTable row per scored term instead of a top-N summary.
n_scored_terms = len(scores)
results_table = ro.r.GenTable(
    go_data,
    weight=results,
    orderBy="weight",
    topNodes=n_scored_terms,
)

Code

# Convert the R GenTable result into its Python counterpart through rpy2's active conversion rules.
results_table_py = ro.conversion.rpy2py(results_table)

Code

# Bring the p-values to Python and materialise the GO IDs as a plain list.
scores_py = ro.conversion.rpy2py(scores)
score_names = list(score_names)

Code

# Join the raw per-term scores onto the GenTable summary by GO identifier.
scores_df = pd.DataFrame({'Scores': scores_py, 'GO.ID': score_names})
results_table_py = pd.merge(results_table_py, scores_df, on='GO.ID')
results_table_py

	GO.ID	Term	Annotated	Significant	Expected	weight	Scores
0	GO:0060977	coronary vasculature morphogenesis	13	2	0.10	0.0039	0.003935
1	GO:0050901	leukocyte tethering or rolling	16	2	0.12	0.0060	0.005969
2	GO:1900745	positive regulation of p38MAPK cascade	20	2	0.15	0.0093	0.009273
3	GO:0003161	cardiac conduction system development	22	2	0.16	0.0112	0.011167
4	GO:0022011	myelination in peripheral nervous system	23	2	0.17	0.0122	0.012173
...	...	...	...	...	...	...	...
5697	GO:2001252	positive regulation of chromosome organi...	93	0	0.68	1.0000	1.000000
5698	GO:2001256	regulation of store-operated calcium ent...	11	0	0.08	1.0000	1.000000
5699	GO:2001258	negative regulation of cation channel ac...	23	0	0.17	1.0000	1.000000
5700	GO:2001259	positive regulation of cation channel ac...	43	0	0.32	1.0000	1.000000
5701	GO:2001267	regulation of cysteine-type endopeptidas...	13	0	0.10	1.0000	1.000000

5702 rows × 7 columns

Code

# Keep terms with score < 0.05 and strictly between 15 and 200 annotated genes.
is_significant = results_table_py['Scores'] < 0.05
n_annotated = results_table_py['Annotated']
has_usable_size = (n_annotated < 200) & (n_annotated > 15)
results_table_py = results_table_py[is_significant & has_usable_size]

Code

# Derived columns: score on the -log10 scale and the fraction of a term's annotated genes that are significant.
term_scores = results_table_py['Scores']
results_table_py['-log10(pvalue)'] = -np.log10(term_scores)
results_table_py['Significant/Annotated'] = results_table_py['Significant'].div(results_table_py['Annotated'])

Code

# intTable is a project helper; with save = True it presumably writes fileName under folder (TODO confirm).
intTable(results_table_py, folder = folder, fileName = 'GO_BP_all.xlsx', save = True)

Code

%%R -i folder
# GenTable with one row per scored term for the current GOdata/results pair.
Res <- GenTable(GOdata, weight=results,
        orderBy="weight", topNodes=length(score(results)))
#print(Res[0:10,])
# Rename the columns so the test column ("weight") is called "Statistics".
colnames(Res) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "Statistics")
# ER: observed over expected number of significant genes per term.
Res$ER <- Res$Significant / Res$Expected

# bubbleplot is defined outside this notebook (presumably in GO_helper.r); first call is saved to PDF.
image = bubbleplot(Res, Ont = 'BP', fillCol = 'forestgreen')
ggsave(file=paste0(folder, "TopGO_results_BP.pdf"), plot=image, width=12, height=4)

# Second call so the plot is also shown as the cell output.
bubbleplot(Res, Ont = 'BP', fillCol = 'forestgreen')

Code

%%R -i markers
# plotGenesInTerm_v1 is defined outside this notebook (presumably in GO_helper.r); the marker table is passed as SE.
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

ggsave(file=paste0(folder, "Genes_in_Term_results_BP.pdf"), plot=image, width=12, height=4)

# Called again so the figure is displayed in the cell output.
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

Code

%%R -i markers -i folder
# saveGenesInTerm is defined outside this notebook; it is given 20 terms and the BP workbook path.
saveGenesInTerm(Res, GOdata, nterms = 20, path = paste0(folder,'GO_BP_genesInTerm_all.xlsx'), SE = markers)

Code

%%R

# Rebuild GOdata for Molecular Function; this overwrites the BP object of the same name.
# nodeSize=10 prunes GO terms with fewer than 10 annotated genes.
GOdata <- new("topGOdata",
  ontology="MF",
  allGenes=allGenes_v,
  annot=annFUN.GO2genes,
  GO2genes=ann_org_MF,
  geneSel = selection,
  nodeSize=10)

Code

%%R -o results

# Fisher test with topGO's weight01 algorithm on the MF object; -o exports `results` to Python.
results <- runTest(GOdata, algorithm="weight01",statistic="fisher")

Code

# Scores come from the Python-side `results`; the GO IDs and summary are read from the objects held in R.
scores = ro.r.score(results)
score_names = ro.r("\nnames(results@score)\n")
go_data = ro.r.GOdata

# Summary vector of the MF test, displayed as the cell output.
genesData = ro.r("\ngeneData(results)\n")
genesData

array([11203,    83,    10,   292], dtype=int32)

Code

# One GenTable row for every scored MF term.
n_scored_terms = len(scores)
results_table = ro.r.GenTable(
    go_data,
    weight=results,
    orderBy="weight",
    topNodes=n_scored_terms,
)

Code

# Convert the R GenTable result for MF into its Python counterpart.
results_table_py = ro.conversion.rpy2py(results_table)

Code

# Bring the MF p-values to Python and materialise the GO IDs as a plain list.
scores_py = ro.conversion.rpy2py(scores)
score_names = list(score_names)

Code

# Attach the raw per-term scores to the GenTable summary by GO identifier.
scores_df = pd.DataFrame({'Scores': scores_py, 'GO.ID': score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID', right_on = 'GO.ID')

# Same selection as the BP table: score < 0.05 and 15 < annotated genes < 200.
# The copy keeps the column assignments below off a filtered slice.
results_table_py = results_table_py[results_table_py['Scores'] < 0.05]
results_table_py = results_table_py[results_table_py['Annotated'] < 200]
results_table_py = results_table_py[results_table_py['Annotated'] > 15].copy()

# Derived columns that the BP export already carries; added so the MF
# workbook has the same layout as GO_BP_all.xlsx.
results_table_py['-log10(pvalue)'] = - np.log10(results_table_py.Scores)
results_table_py['Significant/Annotated'] = results_table_py['Significant'] / results_table_py['Annotated']

intTable(results_table_py, folder = folder, fileName = 'GO_MF_all.xlsx', save = True)

Code

%%R
# GenTable with one row per scored MF term.
Res <- GenTable(GOdata, weight=results,
        orderBy="weight", topNodes=length(score(results)))
#print(Res[0:10,])
# Rename the columns so the test column ("weight") is called "Statistics".
colnames(Res) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "Statistics")
# ER: observed over expected number of significant genes per term.
Res$ER <- Res$Significant / Res$Expected

# First call is saved to PDF (`folder` was imported into R by an earlier cell).
image = bubbleplot(Res, Ont = 'MF', fillCol = 'forestgreen')

ggsave(file=paste0(folder, "TopGO_results_MF.pdf"), plot=image, width=12, height=4)

# Second call so the plot is also shown as the cell output.
bubbleplot(Res, Ont = 'MF', fillCol = 'forestgreen')

Code

%%R -i markers
# Genes-in-term figure for MF; plotGenesInTerm_v1 is defined outside this notebook.
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

ggsave(file=paste0(folder, "Genes_in_Term_results_MF.pdf"), plot=image, width=12, height=4)

# Called again so the figure is displayed in the cell output.
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

Code

%%R -i markers -i folder
# saveGenesInTerm is defined outside this notebook; it is given 20 terms and the MF workbook path.
saveGenesInTerm(Res, GOdata, nterms = 20, path = paste0(folder,'GO_MF_genesInTerm_all.xlsx'), SE = markers)

Code

%%R

# Rebuild GOdata for Cellular Component; this overwrites the MF object of the same name.
# nodeSize=10 prunes GO terms with fewer than 10 annotated genes.
GOdata <- new("topGOdata",
  ontology="CC",
  allGenes=allGenes_v,
  annot=annFUN.GO2genes,
  GO2genes=ann_org_CC,
  geneSel = selection,
  nodeSize=10)

Code

%%R -o results

# Fisher test with topGO's weight01 algorithm on the CC object; -o exports `results` to Python.
results <- runTest(GOdata, algorithm="weight01",statistic="fisher")

Code

# Scores come from the Python-side `results`; the GO IDs and summary are read from the objects held in R.
scores = ro.r.score(results)
score_names = ro.r("\nnames(results@score)\n")
go_data = ro.r.GOdata

# Summary vector of the CC test, displayed as the cell output.
genesData = ro.r("\ngeneData(results)\n")
genesData

array([11323,    81,    10,   237], dtype=int32)

Code

# One GenTable row for every scored CC term.
n_scored_terms = len(scores)
results_table = ro.r.GenTable(
    go_data,
    weight=results,
    orderBy="weight",
    topNodes=n_scored_terms,
)

Code

# Convert the R GenTable result for CC into its Python counterpart.
results_table_py = ro.conversion.rpy2py(results_table)

Code

# Bring the CC p-values to Python and materialise the GO IDs as a plain list.
scores_py = ro.conversion.rpy2py(scores)
score_names = list(score_names)

Code

# Join the raw per-term scores onto the CC GenTable summary by GO identifier.
scores_df = pd.DataFrame({'Scores': scores_py, 'GO.ID': score_names})
results_table_py = pd.merge(results_table_py, scores_df, on='GO.ID')

Code

# Same selection as the BP table: score < 0.05 and 15 < annotated genes < 200.
# The copy keeps the column assignments below off a filtered slice.
results_table_py = results_table_py[results_table_py['Scores'] < 0.05]
results_table_py = results_table_py[results_table_py['Annotated'] < 200]
results_table_py = results_table_py[results_table_py['Annotated'] > 15].copy()

# Derived columns that the BP export already carries; added so the CC
# workbook has the same layout as GO_BP_all.xlsx.
results_table_py['-log10(pvalue)'] = - np.log10(results_table_py.Scores)
results_table_py['Significant/Annotated'] = results_table_py['Significant'] / results_table_py['Annotated']

intTable(results_table_py, folder = folder, fileName = 'GO_CC_all.xlsx', save = True)

Code

%%R
# GenTable with one row per scored CC term.
Res <- GenTable(GOdata, weight=results,
        orderBy="weight", topNodes=length(score(results)))
#print(Res[0:10,])
# Rename the columns so the test column ("weight") is called "Statistics".
colnames(Res) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "Statistics")
# ER: observed over expected number of significant genes per term.
Res$ER <- Res$Significant / Res$Expected
# First call is saved to PDF (`folder` was imported into R by an earlier cell).
image = bubbleplot(Res, Ont = 'CC', fillCol = 'forestgreen')

ggsave(file=paste0(folder, "TopGO_results_CC.pdf"), plot=image, width=12, height=4)

# Second call so the plot is also shown as the cell output.
bubbleplot(Res, Ont = 'CC', fillCol = 'forestgreen')

Code

%%R -i markers
# Genes-in-term figure for CC; plotGenesInTerm_v1 is defined outside this notebook.
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=12, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

# File name follows the BP/MF pattern ("Genes_in_Term_results_<ONT>.pdf"); the
# previous name had a stray ':' which is not a valid file-name character on Windows.
ggsave(file=paste0(folder, "Genes_in_Term_results_CC.pdf"), plot=image, width=12, height=4)

# Called again so the figure is displayed in the cell output.
plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=12, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

Code

%%R -i markers -i folder
# saveGenesInTerm is defined outside this notebook; it is given 20 terms and the CC workbook path.
saveGenesInTerm(Res, GOdata, nterms = 20, path = paste0(folder,'GO_CC_genesInTerm_all.xlsx'), SE = markers)

4.1.0.1 Reactome

Code

# NOTE(review): `msigdb` is not defined in any cell visible in this notebook — confirm it is loaded before this point.
# Reactome gene sets, one row per unique (geneset, genesymbol) pair.
in_collection = msigdb['collection'].isin(['reactome_pathways'])
curated = msigdb[in_collection].drop_duplicates(['geneset', 'genesymbol'])

# Number of genes per gene set.
aggregated = (
    curated[["geneset", "genesymbol"]]
    .groupby("geneset")
    .count()
    .rename(columns={"genesymbol": "gene_count"})
)

# Drop gene sets with more than 200 genes, then those with fewer than 15.
oversized = aggregated.index[aggregated.gene_count > 200].tolist()
undersized = aggregated.index[aggregated.gene_count < 15].tolist()
curated = curated[~curated.geneset.isin(oversized)].copy()
curated = curated[~curated.geneset.isin(undersized)].copy()

Code

# One-column score table used as ORA input, plus a copy that also carries the FDR as `pval`.
rank = markers['-log10(FDR)'].to_frame()

rank_copy = rank.assign(pval=markers.loc[rank.index].FDR)

Code

rank_copy

	-log10(FDR)	pval
TCL1A	312.015978	0.000000e+00
ARID5B	312.015978	0.000000e+00
BARX1	312.015978	0.000000e+00
LAMA4	312.015978	0.000000e+00
MSC	312.015978	9.638777e-313
...	...	...
SIRT4	60.115751	7.660363e-61
KSR2	59.824643	1.497465e-60
NFIC	59.458118	3.482426e-60
ZNF658	59.011660	9.735088e-60
HCN4	57.949119	1.124297e-58

87 rows × 2 columns

Code

# Over-representation analysis against the filtered Reactome gene sets.
# NOTE(review): run_ora_catchErrors is a project helper; n_up=len(rank) with n_bottom=0 presumably puts every gene of `rank` in the tested set — confirm against the helper.
results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
len(results_table_py)

No significant term was found

Code

# Called unconditionally, i.e. also when the ORA result is empty.
intTable(results_table_py, folder = folder, fileName = 'Reactome_all.xlsx', save = True)

Code

# Annotate and plot only when the ORA returned at least one term.
if len(results_table_py) > 0:
    reactome_sets = GO2gene['reactome_pathways']
    results_table_py = getAnnGenes(results_table_py, reactome_sets, rank_copy)
    _, df = plotGenesInTerm(
        results=results_table_py,
        GO2gene=reactome_sets,
        DEGs=rank_copy,
        n_top_terms=10,
        cmap=cmap_all,
    )

Code

# `df` is only assigned when the ORA result is non-empty, hence the same guard before exporting it.
if len(results_table_py) > 0:
    intTable(df, folder = folder, fileName = 'genesInTerm_Reactome_all.xlsx', save = True)

4.1.0.2 KEGG

Code

# NOTE(review): `msigdb` is not defined in any cell visible in this notebook — confirm it is loaded before this point.
# KEGG gene sets, one row per unique (geneset, genesymbol) pair.
in_collection = msigdb['collection'].isin(['kegg_pathways'])
curated = msigdb[in_collection].drop_duplicates(['geneset', 'genesymbol'])

# Number of genes per gene set.
aggregated = (
    curated[["geneset", "genesymbol"]]
    .groupby("geneset")
    .count()
    .rename(columns={"genesymbol": "gene_count"})
)

# Drop gene sets with more than 200 genes, then those with fewer than 15.
oversized = aggregated.index[aggregated.gene_count > 200].tolist()
undersized = aggregated.index[aggregated.gene_count < 15].tolist()
curated = curated[~curated.geneset.isin(oversized)].copy()
curated = curated[~curated.geneset.isin(undersized)].copy()

Code

# Over-representation analysis against the filtered KEGG gene sets, with the same arguments as the Reactome run.
# NOTE(review): run_ora_catchErrors is a project helper; its return type for "no significant term" is not visible here.
results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)

No significant term was found

Code

# Called unconditionally, i.e. also when the ORA result is empty.
intTable(results_table_py, folder = folder, fileName = 'KEGG_all.xlsx', save = True)

Code

# Annotate and plot only when the ORA returned at least one term.
if len(results_table_py) > 0:
    kegg_sets = GO2gene['kegg_pathways']
    results_table_py = getAnnGenes(results_table_py, kegg_sets, rank_copy)
    _, df = plotGenesInTerm(
        results_table_py,
        kegg_sets,
        rank_copy,
        n_top_terms=10,
        n_top_genes=15,
        cmap=cmap_all,
    )

Code

# `df` is only assigned when the ORA result is non-empty, hence the same guard before exporting it.
if len(results_table_py) > 0:
    intTable(df, folder = folder, fileName = 'genesInTerm_KEGG_all.xlsx', save = True)