1 Load environment

Code
import warnings

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import pandas as pd
import numpy as np
import random
import itertools

from tqdm import tqdm

import decoupler as dc
import sys

sys.setrecursionlimit(20000)

sys.path.append("./../../../../utilities_folder")
from utilities import load_object, intTable, plotGenesInTerm, getAnnGenes, run_ora_catchErrors

Set R environment with rpy2:

Code
import rpy2.rinterface_lib.callbacks
import anndata2ri
import logging

from rpy2.robjects import pandas2ri
import rpy2.robjects as ro

sc.settings.verbosity = 0
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

pandas2ri.activate()
anndata2ri.activate()

%load_ext rpy2.ipython

Set up of graphical parameters for Python plots:

Code
%matplotlib inline
sc.set_figure_params(dpi = 300, fontsize = 20)

plt.rcParams['svg.fonttype'] = 'none'

cmap_up = sns.light_palette("red", as_cmap=True)
cmap_down = sns.light_palette("blue", as_cmap=True)
cmap_all = sns.light_palette("seagreen", as_cmap=True)

Set up of graphical parameters for R plots:

Code
default_units = 'in' 
default_res = 300
default_width = 10
default_height = 9

import rpy2
old_setup_graphics = rpy2.ipython.rmagic.RMagics.setup_graphics

def new_setup_graphics(self, args):
    if getattr(args, 'units') is not None:
        if args.units != default_units:
            return old_setup_graphics(self, args)
    args.units = default_units
    if getattr(args, 'res') is None:
        args.res = default_res
    if getattr(args, 'width') is None:
        args.width = default_width
    if getattr(args, 'height') is None:
        args.height = default_height        
    return old_setup_graphics(self, args)


rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics

Here the cell were we inject the parameters using Quarto renderer:

Code
# Injected Parameters
N = 3
Code
# Injected Parameters
N = 0

Import R libraries:

Code
%%R
source('./../../../../utilities_folder/GO_helper.r')
loc <- './../../../../R_loc' # pointing to the renv environment

.libPaths(loc)

library('topGO')
library('org.Hs.eg.db')
library(dplyr)
library(ggplot2)

Set output folders:

Code
output_folder = './'
folder = './tables/cluster_' + str(N) + '/'

2 Load data

Here we load the dataframe:

Code
markers = pd.read_excel(folder + 'genes_in_cluster_' + str(N) + '.xlsx', index_col = 0)
markers
logFC.celltypes_leveledhEGCLC logFC.celltypes_leveledhPGCLC logFC.celltypes_levelediMeLC logCPM LR PValue FDR clusters
CXCR4 -11.172324 7.322004 0.804604 8.483506 3951.022844 0.000000e+00 0.000000e+00 0
FGFR3 -11.002481 4.398754 2.306831 5.388332 1732.310638 0.000000e+00 0.000000e+00 0
ZNF703 -11.000985 6.292293 2.813550 7.291843 3595.651687 0.000000e+00 0.000000e+00 0
MORC4 -10.933221 4.944114 0.058590 5.774889 2161.147921 0.000000e+00 0.000000e+00 0
KLF4 -10.795117 6.570967 2.325890 7.348871 3722.692849 0.000000e+00 0.000000e+00 0
ZYG11A -11.832004 3.313038 0.512455 5.101194 1489.567078 1.086944e-322 1.165007e-320 0
VENTX -11.812635 3.107866 -0.353142 4.823694 1344.925545 2.628696e-291 2.206450e-289 0
WLS -11.822818 1.642495 3.544481 4.750616 1214.091220 6.424869e-263 4.191180e-261 0
PLAAT5 -11.705724 2.655509 2.072497 4.575988 1072.617287 3.174551e-232 1.686024e-230 0
FOSB -10.759300 2.303687 3.482677 3.882798 786.119387 4.431795e-170 1.445512e-168 0
SMAD6 -11.133028 2.567099 0.787656 3.688101 711.711762 6.059761e-154 1.668241e-152 0
SFMBT2 -11.301186 2.386058 0.675743 3.661201 694.787689 2.833088e-150 7.424793e-149 0
CPEB2 -10.992624 2.551735 1.528002 3.619749 680.196740 4.130942e-147 1.048281e-145 0
MYB -11.219598 2.138289 0.235243 3.345501 607.049020 2.987073e-131 6.181695e-130 0
HTR1D -11.940274 -0.000371 1.866980 3.475240 597.634960 3.281710e-129 6.543250e-128 0
FLRT2 -10.661898 -10.661898 2.594260 2.418750 577.838936 6.418928e-125 1.217163e-123 0
POMC -11.098515 1.907786 0.257493 3.080348 531.334862 7.718626e-115 1.255375e-113 0
TGFBR3L -12.285498 -12.285498 -1.135665 2.311232 516.979118 9.975280e-112 1.559027e-110 0
CD1D -11.036460 1.641919 1.741398 3.076538 514.638767 3.207336e-111 4.941932e-110 0
PLEKHA7 -11.399815 1.461997 0.103141 3.049715 504.889676 4.159066e-109 6.224597e-108 0
IRF9 -11.055282 1.867515 0.647282 2.966910 504.741364 4.478554e-109 6.686079e-108 0
CUZD1 -11.654184 -11.654184 0.713973 2.272964 502.745032 1.212771e-108 1.797146e-107 0
CR1L -11.365167 -11.365167 1.158795 2.204925 494.362252 7.951324e-107 1.145735e-105 0
CEBPD -11.145091 1.602761 -0.036129 2.902001 482.454019 3.026958e-104 4.185780e-103 0
SP8 -11.180913 -11.180913 1.308208 2.113101 477.366432 3.832386e-103 5.209528e-102 0
CPNE8 -11.789643 0.809946 -0.775524 2.867771 461.385381 1.112622e-99 1.417707e-98 0
SOX8 -11.762601 -11.762601 -0.079794 2.081350 460.900518 1.417121e-99 1.798066e-98 0
EGR3 -10.838320 1.267103 1.796737 2.715622 441.429992 2.344481e-95 2.769764e-94 0
TNMD -11.424248 -11.424248 0.458838 1.950109 441.172775 2.665475e-95 3.142799e-94 0
SPON1 -11.067239 -11.067239 0.987076 1.848919 429.450846 9.232904e-93 1.041565e-91 0
SLC27A2 -11.491278 -11.491278 0.009423 1.858953 425.456535 6.771301e-92 7.581709e-91 0
SLCO5A1 -11.594968 -0.200657 0.874684 2.660573 409.654839 1.793855e-88 1.887085e-87 0
VIPR1 -11.305458 -11.305458 0.147051 1.731361 406.139609 1.035733e-87 1.072640e-86 0
PLIN2 -10.820458 1.107431 1.439851 2.531766 403.988732 3.028009e-87 3.109084e-86 0
PARD3B -11.203304 -11.203304 0.218544 1.663814 395.953948 1.665512e-85 1.646717e-84 0
BEND6 -11.161487 -11.161487 0.314171 1.652496 395.183341 2.446031e-85 2.414450e-84 0
ERAP1 -11.363301 -11.363301 -0.299452 1.650015 394.521479 3.402681e-85 3.350483e-84 0
GRIA3 -11.241155 -11.241155 0.045862 1.638949 392.317321 1.021499e-84 9.911924e-84 0
CNTFR -11.318841 0.235529 0.836421 2.524169 386.071293 2.301856e-83 2.172106e-82 0
SLC10A4 -11.028984 -11.028984 0.385695 1.560395 381.607164 2.132746e-82 1.979842e-81 0
CYP27A1 -11.610612 0.093257 -0.242183 2.447008 375.567416 4.335184e-81 3.933123e-80 0
CSF1 -11.009662 -11.009662 0.261565 1.495500 372.099518 2.443703e-80 2.187305e-79 0
EXOG -10.969232 -10.969232 -0.029743 1.361055 353.100917 3.178516e-76 2.643917e-75 0
NUP62CL -11.422266 0.052553 -0.540530 2.194061 342.222699 7.205153e-74 5.742593e-73 0
PCDHGC3 -11.148685 -0.327086 0.620974 2.102311 328.821400 5.742157e-71 4.315787e-70 0
EGR2 -10.869229 0.639334 0.303449 1.997976 324.396475 5.212082e-70 3.826337e-69 0
IL17RB -11.093504 0.061843 0.012494 1.967727 313.503087 1.188802e-67 8.408477e-67 0
NEURL2 -10.862253 0.329810 0.390332 1.930991 310.946811 4.250413e-67 2.985237e-66 0
IRF1 -11.011890 0.122855 -0.082426 1.890139 306.187842 4.555127e-66 3.145868e-65 0
TNFRSF25 -10.973480 0.008988 0.141392 1.822972 300.989375 6.076299e-65 4.132227e-64 0
Code
allGenes_series = pd.read_csv('./tables/all_bkg_genes.csv')
allGenes = allGenes_series['0'].tolist()

Here we load the dictionary that associates to each GO term its genes:

Code
GO2gene = load_object('./../../../../data/GO2gene_complete.pickle')

3 Markers of cluster

We filter genes for the cluster under investigation based on the p-value adjusted that we then convert in -log(p-value adjusted):

Code

markers = markers[markers.FDR < 0.01]
markers['-log10(FDR)'] = -np.log10(markers.FDR)
markers = markers.replace(np.inf, markers[markers['-log10(FDR)'] != np.inf]['-log10(FDR)'].max())
markers
logFC.celltypes_leveledhEGCLC logFC.celltypes_leveledhPGCLC logFC.celltypes_levelediMeLC logCPM LR PValue FDR clusters -log10(FDR)
CXCR4 -11.172324 7.322004 0.804604 8.483506 3951.022844 0.000000e+00 0.000000e+00 0 319.933672
FGFR3 -11.002481 4.398754 2.306831 5.388332 1732.310638 0.000000e+00 0.000000e+00 0 319.933672
ZNF703 -11.000985 6.292293 2.813550 7.291843 3595.651687 0.000000e+00 0.000000e+00 0 319.933672
MORC4 -10.933221 4.944114 0.058590 5.774889 2161.147921 0.000000e+00 0.000000e+00 0 319.933672
KLF4 -10.795117 6.570967 2.325890 7.348871 3722.692849 0.000000e+00 0.000000e+00 0 319.933672
ZYG11A -11.832004 3.313038 0.512455 5.101194 1489.567078 1.086944e-322 1.165007e-320 0 319.933672
VENTX -11.812635 3.107866 -0.353142 4.823694 1344.925545 2.628696e-291 2.206450e-289 0 288.656306
WLS -11.822818 1.642495 3.544481 4.750616 1214.091220 6.424869e-263 4.191180e-261 0 260.377664
PLAAT5 -11.705724 2.655509 2.072497 4.575988 1072.617287 3.174551e-232 1.686024e-230 0 229.773136
FOSB -10.759300 2.303687 3.482677 3.882798 786.119387 4.431795e-170 1.445512e-168 0 167.839978
SMAD6 -11.133028 2.567099 0.787656 3.688101 711.711762 6.059761e-154 1.668241e-152 0 151.777741
SFMBT2 -11.301186 2.386058 0.675743 3.661201 694.787689 2.833088e-150 7.424793e-149 0 148.129316
CPEB2 -10.992624 2.551735 1.528002 3.619749 680.196740 4.130942e-147 1.048281e-145 0 144.979522
MYB -11.219598 2.138289 0.235243 3.345501 607.049020 2.987073e-131 6.181695e-130 0 129.208892
HTR1D -11.940274 -0.000371 1.866980 3.475240 597.634960 3.281710e-129 6.543250e-128 0 127.184207
FLRT2 -10.661898 -10.661898 2.594260 2.418750 577.838936 6.418928e-125 1.217163e-123 0 122.914651
POMC -11.098515 1.907786 0.257493 3.080348 531.334862 7.718626e-115 1.255375e-113 0 112.901227
TGFBR3L -12.285498 -12.285498 -1.135665 2.311232 516.979118 9.975280e-112 1.559027e-110 0 109.807146
CD1D -11.036460 1.641919 1.741398 3.076538 514.638767 3.207336e-111 4.941932e-110 0 109.306103
PLEKHA7 -11.399815 1.461997 0.103141 3.049715 504.889676 4.159066e-109 6.224597e-108 0 107.205889
IRF9 -11.055282 1.867515 0.647282 2.966910 504.741364 4.478554e-109 6.686079e-108 0 107.174828
CUZD1 -11.654184 -11.654184 0.713973 2.272964 502.745032 1.212771e-108 1.797146e-107 0 106.745417
CR1L -11.365167 -11.365167 1.158795 2.204925 494.362252 7.951324e-107 1.145735e-105 0 104.940916
CEBPD -11.145091 1.602761 -0.036129 2.902001 482.454019 3.026958e-104 4.185780e-103 0 102.378224
SP8 -11.180913 -11.180913 1.308208 2.113101 477.366432 3.832386e-103 5.209528e-102 0 101.283202
CPNE8 -11.789643 0.809946 -0.775524 2.867771 461.385381 1.112622e-99 1.417707e-98 0 97.848413
SOX8 -11.762601 -11.762601 -0.079794 2.081350 460.900518 1.417121e-99 1.798066e-98 0 97.745194
EGR3 -10.838320 1.267103 1.796737 2.715622 441.429992 2.344481e-95 2.769764e-94 0 93.557557
TNMD -11.424248 -11.424248 0.458838 1.950109 441.172775 2.665475e-95 3.142799e-94 0 93.502683
SPON1 -11.067239 -11.067239 0.987076 1.848919 429.450846 9.232904e-93 1.041565e-91 0 90.982313
SLC27A2 -11.491278 -11.491278 0.009423 1.858953 425.456535 6.771301e-92 7.581709e-91 0 90.120233
SLCO5A1 -11.594968 -0.200657 0.874684 2.660573 409.654839 1.793855e-88 1.887085e-87 0 86.724209
VIPR1 -11.305458 -11.305458 0.147051 1.731361 406.139609 1.035733e-87 1.072640e-86 0 85.969546
PLIN2 -10.820458 1.107431 1.439851 2.531766 403.988732 3.028009e-87 3.109084e-86 0 85.507367
PARD3B -11.203304 -11.203304 0.218544 1.663814 395.953948 1.665512e-85 1.646717e-84 0 83.783381
BEND6 -11.161487 -11.161487 0.314171 1.652496 395.183341 2.446031e-85 2.414450e-84 0 83.617182
ERAP1 -11.363301 -11.363301 -0.299452 1.650015 394.521479 3.402681e-85 3.350483e-84 0 83.474893
GRIA3 -11.241155 -11.241155 0.045862 1.638949 392.317321 1.021499e-84 9.911924e-84 0 83.003842
CNTFR -11.318841 0.235529 0.836421 2.524169 386.071293 2.301856e-83 2.172106e-82 0 81.663119
SLC10A4 -11.028984 -11.028984 0.385695 1.560395 381.607164 2.132746e-82 1.979842e-81 0 80.703370
CYP27A1 -11.610612 0.093257 -0.242183 2.447008 375.567416 4.335184e-81 3.933123e-80 0 79.405262
CSF1 -11.009662 -11.009662 0.261565 1.495500 372.099518 2.443703e-80 2.187305e-79 0 78.660091
EXOG -10.969232 -10.969232 -0.029743 1.361055 353.100917 3.178516e-76 2.643917e-75 0 74.577752
NUP62CL -11.422266 0.052553 -0.540530 2.194061 342.222699 7.205153e-74 5.742593e-73 0 72.240892
PCDHGC3 -11.148685 -0.327086 0.620974 2.102311 328.821400 5.742157e-71 4.315787e-70 0 69.364940
EGR2 -10.869229 0.639334 0.303449 1.997976 324.396475 5.212082e-70 3.826337e-69 0 68.417217
IL17RB -11.093504 0.061843 0.012494 1.967727 313.503087 1.188802e-67 8.408477e-67 0 66.075283
NEURL2 -10.862253 0.329810 0.390332 1.930991 310.946811 4.250413e-67 2.985237e-66 0 65.525021
IRF1 -11.011890 0.122855 -0.082426 1.890139 306.187842 4.555127e-66 3.145868e-65 0 64.502260
TNFRSF25 -10.973480 0.008988 0.141392 1.822972 300.989375 6.076299e-65 4.132227e-64 0 63.383816

3.0.1 All regulated

Code
all_sign = markers.index.tolist()
allSelected = allGenes_series['0'].isin(all_sign).astype('int').tolist()

4 topGO

4.1 All significant

Code
%%R -i allSelected -i allGenes

allGenes_v <- c(allSelected)
#print(allGenes_v)
names(allGenes_v) <- allGenes
allGenes_v <- unlist(allGenes_v)

geneNames <- c(allGenes)

ann_org_BP <- topGO::annFUN.org(whichOnto='BP', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

ann_org_MF <- topGO::annFUN.org(whichOnto='MF', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

ann_org_CC <- topGO::annFUN.org(whichOnto='CC', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

selection <- function(allScores){return (as.logical(allScores))}
Code
%%R
#print(lapply(ann_org_BP, count_genes))

GOdata <- new("topGOdata",
  ontology="BP",
  allGenes=allGenes_v,
  annot=annFUN.GO2genes,
  GO2genes=ann_org_BP,
  geneSel = selection,
  nodeSize=10)
Code
%%R -o results

results <- runTest(GOdata, algorithm="weight01",statistic="fisher")
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata

genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([10903,    48,    10,  1519], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight=results,
        orderBy="weight", topNodes=len(scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores': scores_py, 'GO.ID': score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID', right_on = 'GO.ID')
results_table_py
GO.ID Term Annotated Significant Expected weight Scores
0 GO:0045444 fat cell differentiation 170 6 0.75 9.4e-05 0.000094
1 GO:1902895 positive regulation of miRNA transcripti... 39 3 0.17 0.00065 0.000655
2 GO:0007155 cell adhesion 913 15 4.02 0.00074 0.000737
3 GO:0019883 antigen processing and presentation of e... 10 2 0.04 0.00084 0.000835
4 GO:0045944 positive regulation of transcription by ... 873 11 3.84 0.00117 0.001172
... ... ... ... ... ... ... ...
5697 GO:2001256 regulation of store-operated calcium ent... 11 0 0.05 1.00000 1.000000
5698 GO:2001257 regulation of cation channel activity 104 0 0.46 1.00000 1.000000
5699 GO:2001258 negative regulation of cation channel ac... 23 0 0.10 1.00000 1.000000
5700 GO:2001259 positive regulation of cation channel ac... 43 0 0.19 1.00000 1.000000
5701 GO:2001267 regulation of cysteine-type endopeptidas... 13 0 0.06 1.00000 1.000000

5702 rows × 7 columns

Code
results_table_py = results_table_py[results_table_py['Scores'] < 0.05]
results_table_py = results_table_py[results_table_py['Annotated'] < 200]
results_table_py = results_table_py[results_table_py['Annotated'] > 15]
Code
results_table_py['-log10(pvalue)'] = - np.log10(results_table_py.Scores)
results_table_py['Significant/Annotated'] = results_table_py['Significant'] / results_table_py['Annotated']
Code
intTable(results_table_py, folder = folder, fileName = 'GO_BP_all.xlsx', save = True)
Code
%%R -i folder
Res <- GenTable(GOdata, weight=results,
        orderBy="weight", topNodes=length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "Statistics")
Res$ER <- Res$Significant / Res$Expected

image = bubbleplot(Res, Ont = 'BP', fillCol = 'forestgreen')
ggsave(file=paste0(folder, "TopGO_results_BP.pdf"), plot=image, width=12, height=4)

bubbleplot(Res, Ont = 'BP', fillCol = 'forestgreen')

Code
%%R -i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

ggsave(file=paste0(folder, "Genes_in_Term_results_BP.pdf"), plot=image, width=12, height=4)

plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

Code
%%R -i markers -i folder
saveGenesInTerm(Res, GOdata, nterms = 20, path = paste0(folder,'GO_BP_genesInTerm_all.xlsx'), SE = markers)
Code
%%R

GOdata <- new("topGOdata",
  ontology="MF",
  allGenes=allGenes_v,
  annot=annFUN.GO2genes,
  GO2genes=ann_org_MF,
  geneSel = selection,
  nodeSize=10)
Code
%%R -o results

results <- runTest(GOdata, algorithm="weight01",statistic="fisher")
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata

genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([11203,    48,    10,   246], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight=results,
        orderBy="weight", topNodes=len(scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores': scores_py, 'GO.ID': score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID', right_on = 'GO.ID')
results_table_py = results_table_py[results_table_py['Scores'] < 0.05]
results_table_py = results_table_py[results_table_py['Annotated'] < 200]
results_table_py = results_table_py[results_table_py['Annotated'] > 15]

intTable(results_table_py, folder = folder, fileName = 'GO_MF_all.xlsx', save = True)
Code
%%R
Res <- GenTable(GOdata, weight=results,
        orderBy="weight", topNodes=length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "Statistics")
Res$ER <- Res$Significant / Res$Expected

image = bubbleplot(Res, Ont = 'MF', fillCol = 'forestgreen')

ggsave(file=paste0(folder, "TopGO_results_MF.pdf"), plot=image, width=12, height=4)

bubbleplot(Res, Ont = 'MF', fillCol = 'forestgreen')

Code
%%R -i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

ggsave(file=paste0(folder, "Genes_in_Term_results_MF.pdf"), plot=image, width=12, height=4)

plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

Code
%%R -i markers -i folder
saveGenesInTerm(Res, GOdata, nterms = 20, path = paste0(folder,'GO_MF_genesInTerm_all.xlsx'), SE = markers)
Code
%%R

GOdata <- new("topGOdata",
  ontology="CC",
  allGenes=allGenes_v,
  annot=annFUN.GO2genes,
  GO2genes=ann_org_CC,
  geneSel = selection,
  nodeSize=10)
Code
%%R -o results

results <- runTest(GOdata, algorithm="weight01",statistic="fisher")
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata

genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([11323,    50,    10,   167], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight=results,
        orderBy="weight", topNodes=len(scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores': scores_py, 'GO.ID': score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID', right_on = 'GO.ID')
Code
results_table_py = results_table_py[results_table_py['Scores'] < 0.05]
results_table_py = results_table_py[results_table_py['Annotated'] < 200]
results_table_py = results_table_py[results_table_py['Annotated'] > 15]

intTable(results_table_py, folder = folder, fileName = 'GO_CC_all.xlsx', save = True)
Code
%%R
Res <- GenTable(GOdata, weight=results,
        orderBy="weight", topNodes=length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "Statistics")
Res$ER <- Res$Significant / Res$Expected
image = bubbleplot(Res, Ont = 'CC', fillCol = 'forestgreen')

ggsave(file=paste0(folder, "TopGO_results_CC.pdf"), plot=image, width=12, height=4)

bubbleplot(Res, Ont = 'CC', fillCol = 'forestgreen')

Code
%%R -i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=12, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

ggsave(file=paste0(folder, "Genes:_in_Term_results_CC.pdf"), plot=image, width=12, height=4)

plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=12, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

Code
%%R -i markers -i folder
saveGenesInTerm(Res, GOdata, nterms = 20, path = paste0(folder,'GO_CC_genesInTerm_all.xlsx'), SE = markers)

4.1.0.1 Reactome

Code
curated = msigdb[msigdb['collection'].isin(['reactome_pathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]

aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 200].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()
Code
rank = pd.DataFrame(markers['-log10(FDR)'])

rank_copy = rank.copy()
rank_copy['pval'] = markers.loc[rank.index].FDR
Code
rank_copy
-log10(FDR) pval
CXCR4 319.933672 0.000000e+00
FGFR3 319.933672 0.000000e+00
ZNF703 319.933672 0.000000e+00
MORC4 319.933672 0.000000e+00
KLF4 319.933672 0.000000e+00
ZYG11A 319.933672 1.165007e-320
VENTX 288.656306 2.206450e-289
WLS 260.377664 4.191180e-261
PLAAT5 229.773136 1.686024e-230
FOSB 167.839978 1.445512e-168
SMAD6 151.777741 1.668241e-152
SFMBT2 148.129316 7.424793e-149
CPEB2 144.979522 1.048281e-145
MYB 129.208892 6.181695e-130
HTR1D 127.184207 6.543250e-128
FLRT2 122.914651 1.217163e-123
POMC 112.901227 1.255375e-113
TGFBR3L 109.807146 1.559027e-110
CD1D 109.306103 4.941932e-110
PLEKHA7 107.205889 6.224597e-108
IRF9 107.174828 6.686079e-108
CUZD1 106.745417 1.797146e-107
CR1L 104.940916 1.145735e-105
CEBPD 102.378224 4.185780e-103
SP8 101.283202 5.209528e-102
CPNE8 97.848413 1.417707e-98
SOX8 97.745194 1.798066e-98
EGR3 93.557557 2.769764e-94
TNMD 93.502683 3.142799e-94
SPON1 90.982313 1.041565e-91
SLC27A2 90.120233 7.581709e-91
SLCO5A1 86.724209 1.887085e-87
VIPR1 85.969546 1.072640e-86
PLIN2 85.507367 3.109084e-86
PARD3B 83.783381 1.646717e-84
BEND6 83.617182 2.414450e-84
ERAP1 83.474893 3.350483e-84
GRIA3 83.003842 9.911924e-84
CNTFR 81.663119 2.172106e-82
SLC10A4 80.703370 1.979842e-81
CYP27A1 79.405262 3.933123e-80
CSF1 78.660091 2.187305e-79
EXOG 74.577752 2.643917e-75
NUP62CL 72.240892 5.742593e-73
PCDHGC3 69.364940 4.315787e-70
EGR2 68.417217 3.826337e-69
IL17RB 66.075283 8.408477e-67
NEURL2 65.525021 2.985237e-66
IRF1 64.502260 3.145868e-65
TNFRSF25 63.383816 4.132227e-64
Code
results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
len(results_table_py)
No significant term was found
0
Code
intTable(results_table_py, folder = folder, fileName = 'Reactome_all.xlsx', save = True)
Code
if len(results_table_py) > 0:
    results_table_py = getAnnGenes(results_table_py, GO2gene['reactome_pathways'], rank_copy)
    _, df = plotGenesInTerm(results = results_table_py, GO2gene = GO2gene['reactome_pathways'], DEGs = rank_copy, n_top_terms = 10, cmap = cmap_all)
Code
if len(results_table_py) > 0:
    intTable(df, folder = folder, fileName = 'genesInTerm_Reactome_all.xlsx', save = True)

4.1.0.2 KEGG

Code
curated = msigdb[msigdb['collection'].isin(['kegg_pathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]

aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 200].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()
Code
results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
No significant term was found
Code
intTable(results_table_py, folder = folder, fileName = 'KEGG_all.xlsx', save = True)
Code
if len(results_table_py) > 0:
    results_table_py = getAnnGenes(results_table_py, GO2gene['kegg_pathways'], rank_copy)
    _, df = plotGenesInTerm(results_table_py, GO2gene['kegg_pathways'], rank_copy, n_top_terms = 10, n_top_genes = 15, cmap = cmap_all)
Code
if len(results_table_py) > 0:
    intTable(df, folder = folder, fileName = 'genesInTerm_KEGG_all.xlsx', save = True)