1 Load environment

Code
import warnings

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import pandas as pd
import numpy as np
import random
import itertools

from tqdm import tqdm

import decoupler as dc
import sys

sys.setrecursionlimit(20000)

sys.path.append("./../../../../utilities_folder")
from utilities import load_object, intTable, plotGenesInTerm, getAnnGenes, run_ora_catchErrors

Set R environment with rpy2:

Code
import rpy2.rinterface_lib.callbacks
import anndata2ri
import logging

from rpy2.robjects import pandas2ri
import rpy2.robjects as ro

sc.settings.verbosity = 0
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

pandas2ri.activate()
anndata2ri.activate()

%load_ext rpy2.ipython

Set up of graphical parameters for Python plots:

Code
%matplotlib inline
sc.set_figure_params(dpi = 300, fontsize = 20)

plt.rcParams['svg.fonttype'] = 'none'

cmap_up = sns.light_palette("red", as_cmap=True)
cmap_down = sns.light_palette("blue", as_cmap=True)
cmap_all = sns.light_palette("seagreen", as_cmap=True)

Set up of graphical parameters for R plots:

Code
default_units = 'in' 
default_res = 300
default_width = 10
default_height = 9

import rpy2
old_setup_graphics = rpy2.ipython.rmagic.RMagics.setup_graphics

def new_setup_graphics(self, args):
    if getattr(args, 'units') is not None:
        if args.units != default_units:
            return old_setup_graphics(self, args)
    args.units = default_units
    if getattr(args, 'res') is None:
        args.res = default_res
    if getattr(args, 'width') is None:
        args.width = default_width
    if getattr(args, 'height') is None:
        args.height = default_height        
    return old_setup_graphics(self, args)


rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics

Here the cell were we inject the parameters using Quarto renderer:

Code
# Injected Parameters
N = 3
Code
# Injected Parameters
N = 14

Import R libraries:

Code
%%R
source('./../../../../utilities_folder/GO_helper.r')
loc <- './../../../../R_loc' # pointing to the renv environment

.libPaths(loc)

library('topGO')
library('org.Hs.eg.db')
library(dplyr)
library(ggplot2)

Set output folders:

Code
output_folder = './'
folder = './tables/cluster_' + str(N) + '/'

2 Load data

Here we load the dataframe:

Code
markers = pd.read_excel(folder + 'genes_in_cluster_' + str(N) + '.xlsx', index_col = 0)
markers
logFC.celltypes_leveledhEGCLC logFC.celltypes_leveledhPGCLC logFC.celltypes_levelediMeLC logCPM LR PValue FDR clusters
KRTDAP -12.016842 -12.016842 -12.016842 1.785090 554.411579 7.685864e-120 1.352689e-118 14
PVALB -11.661893 -11.661893 -11.661893 1.446665 492.726668 1.798396e-106 2.575912e-105 14
CYGB -11.590367 -11.590367 -11.590367 1.388522 481.503036 4.865001e-104 6.712024e-103 14
BCHE -11.552081 -11.552081 -11.552081 1.338340 474.309595 1.761424e-102 2.362276e-101 14
NFIA -11.543503 -11.543503 -11.543503 1.338488 473.452072 2.701975e-102 3.599535e-101 14
NPY5R -11.522267 -11.522267 -11.522267 1.332806 470.944021 9.443850e-102 1.244287e-100 14
NPW -11.516360 -11.516360 -11.516360 1.313345 469.078150 2.395850e-101 3.136029e-100 14
GNMT -11.505465 -11.505465 -11.505465 1.299169 467.082914 6.483327e-101 8.431134e-100 14
FIBCD1 -11.444936 -11.444936 -11.444936 1.242440 458.622018 4.416779e-99 5.568760e-98 14
CCDC78 -11.407576 -11.407576 -11.407576 1.215570 453.788973 4.923766e-98 6.105368e-97 14
AURKC -11.389224 -11.389224 -11.389224 1.180732 450.515009 2.521530e-97 3.085212e-96 14
BHLHE41 -11.355354 -11.355354 -11.355354 1.165658 446.563530 1.810559e-96 2.181941e-95 14
SLC16A12 -11.336495 -11.336495 -11.336495 1.159348 444.463544 5.161781e-96 6.164862e-95 14
FAM3B -11.253109 -11.253109 -11.253109 1.081410 433.167816 1.445677e-93 1.652615e-92 14
SLC30A3 -11.240689 -11.240689 -11.240689 1.059236 431.063008 4.131157e-93 4.700121e-92 14
CA7 -11.213118 -11.213118 -11.213118 1.024725 427.018972 3.105887e-92 3.490633e-91 14
C1QTNF6 -11.175776 -11.175776 -11.175776 0.989791 422.055403 3.693889e-91 4.056519e-90 14
DDX60 -11.165692 -11.165692 -11.165692 0.990941 421.148868 5.805916e-91 6.341075e-90 14
MCOLN2 -11.165931 -11.165931 -11.165931 0.983494 420.869763 6.673202e-91 7.275063e-90 14
KCNQ4 -11.157494 -11.157494 -11.157494 0.975465 419.749288 1.166987e-90 1.271084e-89 14
CST6 -11.144914 -11.144914 -11.144914 0.968273 418.274329 2.435504e-90 2.640772e-89 14
ALOX12 -11.132350 -11.132350 -11.132350 0.948100 416.282664 6.577212e-90 7.074039e-89 14
REXO5 -11.126199 -11.126199 -11.126199 0.942958 415.499184 9.722216e-90 1.042858e-88 14
HRH3 -11.115619 -11.115619 -11.115619 0.952246 414.882838 1.322155e-89 1.414423e-88 14
ARMC4 -11.105176 -11.105176 -11.105176 0.931578 413.075683 3.256548e-89 3.459146e-88 14
HRC -11.092823 -11.092823 -11.092823 0.911069 411.104743 8.703784e-89 9.196437e-88 14
DHDH -11.091171 -11.091171 -11.091171 0.909455 410.886547 9.704519e-89 1.024480e-87 14
RASGRP1 -11.083438 -11.083438 -11.083438 0.917490 410.487358 1.184260e-88 1.247996e-87 14
SHISA6 -11.054033 -11.054033 -11.054033 0.888521 406.606404 8.206021e-88 8.513126e-87 14
MCTP2 -11.056345 -11.056345 -11.056345 0.872171 406.173908 1.018164e-87 1.055356e-86 14
CLIC6 -11.041824 -11.041824 -11.041824 0.879920 405.137714 1.707153e-87 1.758881e-86 14
CFP -11.040070 -11.040070 -11.040070 0.846821 403.674026 3.542634e-87 3.634379e-86 14
DPP4 -11.026833 -11.026833 -11.026833 0.861772 403.037524 4.866312e-87 4.971093e-86 14
CDKL2 -11.028050 -11.028050 -11.028050 0.849949 402.683933 5.804861e-87 5.914749e-86 14
HCLS1 -11.023099 -11.023099 -11.023099 0.819798 401.057680 1.306332e-86 1.323199e-85 14
HMX2 -11.003578 -11.003578 -11.003578 0.865132 401.044914 1.314676e-86 1.330528e-85 14
GRIN1 -11.013697 -11.013697 -11.013697 0.837658 400.880668 1.426908e-86 1.442896e-85 14
FBXL2 -11.009009 -11.009009 -11.009009 0.848432 400.874333 1.431424e-86 1.446244e-85 14
ALS2CL -11.013210 -11.013210 -11.013210 0.834624 400.717361 1.547995e-86 1.560083e-85 14
MICB -11.008766 -11.008766 -11.008766 0.831651 400.192631 2.011084e-86 2.023390e-85 14
KRBOX4 -11.008902 -11.008902 -11.008902 0.826818 400.016898 2.195307e-86 2.206890e-85 14
GUCA1B -10.986123 -10.986123 -10.986123 0.805985 397.116645 9.326200e-86 9.259089e-85 14
AZU1 -10.955768 -10.955768 -10.955768 0.765337 392.777428 8.120450e-85 7.917933e-84 14
ADAMTS3 -10.945374 -10.945374 -10.945374 0.789280 392.756470 8.205772e-85 7.994633e-84 14
ADORA1 -10.920705 -10.920705 -10.920705 0.773319 389.907541 3.397690e-84 3.249600e-83 14
PLA2G4A -10.924555 -10.924555 -10.924555 0.751729 389.418145 4.336939e-84 4.134733e-83 14
TMEM249 -10.830970 -10.830970 -10.830970 0.663328 377.661781 1.525539e-81 1.398857e-80 14
AL135905.2 -10.811029 -10.811029 -10.811029 0.637471 374.921455 5.982800e-81 5.419739e-80 14
CCT6B -10.729469 -10.729469 -10.729469 0.560234 364.884183 8.925023e-79 7.808094e-78 14
NPAS4 -10.669784 -10.669784 -10.669784 0.486103 357.006658 4.533972e-77 3.835185e-76 14
IL20RB -10.613298 -10.613298 -10.613298 0.439829 350.493393 1.166381e-75 9.622046e-75 14
KCNJ14 -10.584749 -10.584749 -10.584749 0.409276 346.975703 6.737820e-75 5.482987e-74 14
FUT5 -10.539339 -10.539339 -10.539339 0.366784 341.637204 9.647441e-74 7.673839e-73 14
SPINK1 -10.454414 -10.454414 -10.454414 0.276592 331.417874 1.573822e-71 1.194851e-70 14
Code
allGenes_series = pd.read_csv('./tables/all_bkg_genes.csv')
allGenes = allGenes_series['0'].tolist()

Here we load the dictionary that associates to each GO term its genes:

Code
GO2gene = load_object('./../../../../data/GO2gene_complete.pickle')

3 Markers of cluster

We filter genes for the cluster under investigation based on the p-value adjusted that we then convert in -log(p-value adjusted):

Code

markers = markers[markers.FDR < 0.01]
markers['-log10(FDR)'] = -np.log10(markers.FDR)
markers = markers.replace(np.inf, markers[markers['-log10(FDR)'] != np.inf]['-log10(FDR)'].max())
markers
logFC.celltypes_leveledhEGCLC logFC.celltypes_leveledhPGCLC logFC.celltypes_levelediMeLC logCPM LR PValue FDR clusters -log10(FDR)
KRTDAP -12.016842 -12.016842 -12.016842 1.785090 554.411579 7.685864e-120 1.352689e-118 14 117.868802
PVALB -11.661893 -11.661893 -11.661893 1.446665 492.726668 1.798396e-106 2.575912e-105 14 104.589069
CYGB -11.590367 -11.590367 -11.590367 1.388522 481.503036 4.865001e-104 6.712024e-103 14 102.173146
BCHE -11.552081 -11.552081 -11.552081 1.338340 474.309595 1.761424e-102 2.362276e-101 14 100.626669
NFIA -11.543503 -11.543503 -11.543503 1.338488 473.452072 2.701975e-102 3.599535e-101 14 100.443754
NPY5R -11.522267 -11.522267 -11.522267 1.332806 470.944021 9.443850e-102 1.244287e-100 14 99.905080
NPW -11.516360 -11.516360 -11.516360 1.313345 469.078150 2.395850e-101 3.136029e-100 14 99.503620
GNMT -11.505465 -11.505465 -11.505465 1.299169 467.082914 6.483327e-101 8.431134e-100 14 99.074114
FIBCD1 -11.444936 -11.444936 -11.444936 1.242440 458.622018 4.416779e-99 5.568760e-98 14 97.254242
CCDC78 -11.407576 -11.407576 -11.407576 1.215570 453.788973 4.923766e-98 6.105368e-97 14 96.214288
AURKC -11.389224 -11.389224 -11.389224 1.180732 450.515009 2.521530e-97 3.085212e-96 14 95.510715
BHLHE41 -11.355354 -11.355354 -11.355354 1.165658 446.563530 1.810559e-96 2.181941e-95 14 94.661157
SLC16A12 -11.336495 -11.336495 -11.336495 1.159348 444.463544 5.161781e-96 6.164862e-95 14 94.210077
FAM3B -11.253109 -11.253109 -11.253109 1.081410 433.167816 1.445677e-93 1.652615e-92 14 91.781828
SLC30A3 -11.240689 -11.240689 -11.240689 1.059236 431.063008 4.131157e-93 4.700121e-92 14 91.327891
CA7 -11.213118 -11.213118 -11.213118 1.024725 427.018972 3.105887e-92 3.490633e-91 14 90.457096
C1QTNF6 -11.175776 -11.175776 -11.175776 0.989791 422.055403 3.693889e-91 4.056519e-90 14 89.391846
DDX60 -11.165692 -11.165692 -11.165692 0.990941 421.148868 5.805916e-91 6.341075e-90 14 89.197837
MCOLN2 -11.165931 -11.165931 -11.165931 0.983494 420.869763 6.673202e-91 7.275063e-90 14 89.138163
KCNQ4 -11.157494 -11.157494 -11.157494 0.975465 419.749288 1.166987e-90 1.271084e-89 14 88.895826
CST6 -11.144914 -11.144914 -11.144914 0.968273 418.274329 2.435504e-90 2.640772e-89 14 88.578269
ALOX12 -11.132350 -11.132350 -11.132350 0.948100 416.282664 6.577212e-90 7.074039e-89 14 88.150333
REXO5 -11.126199 -11.126199 -11.126199 0.942958 415.499184 9.722216e-90 1.042858e-88 14 87.981775
HRH3 -11.115619 -11.115619 -11.115619 0.952246 414.882838 1.322155e-89 1.414423e-88 14 87.849421
ARMC4 -11.105176 -11.105176 -11.105176 0.931578 413.075683 3.256548e-89 3.459146e-88 14 87.461031
HRC -11.092823 -11.092823 -11.092823 0.911069 411.104743 8.703784e-89 9.196437e-88 14 87.036380
DHDH -11.091171 -11.091171 -11.091171 0.909455 410.886547 9.704519e-89 1.024480e-87 14 86.989497
RASGRP1 -11.083438 -11.083438 -11.083438 0.917490 410.487358 1.184260e-88 1.247996e-87 14 86.903787
SHISA6 -11.054033 -11.054033 -11.054033 0.888521 406.606404 8.206021e-88 8.513126e-87 14 86.069911
MCTP2 -11.056345 -11.056345 -11.056345 0.872171 406.173908 1.018164e-87 1.055356e-86 14 85.976601
CLIC6 -11.041824 -11.041824 -11.041824 0.879920 405.137714 1.707153e-87 1.758881e-86 14 85.754764
CFP -11.040070 -11.040070 -11.040070 0.846821 403.674026 3.542634e-87 3.634379e-86 14 85.439570
DPP4 -11.026833 -11.026833 -11.026833 0.861772 403.037524 4.866312e-87 4.971093e-86 14 85.303548
CDKL2 -11.028050 -11.028050 -11.028050 0.849949 402.683933 5.804861e-87 5.914749e-86 14 85.228064
HCLS1 -11.023099 -11.023099 -11.023099 0.819798 401.057680 1.306332e-86 1.323199e-85 14 84.878375
HMX2 -11.003578 -11.003578 -11.003578 0.865132 401.044914 1.314676e-86 1.330528e-85 14 84.875976
GRIN1 -11.013697 -11.013697 -11.013697 0.837658 400.880668 1.426908e-86 1.442896e-85 14 84.840765
FBXL2 -11.009009 -11.009009 -11.009009 0.848432 400.874333 1.431424e-86 1.446244e-85 14 84.839758
ALS2CL -11.013210 -11.013210 -11.013210 0.834624 400.717361 1.547995e-86 1.560083e-85 14 84.806852
MICB -11.008766 -11.008766 -11.008766 0.831651 400.192631 2.011084e-86 2.023390e-85 14 84.693920
KRBOX4 -11.008902 -11.008902 -11.008902 0.826818 400.016898 2.195307e-86 2.206890e-85 14 84.656219
GUCA1B -10.986123 -10.986123 -10.986123 0.805985 397.116645 9.326200e-86 9.259089e-85 14 84.033432
AZU1 -10.955768 -10.955768 -10.955768 0.765337 392.777428 8.120450e-85 7.917933e-84 14 83.101388
ADAMTS3 -10.945374 -10.945374 -10.945374 0.789280 392.756470 8.205772e-85 7.994633e-84 14 83.097201
ADORA1 -10.920705 -10.920705 -10.920705 0.773319 389.907541 3.397690e-84 3.249600e-83 14 82.488170
PLA2G4A -10.924555 -10.924555 -10.924555 0.751729 389.418145 4.336939e-84 4.134733e-83 14 82.383553
TMEM249 -10.830970 -10.830970 -10.830970 0.663328 377.661781 1.525539e-81 1.398857e-80 14 79.854227
AL135905.2 -10.811029 -10.811029 -10.811029 0.637471 374.921455 5.982800e-81 5.419739e-80 14 79.266022
CCT6B -10.729469 -10.729469 -10.729469 0.560234 364.884183 8.925023e-79 7.808094e-78 14 77.107455
NPAS4 -10.669784 -10.669784 -10.669784 0.486103 357.006658 4.533972e-77 3.835185e-76 14 75.416214
IL20RB -10.613298 -10.613298 -10.613298 0.439829 350.493393 1.166381e-75 9.622046e-75 14 74.016733
KCNJ14 -10.584749 -10.584749 -10.584749 0.409276 346.975703 6.737820e-75 5.482987e-74 14 73.260983
FUT5 -10.539339 -10.539339 -10.539339 0.366784 341.637204 9.647441e-74 7.673839e-73 14 72.114987
SPINK1 -10.454414 -10.454414 -10.454414 0.276592 331.417874 1.573822e-71 1.194851e-70 14 69.922686

3.0.1 All regulated

Code
all_sign = markers.index.tolist()
allSelected = allGenes_series['0'].isin(all_sign).astype('int').tolist()

4 topGO

4.1 All significant

Code
%%R -i allSelected -i allGenes

allGenes_v <- c(allSelected)
#print(allGenes_v)
names(allGenes_v) <- allGenes
allGenes_v <- unlist(allGenes_v)

geneNames <- c(allGenes)

ann_org_BP <- topGO::annFUN.org(whichOnto='BP', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

ann_org_MF <- topGO::annFUN.org(whichOnto='MF', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

ann_org_CC <- topGO::annFUN.org(whichOnto='CC', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

selection <- function(allScores){return (as.logical(allScores))}
Code
%%R
#print(lapply(ann_org_BP, count_genes))

GOdata <- new("topGOdata",
  ontology="BP",
  allGenes=allGenes_v,
  annot=annFUN.GO2genes,
  GO2genes=ann_org_BP,
  geneSel = selection,
  nodeSize=10)
Code
%%R -o results

results <- runTest(GOdata, algorithm="weight01",statistic="fisher")
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata

genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([10903,    50,    10,  1556], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight=results,
        orderBy="weight", topNodes=len(scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores': scores_py, 'GO.ID': score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID', right_on = 'GO.ID')
results_table_py
GO.ID Term Annotated Significant Expected weight Scores
0 GO:0051956 negative regulation of amino acid transp... 10 3 0.05 1.1e-05 0.000011
1 GO:0014048 regulation of glutamate secretion 13 3 0.06 2.5e-05 0.000025
2 GO:1903792 negative regulation of anion transport 14 3 0.06 3.2e-05 0.000032
3 GO:0032228 regulation of synaptic transmission, GAB... 25 4 0.11 3.7e-05 0.000037
4 GO:0002673 regulation of acute inflammatory respons... 18 3 0.08 7.1e-05 0.000071
... ... ... ... ... ... ... ...
5697 GO:2001251 negative regulation of chromosome organi... 88 0 0.40 1.00000 1.000000
5698 GO:2001252 positive regulation of chromosome organi... 93 0 0.43 1.00000 1.000000
5699 GO:2001258 negative regulation of cation channel ac... 23 0 0.11 1.00000 1.000000
5700 GO:2001259 positive regulation of cation channel ac... 43 0 0.20 1.00000 1.000000
5701 GO:2001267 regulation of cysteine-type endopeptidas... 13 0 0.06 1.00000 1.000000

5702 rows × 7 columns

Code
results_table_py = results_table_py[results_table_py['Scores'] < 0.05]
results_table_py = results_table_py[results_table_py['Annotated'] < 200]
results_table_py = results_table_py[results_table_py['Annotated'] > 15]
Code
results_table_py['-log10(pvalue)'] = - np.log10(results_table_py.Scores)
results_table_py['Significant/Annotated'] = results_table_py['Significant'] / results_table_py['Annotated']
Code
intTable(results_table_py, folder = folder, fileName = 'GO_BP_all.xlsx', save = True)
Code
%%R -i folder
Res <- GenTable(GOdata, weight=results,
        orderBy="weight", topNodes=length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "Statistics")
Res$ER <- Res$Significant / Res$Expected

image = bubbleplot(Res, Ont = 'BP', fillCol = 'forestgreen')
ggsave(file=paste0(folder, "TopGO_results_BP.pdf"), plot=image, width=12, height=4)

bubbleplot(Res, Ont = 'BP', fillCol = 'forestgreen')

Code
%%R -i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

ggsave(file=paste0(folder, "Genes_in_Term_results_BP.pdf"), plot=image, width=12, height=4)

plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

Code
%%R -i markers -i folder
saveGenesInTerm(Res, GOdata, nterms = 20, path = paste0(folder,'GO_BP_genesInTerm_all.xlsx'), SE = markers)
Code
%%R

GOdata <- new("topGOdata",
  ontology="MF",
  allGenes=allGenes_v,
  annot=annFUN.GO2genes,
  GO2genes=ann_org_MF,
  geneSel = selection,
  nodeSize=10)
Code
%%R -o results

results <- runTest(GOdata, algorithm="weight01",statistic="fisher")
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata

genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([11203,    49,    10,   260], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight=results,
        orderBy="weight", topNodes=len(scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores': scores_py, 'GO.ID': score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID', right_on = 'GO.ID')
results_table_py = results_table_py[results_table_py['Scores'] < 0.05]
results_table_py = results_table_py[results_table_py['Annotated'] < 200]
results_table_py = results_table_py[results_table_py['Annotated'] > 15]

intTable(results_table_py, folder = folder, fileName = 'GO_MF_all.xlsx', save = True)
Code
%%R
Res <- GenTable(GOdata, weight=results,
        orderBy="weight", topNodes=length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "Statistics")
Res$ER <- Res$Significant / Res$Expected

image = bubbleplot(Res, Ont = 'MF', fillCol = 'forestgreen')

ggsave(file=paste0(folder, "TopGO_results_MF.pdf"), plot=image, width=12, height=4)

bubbleplot(Res, Ont = 'MF', fillCol = 'forestgreen')

Code
%%R -i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

ggsave(file=paste0(folder, "Genes_in_Term_results_MF.pdf"), plot=image, width=12, height=4)

plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

Code
%%R -i markers -i folder
saveGenesInTerm(Res, GOdata, nterms = 20, path = paste0(folder,'GO_MF_genesInTerm_all.xlsx'), SE = markers)
Code
%%R

GOdata <- new("topGOdata",
  ontology="CC",
  allGenes=allGenes_v,
  annot=annFUN.GO2genes,
  GO2genes=ann_org_CC,
  geneSel = selection,
  nodeSize=10)
Code
%%R -o results

results <- runTest(GOdata, algorithm="weight01",statistic="fisher")
Code
scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata

genesData = ro.r(
'''
geneData(results)
'''
)
genesData
array([11323,    50,    10,   250], dtype=int32)
Code
#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight=results,
        orderBy="weight", topNodes=len(scores))
Code
results_table_py = ro.conversion.rpy2py(results_table)
Code
scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]
Code
scores_df = pd.DataFrame({'Scores': scores_py, 'GO.ID': score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID', right_on = 'GO.ID')
Code
results_table_py = results_table_py[results_table_py['Scores'] < 0.05]
results_table_py = results_table_py[results_table_py['Annotated'] < 200]
results_table_py = results_table_py[results_table_py['Annotated'] > 15]

intTable(results_table_py, folder = folder, fileName = 'GO_CC_all.xlsx', save = True)
Code
%%R
Res <- GenTable(GOdata, weight=results,
        orderBy="weight", topNodes=length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "Statistics")
Res$ER <- Res$Significant / Res$Expected
image = bubbleplot(Res, Ont = 'CC', fillCol = 'forestgreen')

ggsave(file=paste0(folder, "TopGO_results_CC.pdf"), plot=image, width=12, height=4)

bubbleplot(Res, Ont = 'CC', fillCol = 'forestgreen')

Code
%%R -i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=12, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

ggsave(file=paste0(folder, "Genes:_in_Term_results_CC.pdf"), plot=image, width=12, height=4)

plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=12, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

Code
%%R -i markers -i folder
saveGenesInTerm(Res, GOdata, nterms = 20, path = paste0(folder,'GO_CC_genesInTerm_all.xlsx'), SE = markers)

4.1.0.1 Reactome

Code
curated = msigdb[msigdb['collection'].isin(['reactome_pathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]

aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 200].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()
Code
rank = pd.DataFrame(markers['-log10(FDR)'])

rank_copy = rank.copy()
rank_copy['pval'] = markers.loc[rank.index].FDR
Code
rank_copy
-log10(FDR) pval
KRTDAP 117.868802 1.352689e-118
PVALB 104.589069 2.575912e-105
CYGB 102.173146 6.712024e-103
BCHE 100.626669 2.362276e-101
NFIA 100.443754 3.599535e-101
NPY5R 99.905080 1.244287e-100
NPW 99.503620 3.136029e-100
GNMT 99.074114 8.431134e-100
FIBCD1 97.254242 5.568760e-98
CCDC78 96.214288 6.105368e-97
AURKC 95.510715 3.085212e-96
BHLHE41 94.661157 2.181941e-95
SLC16A12 94.210077 6.164862e-95
FAM3B 91.781828 1.652615e-92
SLC30A3 91.327891 4.700121e-92
CA7 90.457096 3.490633e-91
C1QTNF6 89.391846 4.056519e-90
DDX60 89.197837 6.341075e-90
MCOLN2 89.138163 7.275063e-90
KCNQ4 88.895826 1.271084e-89
CST6 88.578269 2.640772e-89
ALOX12 88.150333 7.074039e-89
REXO5 87.981775 1.042858e-88
HRH3 87.849421 1.414423e-88
ARMC4 87.461031 3.459146e-88
HRC 87.036380 9.196437e-88
DHDH 86.989497 1.024480e-87
RASGRP1 86.903787 1.247996e-87
SHISA6 86.069911 8.513126e-87
MCTP2 85.976601 1.055356e-86
CLIC6 85.754764 1.758881e-86
CFP 85.439570 3.634379e-86
DPP4 85.303548 4.971093e-86
CDKL2 85.228064 5.914749e-86
HCLS1 84.878375 1.323199e-85
HMX2 84.875976 1.330528e-85
GRIN1 84.840765 1.442896e-85
FBXL2 84.839758 1.446244e-85
ALS2CL 84.806852 1.560083e-85
MICB 84.693920 2.023390e-85
KRBOX4 84.656219 2.206890e-85
GUCA1B 84.033432 9.259089e-85
AZU1 83.101388 7.917933e-84
ADAMTS3 83.097201 7.994633e-84
ADORA1 82.488170 3.249600e-83
PLA2G4A 82.383553 4.134733e-83
TMEM249 79.854227 1.398857e-80
AL135905.2 79.266022 5.419739e-80
CCT6B 77.107455 7.808094e-78
NPAS4 75.416214 3.835185e-76
IL20RB 74.016733 9.622046e-75
KCNJ14 73.260983 5.482987e-74
FUT5 72.114987 7.673839e-73
SPINK1 69.922686 1.194851e-70
Code
results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
len(results_table_py)
No significant term was found
0
Code
intTable(results_table_py, folder = folder, fileName = 'Reactome_all.xlsx', save = True)
Code
if len(results_table_py) > 0:
    results_table_py = getAnnGenes(results_table_py, GO2gene['reactome_pathways'], rank_copy)
    _, df = plotGenesInTerm(results = results_table_py, GO2gene = GO2gene['reactome_pathways'], DEGs = rank_copy, n_top_terms = 10, cmap = cmap_all)
Code
if len(results_table_py) > 0:
    intTable(df, folder = folder, fileName = 'genesInTerm_Reactome_all.xlsx', save = True)

4.1.0.2 KEGG

Code
curated = msigdb[msigdb['collection'].isin(['kegg_pathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]

aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 200].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()
Code
results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
No significant term was found
Code
intTable(results_table_py, folder = folder, fileName = 'KEGG_all.xlsx', save = True)
Code
if len(results_table_py) > 0:
    results_table_py = getAnnGenes(results_table_py, GO2gene['kegg_pathways'], rank_copy)
    _, df = plotGenesInTerm(results_table_py, GO2gene['kegg_pathways'], rank_copy, n_top_terms = 10, n_top_genes = 15, cmap = cmap_all)
Code
if len(results_table_py) > 0:
    intTable(df, folder = folder, fileName = 'genesInTerm_KEGG_all.xlsx', save = True)