1 Load environment

Code

import warnings

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import pandas as pd
import numpy as np
import random
import itertools

from tqdm import tqdm

import decoupler as dc
import sys

sys.setrecursionlimit(20000)

sys.path.append("./../../../../utilities_folder")
from utilities import load_object, intTable, plotGenesInTerm, getAnnGenes, run_ora_catchErrors

Set R environment with rpy2:

Code

import rpy2.rinterface_lib.callbacks
import anndata2ri
import logging

from rpy2.robjects import pandas2ri
import rpy2.robjects as ro

sc.settings.verbosity = 0
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

pandas2ri.activate()
anndata2ri.activate()

%load_ext rpy2.ipython

Set up of graphical parameters for Python plots:

Code

%matplotlib inline
sc.set_figure_params(dpi = 300, fontsize = 20)

plt.rcParams['svg.fonttype'] = 'none'

cmap_up = sns.light_palette("red", as_cmap=True)
cmap_down = sns.light_palette("blue", as_cmap=True)
cmap_all = sns.light_palette("seagreen", as_cmap=True)

Set up of graphical parameters for R plots:

Code

default_units = 'in' 
default_res = 300
default_width = 10
default_height = 9

import rpy2
old_setup_graphics = rpy2.ipython.rmagic.RMagics.setup_graphics

def new_setup_graphics(self, args):
    if getattr(args, 'units') is not None:
        if args.units != default_units:
            return old_setup_graphics(self, args)
    args.units = default_units
    if getattr(args, 'res') is None:
        args.res = default_res
    if getattr(args, 'width') is None:
        args.width = default_width
    if getattr(args, 'height') is None:
        args.height = default_height        
    return old_setup_graphics(self, args)


rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics

Here the cell were we inject the parameters using Quarto renderer:

Code

# Injected Parameters
N = 3

Code

# Injected Parameters
N = 14

Import R libraries:

Code

%%R
source('./../../../../utilities_folder/GO_helper.r')
loc <- './../../../../R_loc' # pointing to the renv environment

.libPaths(loc)

library('topGO')
library('org.Hs.eg.db')
library(dplyr)
library(ggplot2)

Set output folders:

Code

output_folder = './'
folder = './tables/cluster_' + str(N) + '/'

2 Load data

Here we load the dataframe:

Code

markers = pd.read_excel(folder + 'genes_in_cluster_' + str(N) + '.xlsx', index_col = 0)
markers

	logFC.celltypes_leveledhEGCLC	logFC.celltypes_leveledhPGCLC	logFC.celltypes_levelediMeLC	logCPM	LR	PValue	FDR	clusters
KRTDAP	-12.016842	-12.016842	-12.016842	1.785090	554.411579	7.685864e-120	1.352689e-118	14
PVALB	-11.661893	-11.661893	-11.661893	1.446665	492.726668	1.798396e-106	2.575912e-105	14
CYGB	-11.590367	-11.590367	-11.590367	1.388522	481.503036	4.865001e-104	6.712024e-103	14
BCHE	-11.552081	-11.552081	-11.552081	1.338340	474.309595	1.761424e-102	2.362276e-101	14
NFIA	-11.543503	-11.543503	-11.543503	1.338488	473.452072	2.701975e-102	3.599535e-101	14
NPY5R	-11.522267	-11.522267	-11.522267	1.332806	470.944021	9.443850e-102	1.244287e-100	14
NPW	-11.516360	-11.516360	-11.516360	1.313345	469.078150	2.395850e-101	3.136029e-100	14
GNMT	-11.505465	-11.505465	-11.505465	1.299169	467.082914	6.483327e-101	8.431134e-100	14
FIBCD1	-11.444936	-11.444936	-11.444936	1.242440	458.622018	4.416779e-99	5.568760e-98	14
CCDC78	-11.407576	-11.407576	-11.407576	1.215570	453.788973	4.923766e-98	6.105368e-97	14
AURKC	-11.389224	-11.389224	-11.389224	1.180732	450.515009	2.521530e-97	3.085212e-96	14
BHLHE41	-11.355354	-11.355354	-11.355354	1.165658	446.563530	1.810559e-96	2.181941e-95	14
SLC16A12	-11.336495	-11.336495	-11.336495	1.159348	444.463544	5.161781e-96	6.164862e-95	14
FAM3B	-11.253109	-11.253109	-11.253109	1.081410	433.167816	1.445677e-93	1.652615e-92	14
SLC30A3	-11.240689	-11.240689	-11.240689	1.059236	431.063008	4.131157e-93	4.700121e-92	14
CA7	-11.213118	-11.213118	-11.213118	1.024725	427.018972	3.105887e-92	3.490633e-91	14
C1QTNF6	-11.175776	-11.175776	-11.175776	0.989791	422.055403	3.693889e-91	4.056519e-90	14
DDX60	-11.165692	-11.165692	-11.165692	0.990941	421.148868	5.805916e-91	6.341075e-90	14
MCOLN2	-11.165931	-11.165931	-11.165931	0.983494	420.869763	6.673202e-91	7.275063e-90	14
KCNQ4	-11.157494	-11.157494	-11.157494	0.975465	419.749288	1.166987e-90	1.271084e-89	14
CST6	-11.144914	-11.144914	-11.144914	0.968273	418.274329	2.435504e-90	2.640772e-89	14
ALOX12	-11.132350	-11.132350	-11.132350	0.948100	416.282664	6.577212e-90	7.074039e-89	14
REXO5	-11.126199	-11.126199	-11.126199	0.942958	415.499184	9.722216e-90	1.042858e-88	14
HRH3	-11.115619	-11.115619	-11.115619	0.952246	414.882838	1.322155e-89	1.414423e-88	14
ARMC4	-11.105176	-11.105176	-11.105176	0.931578	413.075683	3.256548e-89	3.459146e-88	14
HRC	-11.092823	-11.092823	-11.092823	0.911069	411.104743	8.703784e-89	9.196437e-88	14
DHDH	-11.091171	-11.091171	-11.091171	0.909455	410.886547	9.704519e-89	1.024480e-87	14
RASGRP1	-11.083438	-11.083438	-11.083438	0.917490	410.487358	1.184260e-88	1.247996e-87	14
SHISA6	-11.054033	-11.054033	-11.054033	0.888521	406.606404	8.206021e-88	8.513126e-87	14
MCTP2	-11.056345	-11.056345	-11.056345	0.872171	406.173908	1.018164e-87	1.055356e-86	14
CLIC6	-11.041824	-11.041824	-11.041824	0.879920	405.137714	1.707153e-87	1.758881e-86	14
CFP	-11.040070	-11.040070	-11.040070	0.846821	403.674026	3.542634e-87	3.634379e-86	14
DPP4	-11.026833	-11.026833	-11.026833	0.861772	403.037524	4.866312e-87	4.971093e-86	14
CDKL2	-11.028050	-11.028050	-11.028050	0.849949	402.683933	5.804861e-87	5.914749e-86	14
HCLS1	-11.023099	-11.023099	-11.023099	0.819798	401.057680	1.306332e-86	1.323199e-85	14
HMX2	-11.003578	-11.003578	-11.003578	0.865132	401.044914	1.314676e-86	1.330528e-85	14
GRIN1	-11.013697	-11.013697	-11.013697	0.837658	400.880668	1.426908e-86	1.442896e-85	14
FBXL2	-11.009009	-11.009009	-11.009009	0.848432	400.874333	1.431424e-86	1.446244e-85	14
ALS2CL	-11.013210	-11.013210	-11.013210	0.834624	400.717361	1.547995e-86	1.560083e-85	14
MICB	-11.008766	-11.008766	-11.008766	0.831651	400.192631	2.011084e-86	2.023390e-85	14
KRBOX4	-11.008902	-11.008902	-11.008902	0.826818	400.016898	2.195307e-86	2.206890e-85	14
GUCA1B	-10.986123	-10.986123	-10.986123	0.805985	397.116645	9.326200e-86	9.259089e-85	14
AZU1	-10.955768	-10.955768	-10.955768	0.765337	392.777428	8.120450e-85	7.917933e-84	14
ADAMTS3	-10.945374	-10.945374	-10.945374	0.789280	392.756470	8.205772e-85	7.994633e-84	14
ADORA1	-10.920705	-10.920705	-10.920705	0.773319	389.907541	3.397690e-84	3.249600e-83	14
PLA2G4A	-10.924555	-10.924555	-10.924555	0.751729	389.418145	4.336939e-84	4.134733e-83	14
TMEM249	-10.830970	-10.830970	-10.830970	0.663328	377.661781	1.525539e-81	1.398857e-80	14
AL135905.2	-10.811029	-10.811029	-10.811029	0.637471	374.921455	5.982800e-81	5.419739e-80	14
CCT6B	-10.729469	-10.729469	-10.729469	0.560234	364.884183	8.925023e-79	7.808094e-78	14
NPAS4	-10.669784	-10.669784	-10.669784	0.486103	357.006658	4.533972e-77	3.835185e-76	14
IL20RB	-10.613298	-10.613298	-10.613298	0.439829	350.493393	1.166381e-75	9.622046e-75	14
KCNJ14	-10.584749	-10.584749	-10.584749	0.409276	346.975703	6.737820e-75	5.482987e-74	14
FUT5	-10.539339	-10.539339	-10.539339	0.366784	341.637204	9.647441e-74	7.673839e-73	14
SPINK1	-10.454414	-10.454414	-10.454414	0.276592	331.417874	1.573822e-71	1.194851e-70	14

Code

allGenes_series = pd.read_csv('./tables/all_bkg_genes.csv')
allGenes = allGenes_series['0'].tolist()

Here we load the dictionary that associates to each GO term its genes:

Code

GO2gene = load_object('./../../../../data/GO2gene_complete.pickle')

3 Markers of cluster

We filter genes for the cluster under investigation based on the p-value adjusted that we then convert in -log(p-value adjusted):

Code


markers = markers[markers.FDR < 0.01]
markers['-log10(FDR)'] = -np.log10(markers.FDR)
markers = markers.replace(np.inf, markers[markers['-log10(FDR)'] != np.inf]['-log10(FDR)'].max())
markers

	logFC.celltypes_leveledhEGCLC	logFC.celltypes_leveledhPGCLC	logFC.celltypes_levelediMeLC	logCPM	LR	PValue	FDR	clusters	-log10(FDR)
KRTDAP	-12.016842	-12.016842	-12.016842	1.785090	554.411579	7.685864e-120	1.352689e-118	14	117.868802
PVALB	-11.661893	-11.661893	-11.661893	1.446665	492.726668	1.798396e-106	2.575912e-105	14	104.589069
CYGB	-11.590367	-11.590367	-11.590367	1.388522	481.503036	4.865001e-104	6.712024e-103	14	102.173146
BCHE	-11.552081	-11.552081	-11.552081	1.338340	474.309595	1.761424e-102	2.362276e-101	14	100.626669
NFIA	-11.543503	-11.543503	-11.543503	1.338488	473.452072	2.701975e-102	3.599535e-101	14	100.443754
NPY5R	-11.522267	-11.522267	-11.522267	1.332806	470.944021	9.443850e-102	1.244287e-100	14	99.905080
NPW	-11.516360	-11.516360	-11.516360	1.313345	469.078150	2.395850e-101	3.136029e-100	14	99.503620
GNMT	-11.505465	-11.505465	-11.505465	1.299169	467.082914	6.483327e-101	8.431134e-100	14	99.074114
FIBCD1	-11.444936	-11.444936	-11.444936	1.242440	458.622018	4.416779e-99	5.568760e-98	14	97.254242
CCDC78	-11.407576	-11.407576	-11.407576	1.215570	453.788973	4.923766e-98	6.105368e-97	14	96.214288
AURKC	-11.389224	-11.389224	-11.389224	1.180732	450.515009	2.521530e-97	3.085212e-96	14	95.510715
BHLHE41	-11.355354	-11.355354	-11.355354	1.165658	446.563530	1.810559e-96	2.181941e-95	14	94.661157
SLC16A12	-11.336495	-11.336495	-11.336495	1.159348	444.463544	5.161781e-96	6.164862e-95	14	94.210077
FAM3B	-11.253109	-11.253109	-11.253109	1.081410	433.167816	1.445677e-93	1.652615e-92	14	91.781828
SLC30A3	-11.240689	-11.240689	-11.240689	1.059236	431.063008	4.131157e-93	4.700121e-92	14	91.327891
CA7	-11.213118	-11.213118	-11.213118	1.024725	427.018972	3.105887e-92	3.490633e-91	14	90.457096
C1QTNF6	-11.175776	-11.175776	-11.175776	0.989791	422.055403	3.693889e-91	4.056519e-90	14	89.391846
DDX60	-11.165692	-11.165692	-11.165692	0.990941	421.148868	5.805916e-91	6.341075e-90	14	89.197837
MCOLN2	-11.165931	-11.165931	-11.165931	0.983494	420.869763	6.673202e-91	7.275063e-90	14	89.138163
KCNQ4	-11.157494	-11.157494	-11.157494	0.975465	419.749288	1.166987e-90	1.271084e-89	14	88.895826
CST6	-11.144914	-11.144914	-11.144914	0.968273	418.274329	2.435504e-90	2.640772e-89	14	88.578269
ALOX12	-11.132350	-11.132350	-11.132350	0.948100	416.282664	6.577212e-90	7.074039e-89	14	88.150333
REXO5	-11.126199	-11.126199	-11.126199	0.942958	415.499184	9.722216e-90	1.042858e-88	14	87.981775
HRH3	-11.115619	-11.115619	-11.115619	0.952246	414.882838	1.322155e-89	1.414423e-88	14	87.849421
ARMC4	-11.105176	-11.105176	-11.105176	0.931578	413.075683	3.256548e-89	3.459146e-88	14	87.461031
HRC	-11.092823	-11.092823	-11.092823	0.911069	411.104743	8.703784e-89	9.196437e-88	14	87.036380
DHDH	-11.091171	-11.091171	-11.091171	0.909455	410.886547	9.704519e-89	1.024480e-87	14	86.989497
RASGRP1	-11.083438	-11.083438	-11.083438	0.917490	410.487358	1.184260e-88	1.247996e-87	14	86.903787
SHISA6	-11.054033	-11.054033	-11.054033	0.888521	406.606404	8.206021e-88	8.513126e-87	14	86.069911
MCTP2	-11.056345	-11.056345	-11.056345	0.872171	406.173908	1.018164e-87	1.055356e-86	14	85.976601
CLIC6	-11.041824	-11.041824	-11.041824	0.879920	405.137714	1.707153e-87	1.758881e-86	14	85.754764
CFP	-11.040070	-11.040070	-11.040070	0.846821	403.674026	3.542634e-87	3.634379e-86	14	85.439570
DPP4	-11.026833	-11.026833	-11.026833	0.861772	403.037524	4.866312e-87	4.971093e-86	14	85.303548
CDKL2	-11.028050	-11.028050	-11.028050	0.849949	402.683933	5.804861e-87	5.914749e-86	14	85.228064
HCLS1	-11.023099	-11.023099	-11.023099	0.819798	401.057680	1.306332e-86	1.323199e-85	14	84.878375
HMX2	-11.003578	-11.003578	-11.003578	0.865132	401.044914	1.314676e-86	1.330528e-85	14	84.875976
GRIN1	-11.013697	-11.013697	-11.013697	0.837658	400.880668	1.426908e-86	1.442896e-85	14	84.840765
FBXL2	-11.009009	-11.009009	-11.009009	0.848432	400.874333	1.431424e-86	1.446244e-85	14	84.839758
ALS2CL	-11.013210	-11.013210	-11.013210	0.834624	400.717361	1.547995e-86	1.560083e-85	14	84.806852
MICB	-11.008766	-11.008766	-11.008766	0.831651	400.192631	2.011084e-86	2.023390e-85	14	84.693920
KRBOX4	-11.008902	-11.008902	-11.008902	0.826818	400.016898	2.195307e-86	2.206890e-85	14	84.656219
GUCA1B	-10.986123	-10.986123	-10.986123	0.805985	397.116645	9.326200e-86	9.259089e-85	14	84.033432
AZU1	-10.955768	-10.955768	-10.955768	0.765337	392.777428	8.120450e-85	7.917933e-84	14	83.101388
ADAMTS3	-10.945374	-10.945374	-10.945374	0.789280	392.756470	8.205772e-85	7.994633e-84	14	83.097201
ADORA1	-10.920705	-10.920705	-10.920705	0.773319	389.907541	3.397690e-84	3.249600e-83	14	82.488170
PLA2G4A	-10.924555	-10.924555	-10.924555	0.751729	389.418145	4.336939e-84	4.134733e-83	14	82.383553
TMEM249	-10.830970	-10.830970	-10.830970	0.663328	377.661781	1.525539e-81	1.398857e-80	14	79.854227
AL135905.2	-10.811029	-10.811029	-10.811029	0.637471	374.921455	5.982800e-81	5.419739e-80	14	79.266022
CCT6B	-10.729469	-10.729469	-10.729469	0.560234	364.884183	8.925023e-79	7.808094e-78	14	77.107455
NPAS4	-10.669784	-10.669784	-10.669784	0.486103	357.006658	4.533972e-77	3.835185e-76	14	75.416214
IL20RB	-10.613298	-10.613298	-10.613298	0.439829	350.493393	1.166381e-75	9.622046e-75	14	74.016733
KCNJ14	-10.584749	-10.584749	-10.584749	0.409276	346.975703	6.737820e-75	5.482987e-74	14	73.260983
FUT5	-10.539339	-10.539339	-10.539339	0.366784	341.637204	9.647441e-74	7.673839e-73	14	72.114987
SPINK1	-10.454414	-10.454414	-10.454414	0.276592	331.417874	1.573822e-71	1.194851e-70	14	69.922686

3.0.1 All regulated

Code

all_sign = markers.index.tolist()
allSelected = allGenes_series['0'].isin(all_sign).astype('int').tolist()

4 topGO

4.1 All significant

Code

%%R -i allSelected -i allGenes

allGenes_v <- c(allSelected)
#print(allGenes_v)
names(allGenes_v) <- allGenes
allGenes_v <- unlist(allGenes_v)

geneNames <- c(allGenes)

ann_org_BP <- topGO::annFUN.org(whichOnto='BP', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

ann_org_MF <- topGO::annFUN.org(whichOnto='MF', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

ann_org_CC <- topGO::annFUN.org(whichOnto='CC', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

selection <- function(allScores){return (as.logical(allScores))}

Code

%%R
#print(lapply(ann_org_BP, count_genes))

GOdata <- new("topGOdata",
  ontology="BP",
  allGenes=allGenes_v,
  annot=annFUN.GO2genes,
  GO2genes=ann_org_BP,
  geneSel = selection,
  nodeSize=10)

Code

%%R -o results

results <- runTest(GOdata, algorithm="weight01",statistic="fisher")

Code

scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata

genesData = ro.r(
'''
geneData(results)
'''
)
genesData

array([10903,    50,    10,  1556], dtype=int32)

Code

#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight=results,
        orderBy="weight", topNodes=len(scores))

Code

results_table_py = ro.conversion.rpy2py(results_table)

Code

scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]

Code

scores_df = pd.DataFrame({'Scores': scores_py, 'GO.ID': score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID', right_on = 'GO.ID')
results_table_py

	GO.ID	Term	Annotated	Significant	Expected	weight	Scores
0	GO:0051956	negative regulation of amino acid transp...	10	3	0.05	1.1e-05	0.000011
1	GO:0014048	regulation of glutamate secretion	13	3	0.06	2.5e-05	0.000025
2	GO:1903792	negative regulation of anion transport	14	3	0.06	3.2e-05	0.000032
3	GO:0032228	regulation of synaptic transmission, GAB...	25	4	0.11	3.7e-05	0.000037
4	GO:0002673	regulation of acute inflammatory respons...	18	3	0.08	7.1e-05	0.000071
...	...	...	...	...	...	...	...
5697	GO:2001251	negative regulation of chromosome organi...	88	0	0.40	1.00000	1.000000
5698	GO:2001252	positive regulation of chromosome organi...	93	0	0.43	1.00000	1.000000
5699	GO:2001258	negative regulation of cation channel ac...	23	0	0.11	1.00000	1.000000
5700	GO:2001259	positive regulation of cation channel ac...	43	0	0.20	1.00000	1.000000
5701	GO:2001267	regulation of cysteine-type endopeptidas...	13	0	0.06	1.00000	1.000000

5702 rows × 7 columns

Code

results_table_py = results_table_py[results_table_py['Scores'] < 0.05]
results_table_py = results_table_py[results_table_py['Annotated'] < 200]
results_table_py = results_table_py[results_table_py['Annotated'] > 15]

Code

results_table_py['-log10(pvalue)'] = - np.log10(results_table_py.Scores)
results_table_py['Significant/Annotated'] = results_table_py['Significant'] / results_table_py['Annotated']

Code

intTable(results_table_py, folder = folder, fileName = 'GO_BP_all.xlsx', save = True)

Code

%%R -i folder
Res <- GenTable(GOdata, weight=results,
        orderBy="weight", topNodes=length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "Statistics")
Res$ER <- Res$Significant / Res$Expected

image = bubbleplot(Res, Ont = 'BP', fillCol = 'forestgreen')
ggsave(file=paste0(folder, "TopGO_results_BP.pdf"), plot=image, width=12, height=4)

bubbleplot(Res, Ont = 'BP', fillCol = 'forestgreen')

Code

%%R -i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

ggsave(file=paste0(folder, "Genes_in_Term_results_BP.pdf"), plot=image, width=12, height=4)

plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

Code

%%R -i markers -i folder
saveGenesInTerm(Res, GOdata, nterms = 20, path = paste0(folder,'GO_BP_genesInTerm_all.xlsx'), SE = markers)

Code

%%R

GOdata <- new("topGOdata",
  ontology="MF",
  allGenes=allGenes_v,
  annot=annFUN.GO2genes,
  GO2genes=ann_org_MF,
  geneSel = selection,
  nodeSize=10)

Code

%%R -o results

results <- runTest(GOdata, algorithm="weight01",statistic="fisher")

Code

scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata

genesData = ro.r(
'''
geneData(results)
'''
)
genesData

array([11203,    49,    10,   260], dtype=int32)

Code

#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight=results,
        orderBy="weight", topNodes=len(scores))

Code

results_table_py = ro.conversion.rpy2py(results_table)

Code

scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]

Code

scores_df = pd.DataFrame({'Scores': scores_py, 'GO.ID': score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID', right_on = 'GO.ID')
results_table_py = results_table_py[results_table_py['Scores'] < 0.05]
results_table_py = results_table_py[results_table_py['Annotated'] < 200]
results_table_py = results_table_py[results_table_py['Annotated'] > 15]

intTable(results_table_py, folder = folder, fileName = 'GO_MF_all.xlsx', save = True)

Code

%%R
Res <- GenTable(GOdata, weight=results,
        orderBy="weight", topNodes=length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "Statistics")
Res$ER <- Res$Significant / Res$Expected

image = bubbleplot(Res, Ont = 'MF', fillCol = 'forestgreen')

ggsave(file=paste0(folder, "TopGO_results_MF.pdf"), plot=image, width=12, height=4)

bubbleplot(Res, Ont = 'MF', fillCol = 'forestgreen')

Code

%%R -i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

ggsave(file=paste0(folder, "Genes_in_Term_results_MF.pdf"), plot=image, width=12, height=4)

plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=15, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

Code

%%R -i markers -i folder
saveGenesInTerm(Res, GOdata, nterms = 20, path = paste0(folder,'GO_MF_genesInTerm_all.xlsx'), SE = markers)

Code

%%R

GOdata <- new("topGOdata",
  ontology="CC",
  allGenes=allGenes_v,
  annot=annFUN.GO2genes,
  GO2genes=ann_org_CC,
  geneSel = selection,
  nodeSize=10)

Code

%%R -o results

results <- runTest(GOdata, algorithm="weight01",statistic="fisher")

Code

scores = ro.r.score(results)
score_names = ro.r(
'''
names(results@score)
'''
)
go_data = ro.r.GOdata

genesData = ro.r(
'''
geneData(results)
'''
)
genesData

array([11323,    50,    10,   250], dtype=int32)

Code

#num_summarize = min(100, len(score_names))
results_table = ro.r.GenTable(go_data, weight=results,
        orderBy="weight", topNodes=len(scores))

Code

results_table_py = ro.conversion.rpy2py(results_table)

Code

scores_py = ro.conversion.rpy2py(scores)
score_names = [i for i in score_names]

Code

scores_df = pd.DataFrame({'Scores': scores_py, 'GO.ID': score_names})
results_table_py = results_table_py.merge(scores_df, left_on = 'GO.ID', right_on = 'GO.ID')

Code

results_table_py = results_table_py[results_table_py['Scores'] < 0.05]
results_table_py = results_table_py[results_table_py['Annotated'] < 200]
results_table_py = results_table_py[results_table_py['Annotated'] > 15]

intTable(results_table_py, folder = folder, fileName = 'GO_CC_all.xlsx', save = True)

Code

%%R
Res <- GenTable(GOdata, weight=results,
        orderBy="weight", topNodes=length(score(results)))
#print(Res[0:10,])
colnames(Res) <- c("GO.ID", "Term", "Annotated", "Significant", "Expected", "Statistics")
Res$ER <- Res$Significant / Res$Expected
image = bubbleplot(Res, Ont = 'CC', fillCol = 'forestgreen')

ggsave(file=paste0(folder, "TopGO_results_CC.pdf"), plot=image, width=12, height=4)

bubbleplot(Res, Ont = 'CC', fillCol = 'forestgreen')

Code

%%R -i markers
image = plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=12, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

ggsave(file=paste0(folder, "Genes:_in_Term_results_CC.pdf"), plot=image, width=12, height=4)

plotGenesInTerm_v1(Res, GOdata, SE = markers, nterms=12, ngenes=12,
                             fillCol='forestgreen', log = TRUE)

Code

%%R -i markers -i folder
saveGenesInTerm(Res, GOdata, nterms = 20, path = paste0(folder,'GO_CC_genesInTerm_all.xlsx'), SE = markers)

4.1.0.1 Reactome

Code

curated = msigdb[msigdb['collection'].isin(['reactome_pathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]

aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 200].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()

Code

rank = pd.DataFrame(markers['-log10(FDR)'])

rank_copy = rank.copy()
rank_copy['pval'] = markers.loc[rank.index].FDR

Code

rank_copy

	-log10(FDR)	pval
KRTDAP	117.868802	1.352689e-118
PVALB	104.589069	2.575912e-105
CYGB	102.173146	6.712024e-103
BCHE	100.626669	2.362276e-101
NFIA	100.443754	3.599535e-101
NPY5R	99.905080	1.244287e-100
NPW	99.503620	3.136029e-100
GNMT	99.074114	8.431134e-100
FIBCD1	97.254242	5.568760e-98
CCDC78	96.214288	6.105368e-97
AURKC	95.510715	3.085212e-96
BHLHE41	94.661157	2.181941e-95
SLC16A12	94.210077	6.164862e-95
FAM3B	91.781828	1.652615e-92
SLC30A3	91.327891	4.700121e-92
CA7	90.457096	3.490633e-91
C1QTNF6	89.391846	4.056519e-90
DDX60	89.197837	6.341075e-90
MCOLN2	89.138163	7.275063e-90
KCNQ4	88.895826	1.271084e-89
CST6	88.578269	2.640772e-89
ALOX12	88.150333	7.074039e-89
REXO5	87.981775	1.042858e-88
HRH3	87.849421	1.414423e-88
ARMC4	87.461031	3.459146e-88
HRC	87.036380	9.196437e-88
DHDH	86.989497	1.024480e-87
RASGRP1	86.903787	1.247996e-87
SHISA6	86.069911	8.513126e-87
MCTP2	85.976601	1.055356e-86
CLIC6	85.754764	1.758881e-86
CFP	85.439570	3.634379e-86
DPP4	85.303548	4.971093e-86
CDKL2	85.228064	5.914749e-86
HCLS1	84.878375	1.323199e-85
HMX2	84.875976	1.330528e-85
GRIN1	84.840765	1.442896e-85
FBXL2	84.839758	1.446244e-85
ALS2CL	84.806852	1.560083e-85
MICB	84.693920	2.023390e-85
KRBOX4	84.656219	2.206890e-85
GUCA1B	84.033432	9.259089e-85
AZU1	83.101388	7.917933e-84
ADAMTS3	83.097201	7.994633e-84
ADORA1	82.488170	3.249600e-83
PLA2G4A	82.383553	4.134733e-83
TMEM249	79.854227	1.398857e-80
AL135905.2	79.266022	5.419739e-80
CCT6B	77.107455	7.808094e-78
NPAS4	75.416214	3.835185e-76
IL20RB	74.016733	9.622046e-75
KCNJ14	73.260983	5.482987e-74
FUT5	72.114987	7.673839e-73
SPINK1	69.922686	1.194851e-70

Code

results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
len(results_table_py)

No significant term was found

Code

intTable(results_table_py, folder = folder, fileName = 'Reactome_all.xlsx', save = True)

Code

if len(results_table_py) > 0:
    results_table_py = getAnnGenes(results_table_py, GO2gene['reactome_pathways'], rank_copy)
    _, df = plotGenesInTerm(results = results_table_py, GO2gene = GO2gene['reactome_pathways'], DEGs = rank_copy, n_top_terms = 10, cmap = cmap_all)

Code

if len(results_table_py) > 0:
    intTable(df, folder = folder, fileName = 'genesInTerm_Reactome_all.xlsx', save = True)

4.1.0.2 KEGG

Code

curated = msigdb[msigdb['collection'].isin(['kegg_pathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]

aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 200].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()

Code

results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)

No significant term was found

Code

intTable(results_table_py, folder = folder, fileName = 'KEGG_all.xlsx', save = True)

Code

if len(results_table_py) > 0:
    results_table_py = getAnnGenes(results_table_py, GO2gene['kegg_pathways'], rank_copy)
    _, df = plotGenesInTerm(results_table_py, GO2gene['kegg_pathways'], rank_copy, n_top_terms = 10, n_top_genes = 15, cmap = cmap_all)

Code

if len(results_table_py) > 0:
    intTable(df, folder = folder, fileName = 'genesInTerm_KEGG_all.xlsx', save = True)