import warnings

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import pandas as pd
import numpy as np
import random
import itertools

from tqdm import tqdm

import decoupler as dc
import sys

sys.path.append("./../../../../utilities_folder/")
from utilities import save_object, load_object, intTable, plotGenesInTerm, run_ora_catchErrors, getAnnGenes, filter_by_expr


import rpy2.rinterface_lib.callbacks
import anndata2ri
import logging

from rpy2.robjects import pandas2ri
import rpy2.robjects as ro
from scipy import stats

sc.settings.verbosity = 0
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

pandas2ri.activate()
anndata2ri.activate()

%load_ext rpy2.ipython


%matplotlib inline
sc.set_figure_params(dpi = 600, fontsize = 20)

plt.rcParams['pdf.fonttype'] = 'truetype'

cmap_up = sns.light_palette("red", as_cmap=True)
cmap_down = sns.light_palette("blue", as_cmap=True)
cmap_all = sns.light_palette("seagreen", as_cmap=True)

color_palette = {
    'iMeLC': '#377eb8',
    'hEGCLC': '#ff7f00',
    'hPGCLC': '#4daf4a',
    'hiPSC': '#dd1c77',
    'HEP': '#a65628',
    'MLC': '#f781bf',
    'EndLC': '#999999',
     'AmLC': '#80b1d3',
}


%%R
source('./../../../../utilities_folder/GO_helper.r')
loc <- './../../../../R_loc' # pointing to the renv environment

.libPaths(loc)

library('topGO')
library('edgeR')
library('org.Hs.eg.db')
library(dplyr)
library(ggplot2)


edgeR_topTags = ro.r['topTags']
edgeR_glmLRT = ro.r['glmLRT']
edgeR_glmQLFTest = ro.r['glmQLFTest']
as_data_frame = ro.r['as.data.frame']
rprint = rpy2.robjects.globalenv.find("print")


#| echo: false

default_units = 'in' 
default_res = 300 
default_width = 10
default_height = 9

import rpy2
old_setup_graphics = rpy2.ipython.rmagic.RMagics.setup_graphics

def new_setup_graphics(self, args):
    if getattr(args, 'units') is not None:
        if args.units != default_units: # a different units argument was passed, do not apply defaults
            return old_setup_graphics(self, args)
    args.units = default_units
    if getattr(args, 'res') is None:
        args.res = default_res
    if getattr(args, 'width') is None:
        args.width = default_width
    if getattr(args, 'height') is None:
        args.height = default_height        
    return old_setup_graphics(self, args)

rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics


tables_folder = './tables/'


adata = sc.read("./../01_Annotation/2_All_WithCellCycle.h5ad")
adata

AnnData object with n_obs × n_vars = 27925 × 14582
    obs: 'sample_id', 'run_id', 'probe_barcode_ids', 'subject', 'line', 'specimen', 'stage', 'condition', 'notes', 'seqRun', 'original_name', 'project', 'who', 'SC_derivation', 'micoplasma', 'mosaic', 'genSite', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'gene_UMI_ratio', 'log1p_gene_UMI_ratio', 'scDblFinder_score', 'scDblFinder_class', 'n_counts', 'n_genes', 'leiden', 'score_pluripotency', 'score_mesoderm', 'score_PS', 'score_EM', 'score_AM', 'score_Endo', 'score_Epi', 'score_HEP', 'score_NM', 'score_ExM', 'score_Ery', 'score_EAE', 'score_Amnion_LC', 'score_AxM', 'score_Amnion', 'score_NNE', 'score_PGC', 'score_hPGCLCs', 'score_DE1', 'score_DE2', 'score_Hypoblast', 'score_YS', 'score_EMPs', 'score_Endothelium', 'score_Myeloid_Progenitors', 'score_Megakaryocyte_Erythroid_progenitors', 'CellTypes', 'S_score', 'G2M_score', 'phase'
    var: 'n_cells', 'mito', 'ribo', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection', 'triku_distance', 'triku_distance_uncorrected', 'triku_highly_variable'
    uns: 'CellTypes_colors', 'dendrogram_CellTypes', 'dendrogram_leiden', 'dendrogram_specimen', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'phase_colors', 'rank_genes_groups', 'run_id_colors', 'sample_id_colors', 'specimen_colors', 'stage_colors', 'triku_params', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    varm: 'PCs'
    layers: 'raw'
    obsp: 'connectivities', 'distances'


GO2gene = load_object('./../../../../data/GO2gene.pickle')


adata_sub = adata[adata.obs.CellTypes.isin(['hEGCLC', 'hiPSC'])].copy()
adata_sub

AnnData object with n_obs × n_vars = 14633 × 14582
    obs: 'sample_id', 'run_id', 'probe_barcode_ids', 'subject', 'line', 'specimen', 'stage', 'condition', 'notes', 'seqRun', 'original_name', 'project', 'who', 'SC_derivation', 'micoplasma', 'mosaic', 'genSite', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'gene_UMI_ratio', 'log1p_gene_UMI_ratio', 'scDblFinder_score', 'scDblFinder_class', 'n_counts', 'n_genes', 'leiden', 'score_pluripotency', 'score_mesoderm', 'score_PS', 'score_EM', 'score_AM', 'score_Endo', 'score_Epi', 'score_HEP', 'score_NM', 'score_ExM', 'score_Ery', 'score_EAE', 'score_Amnion_LC', 'score_AxM', 'score_Amnion', 'score_NNE', 'score_PGC', 'score_hPGCLCs', 'score_DE1', 'score_DE2', 'score_Hypoblast', 'score_YS', 'score_EMPs', 'score_Endothelium', 'score_Myeloid_Progenitors', 'score_Megakaryocyte_Erythroid_progenitors', 'CellTypes', 'S_score', 'G2M_score', 'phase'
    var: 'n_cells', 'mito', 'ribo', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection', 'triku_distance', 'triku_distance_uncorrected', 'triku_highly_variable'
    uns: 'CellTypes_colors', 'dendrogram_CellTypes', 'dendrogram_leiden', 'dendrogram_specimen', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'phase_colors', 'rank_genes_groups', 'run_id_colors', 'sample_id_colors', 'specimen_colors', 'stage_colors', 'triku_params', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    varm: 'PCs'
    layers: 'raw'
    obsp: 'connectivities', 'distances'


sc.pp.filter_genes(adata_sub, min_cells = 100)
adata_sub

AnnData object with n_obs × n_vars = 14633 × 13921
    obs: 'sample_id', 'run_id', 'probe_barcode_ids', 'subject', 'line', 'specimen', 'stage', 'condition', 'notes', 'seqRun', 'original_name', 'project', 'who', 'SC_derivation', 'micoplasma', 'mosaic', 'genSite', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'gene_UMI_ratio', 'log1p_gene_UMI_ratio', 'scDblFinder_score', 'scDblFinder_class', 'n_counts', 'n_genes', 'leiden', 'score_pluripotency', 'score_mesoderm', 'score_PS', 'score_EM', 'score_AM', 'score_Endo', 'score_Epi', 'score_HEP', 'score_NM', 'score_ExM', 'score_Ery', 'score_EAE', 'score_Amnion_LC', 'score_AxM', 'score_Amnion', 'score_NNE', 'score_PGC', 'score_hPGCLCs', 'score_DE1', 'score_DE2', 'score_Hypoblast', 'score_YS', 'score_EMPs', 'score_Endothelium', 'score_Myeloid_Progenitors', 'score_Megakaryocyte_Erythroid_progenitors', 'CellTypes', 'S_score', 'G2M_score', 'phase'
    var: 'n_cells', 'mito', 'ribo', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection', 'triku_distance', 'triku_distance_uncorrected', 'triku_highly_variable'
    uns: 'CellTypes_colors', 'dendrogram_CellTypes', 'dendrogram_leiden', 'dendrogram_specimen', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'phase_colors', 'rank_genes_groups', 'run_id_colors', 'sample_id_colors', 'specimen_colors', 'stage_colors', 'triku_params', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    varm: 'PCs'
    layers: 'raw'
    obsp: 'connectivities', 'distances'


adata_bulk = dc.get_pseudobulk(adata_sub, sample_col='sample_id', groups_col='CellTypes', layer='raw', min_prop=0, min_smpls = 2)
adata_bulk.layers['raw'] = adata_bulk.X.copy()
adata_bulk

AnnData object with n_obs × n_vars = 5 × 13921
    obs: 'sample_id', 'run_id', 'probe_barcode_ids', 'subject', 'line', 'specimen', 'stage', 'condition', 'notes', 'seqRun', 'original_name', 'project', 'who', 'SC_derivation', 'micoplasma', 'mosaic', 'genSite', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'scDblFinder_class', 'CellTypes'
    layers: 'raw'


adata_bulk.obs['test'] = np.where(adata_bulk.obs.CellTypes.isin(['hiPSC', 'hEGCLC']), 1, 0) 
adata_bulk = adata_bulk[adata_bulk.obs.test == 1]

genes_to_keep = filter_by_expr(adata_bulk, group = 'CellTypes', min_count = 10, min_total_count = 20)
adata_bulk = adata_bulk[:, genes_to_keep]

sc.pp.normalize_total(adata_bulk, target_sum = 1e6)
sc.pp.log1p(adata_bulk)

obs = adata_bulk.obs
adata_bulk

AnnData object with n_obs × n_vars = 5 × 13921
    obs: 'sample_id', 'run_id', 'probe_barcode_ids', 'subject', 'line', 'specimen', 'stage', 'condition', 'notes', 'seqRun', 'original_name', 'project', 'who', 'SC_derivation', 'micoplasma', 'mosaic', 'genSite', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'scDblFinder_class', 'CellTypes', 'test'
    uns: 'log1p'
    layers: 'raw'


del adata_bulk.obs["notes"]
del adata_bulk.obs["mosaic"]
del adata_bulk.obs["total_counts_ribo"]
del adata_bulk.obs["log1p_total_counts_ribo"]
del adata_bulk.obs["pct_counts_ribo"]


adata_bulk.write('3_Pseudobulk_hiPSC_EG.h5ad')


len(genes_to_keep)

13921


adata_bulk.obs.CellTypes

EG2_hEGCLC          hEGCLC
EG31_hEGCLC         hEGCLC
EG9_hEGCLC          hEGCLC
IEO_13_A_R_hiPSC     hiPSC
ht15_FIX_hiPSC       hiPSC
Name: CellTypes, dtype: category
Categories (2, object): ['hEGCLC', 'hiPSC']


total = pd.DataFrame(adata_bulk.layers['raw'].T, columns = adata_bulk.obs_names, index = adata_bulk.var_names)

total.index = adata_bulk.var_names
total.columns = adata_bulk.obs_names
total


ct_map = {'EG2':'hEGCLC', 
 'EG31':'hEGCLC', 
 'EG9':'hEGCLC', 
 'R':'hiPSC', 
 'FIX':'hiPSC'}


celltypes = total.columns.map(lambda x: x.split('_')[-1])
celltypes

Index(['hEGCLC', 'hEGCLC', 'hEGCLC', 'hiPSC', 'hiPSC'], dtype='object')


obs['CellTypes'] = obs['CellTypes'].astype('str')


for col in obs.columns:
    if obs[col].dtype == 'object':
        obs[col] = obs[col].astype('str')


sns.heatmap(total.corr(method = 'spearman'), vmin = 0, vmax = 1)

<AxesSubplot:>


%%R -i celltypes -i total

celltypes <- relevel(as.factor(celltypes), ref="hiPSC")
print(celltypes)
print(total[0:10, ])

mm <- model.matrix(~ celltypes, data = total)
colnames(mm)

[1] hEGCLC hEGCLC hEGCLC hiPSC  hiPSC 
Levels: hiPSC hEGCLC
        EG2_hEGCLC EG31_hEGCLC EG9_hEGCLC IEO_13_A_R_hiPSC ht15_FIX_hiPSC
A2ML1          153         206        122              142            139
A4GALT        1347        1298       1009              991           1108
AAAS          5013        4689       5185             7215           9254
AACS          5741        4033       6260             5337           5726
AADACL3        175         128        205               35             16
AADAT          772         563        847              767           1127
AAGAB         6122        4680       6805             5934           7537
AAK1          2643        2105       3045             1678           2407
AAMDC         3207        3509       2640             3747           4253
AAMP         17708       14792      17470            14798          22379
[1] "(Intercept)"     "celltypeshEGCLC"


%%R

ncol(mm)

[1] 2


%%R -i total -o dds

row.names(mm) <- colnames(total)
dds <- calcNormFactors(DGEList(total))
dds <- estimateDisp(dds, mm)
dds <- glmFit(dds, mm)


%%R
names(dds)

 [1] "coefficients"          "fitted.values"         "deviance"             
 [4] "method"                "counts"                "unshrunk.coefficients"
 [7] "df.residual"           "design"                "offset"               
[10] "dispersion"            "prior.count"           "samples"              
[13] "prior.df"              "AveLogCPM"


n_covariates = 2


%%R -o covar_names

covar_names <- colnames(mm)


table_list = []

for i in range(n_covariates):
    
    deg_table = edgeR_topTags(edgeR_glmLRT(dds, coef=i+1), len(total))
    deg_table = as_data_frame(deg_table)
    #deg_table = rpy2.robjects.pandas2ri.rpy2py(deg_table)
    table_list.append(deg_table)
    
    n_significant = len(deg_table[(deg_table.FDR < 0.01)])
    print(covar_names[i], n_significant)

(Intercept) 13921
celltypeshEGCLC 890


deg_table['Gene'] = deg_table.index
deg_table['FDR'] = deg_table['FDR'].replace(0.0, deg_table['FDR'][deg_table['FDR'] != 0.0].min())
deg_table['-log10(FDR)'] = -np.log10(deg_table.FDR)


intTable(deg_table, folder = tables_folder, fileName = 'Bulk_hEGCLCs_vs_hiPSC_total.xlsx', save = True)


allGenes = deg_table.index.tolist()
AllFDRScores = deg_table.FDR.tolist()


deg_table_filtered = deg_table[deg_table['FDR'] < 0.01]
deg_table_filtered = deg_table_filtered[np.abs(deg_table_filtered['logFC']) > 2]
len(deg_table_filtered)

116


#sns.color_palette(adata_bulk.uns['CellTypes_colors'])

adata_bulk.uns['CellTypes_colors'] = []
adata_bulk.uns['CellTypes_colors'].append(color_palette['hEGCLC'])
adata_bulk.uns['CellTypes_colors'].append(color_palette['hiPSC'])


sc.pl.heatmap(adata_bulk, 
              var_names = deg_table_filtered[deg_table_filtered.logFC > 0].index,
              groupby = 'CellTypes', 
              cmap = 'PuBu', 
              swap_axes = True, 
              figsize = (5,5), standard_scale = 'var', save='_DEGs_up_hEGCL.pdf')


sc.pl.heatmap(adata_bulk, 
              var_names = deg_table_filtered[deg_table_filtered.logFC < -2].index,
              groupby = 'CellTypes', 
              cmap = 'PuBu', 
              swap_axes = True, 
              figsize = (5,10), standard_scale = 'var', save='_DEGs_up_hiPSC_vs_hEGCL.pdf')


up_reg = deg_table_filtered[deg_table_filtered.logFC > 2]
up_regGenes = up_reg.index.tolist()


print(f'The number of upregulated genes is {len(up_regGenes)}')

The number of upregulated genes is 82


allUpSelected = deg_table.index.isin(up_regGenes).astype('int').tolist()


intTable(up_reg, folder = tables_folder, fileName = 'Bulk_hEGCLCs_vs_hiPSC_upregulated.xlsx', save = True)


down_reg = deg_table_filtered[deg_table_filtered.logFC < 0]
down_regGenes = down_reg.index.tolist()


print(f'The number of downregulated genes is {len(down_regGenes)}')

The number of downregulated genes is 34


intTable(down_reg, folder = tables_folder, fileName = 'Bulk_hEGCLCs_vs_hiPSC_downregulated.xlsx', save = True)


allDownSelected = deg_table.index.isin(down_regGenes).astype('int').tolist()


all_sign = up_regGenes + down_regGenes
allSelected = deg_table.index.isin(all_sign).astype('int').tolist()


print(f'The total number of modulated genes is {len(all_sign)}')

The total number of modulated genes is 116


intTable(deg_table_filtered, folder = tables_folder, fileName = 'Bulk_hEGCLCs_vs_hiPSC_filtered.xlsx', save = True)


df_ordered = pd.DataFrame(adata_bulk[:,up_regGenes].X[[0, 1, 2]]).T
df_ordered.index = adata_bulk[:,up_regGenes].var_names
df_ordered['Median'] = df_ordered.median(axis = 1)
df_ordered= df_ordered.sort_values(by = 'Median')


sc.pl.heatmap(adata[adata.obs.CellTypes.isin(['hiPSC', 'hEGCLC'])], var_names = df_ordered.index, 
              groupby = 'CellTypes', cmap = 'PuBu', swap_axes = True, figsize = (10,10), 
               standard_scale='obs')


sc.pl.heatmap(adata[adata.obs.CellTypes.isin(['hiPSC', 'hEGCLC'])], var_names = df_ordered.index, 
              groupby = 'CellTypes', cmap = 'PuBu', swap_axes = True, figsize = (10,10), 
               standard_scale='var')


sc.pl.heatmap(adata_bulk, 
              var_names = df_ordered.index,
              groupby = 'CellTypes', 
              cmap = 'PuBu', 
              swap_axes = True, 
              figsize = (5,10), standard_scale = 'obs')


sc.pl.heatmap(adata_bulk, 
              var_names = df_ordered.index,
              groupby = 'CellTypes', 
              cmap = 'PuBu', 
              swap_axes = True, 
              figsize = (5,10), standard_scale = 'var')


df_ordered = pd.DataFrame(adata_bulk[:,down_regGenes].X[[0, 1, 2]]).T
df_ordered.index = adata_bulk[:,down_regGenes].var_names
df_ordered['Median'] = df_ordered.median(axis = 1)
df_ordered= df_ordered.sort_values(by = 'Median')


sc.pl.heatmap(adata[adata.obs.CellTypes.isin(['hiPSC', 'hEGCLC'])], var_names = df_ordered.index, 
              groupby = 'CellTypes', cmap = 'PuBu', swap_axes = True, figsize = (10,10), 
               standard_scale='obs')


sc.pl.heatmap(adata[adata.obs.CellTypes.isin(['hiPSC', 'hEGCLC'])], var_names = df_ordered.index, 
              groupby = 'CellTypes', cmap = 'PuBu', swap_axes = True, figsize = (10,10), 
               standard_scale='var')


sc.pl.heatmap(adata_bulk, 
              var_names = df_ordered.index,
              groupby = 'CellTypes', 
              cmap = 'PuBu', 
              swap_axes = True, 
              figsize = (5,10), standard_scale = 'obs')


sc.pl.heatmap(adata_bulk, 
              var_names = df_ordered.index,
              groupby = 'CellTypes', 
              cmap = 'PuBu', 
              swap_axes = True, 
              figsize = (5,10), standard_scale = 'var')


print(f'The number of all filtered DEGs in hEGCLC vs hiPSC is {len(deg_table_filtered)}')
print(f'The number of upregulated genes in hEGCLC vs hiPSC is {len(up_regGenes)}')
print(f'The number of downregulated genes in hEGCLC vs hiPSC is {len(down_regGenes)}')

The number of all filtered DEGs in hEGCLC vs hiPSC is 116
The number of upregulated genes in hEGCLC vs hiPSC is 82
The number of downregulated genes in hEGCLC vs hiPSC is 34


%%R -i allUpSelected -i allGenes -i up_regGenes


#allGenes_v <- c(allUpSelected)
allGenes_v <- factor(as.integer(allGenes %in% up_regGenes))
#print(allGenes_v)
names(allGenes_v) <- allGenes
allGenes_v <- unlist(allGenes_v)


geneNames <- c(allGenes)

ann_org_BP <- topGO::annFUN.org(whichOnto='BP', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

ann_org_MF <- topGO::annFUN.org(whichOnto='MF', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

ann_org_CC <- topGO::annFUN.org(whichOnto='CC', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')


selection <- function(allScores){return (as.logical(allScores))}


params <- list(GoEnTh = 2, GoPvalTh = 0.01)


len(up_regGenes)

82


up_regGenes[0:10]

['ZNF560',
 'TAF9B',
 'ZNF662',
 'CRISP2',
 'NKAPL',
 'PTPN20',
 'CRYAB',
 'HSPB2',
 'CTSF',
 'CYP4F22']


%%R
print(allGenes_v[0:10])

  ZNF560    TAF9B   ZNF662   CRISP2   TWIST2    NKAPL   PTPN20 HIST1H1D 
       1        1        1        1        0        1        1        0 
   CRYAB    HSPB2 
       1        1 
Levels: 0 1


%%R -o ResBPAll -o ResSel
ResBPAll <- topGOResults_new(Genes=allGenes_v, GO2genes=ann_org_BP, ontology='BP', 
                         desc=NULL, nodeSize=15, algorithm='weight01', statistic='fisher', 
                         EnTh=params$GoEnTh, PvalTh=params$GoPvalTh, minTerms=15, geneTh=4,
                         saveRes=FALSE, maxAnn = 500)

ResSel <- ResBPAll$ResSel


%%R

ResBPAll$GOdata

------------------------- topGOdata object -------------------------

 Description:
   -  BP allGenes_v 

 Ontology:
   -  BP 

 13921 available genes (all genes from the array):
   - symbol:  ZNF560 TAF9B ZNF662 CRISP2 TWIST2  ...
   - 82  significant genes. 

 12478 feasible genes (genes that can be used in the analysis):
   - symbol:  ZNF560 TAF9B ZNF662 TWIST2 NKAPL  ...
   - 71  significant genes. 

 GO graph (nodes with at least  15  genes):
   - a graph with directed edges
   - number of nodes = 4938 
   - number of edges = 10661 

------------------------- topGOdata object -------------------------


len(up_regGenes)

82


up_regGenes[0:10]

['ZNF560',
 'TAF9B',
 'ZNF662',
 'CRISP2',
 'NKAPL',
 'PTPN20',
 'CRYAB',
 'HSPB2',
 'CTSF',
 'CYP4F22']


%%R
print(allGenes_v[0:10])

  ZNF560    TAF9B   ZNF662   CRISP2   TWIST2    NKAPL   PTPN20 HIST1H1D 
       1        1        1        1        0        1        1        0 
   CRYAB    HSPB2 
       1        1 
Levels: 0 1


results_table_py = ro.conversion.rpy2py(ResSel)
results_table_py['Statistics'] = results_table_py['Statistics'].astype('float64')
len(results_table_py)

15


results_table_py['-log10(pvalue)'] = - np.log10(results_table_py.Statistics)
results_table_py['Significant/Annotated'] = results_table_py['Significant'] / results_table_py['Annotated']


intTable(results_table_py, folder = tables_folder, fileName = 'GO_upregulated_hEGCLCs_vs_hiPSC_BP.xlsx', save = True)


#| echo: False

default_width = 8
default_height = 4
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics


%%R
image = bubbleplot(ResSel, Ont = 'BP', fillCol = 'red', terms = 5)
ggsave(file="./figures/GO_BP_up_hEGCLC_vs_hiPSC.pdf", plot=image, width=8, height=4)

bubbleplot(ResSel, Ont = 'BP', fillCol = 'red', terms = 5)


#| echo: False

default_width = 12
default_height = 4
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics


%%R -i deg_table_filtered

image = plotGenesInTerm_v3(ResBPAll$ResSel, ResBPAll$GOdata, DEGs = deg_table_filtered, nterms=15, ngenes=12,
                            fillCol='red')
ggsave(file="./figures/Genes_in_term_BP_up_hEGCLC_vs_hiPSC.pdf", plot=image, width=12, height=4)

plotGenesInTerm_v3(ResBPAll$ResSel, ResBPAll$GOdata, DEGs = deg_table_filtered, nterms=5, ngenes=12,
                             fillCol='red')


%%R -i tables_folder
saveGenesInTerm(ResBPAll$ResSel, ResBPAll$GOdata, nterms = 20, 
                path = './tables/genesInTerm_hiPSC_hEGCLCs_GO_BP_uphEGCLC.csv', 
                )


#| echo: False

default_width = 15
default_height = 9
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics


%%R -o ResMFAll -o ResSel
ResMFAll <- topGOResults_new(Genes=allGenes_v, GO2genes=ann_org_MF, ontology='MF', 
                         desc=NULL, nodeSize=15, algorithm='weight01', statistic='fisher', 
                         EnTh=params$GoEnTh, PvalTh=params$GoPvalTh, minTerms=15, geneTh=4,
                         saveRes=FALSE, outDir=paste0(OutputFolder), fileName='MFAll', maxAnn = 500)

ResSel <- ResMFAll$ResSel


results_table_py = ro.conversion.rpy2py(ResSel)
results_table_py['Statistics'] = results_table_py['Statistics'].astype('float64')
len(results_table_py)

9


results_table_py['-log10(pvalue)'] = - np.log10(results_table_py.Statistics)
results_table_py['Significant/Annotated'] = results_table_py['Significant'] / results_table_py['Annotated']


intTable(results_table_py, folder = tables_folder , fileName = 'GO_upregulated_hEGCLCs_vs_hiPSC_MF.xlsx', save = True)


%%R
bubbleplot(ResSel, Ont = 'MF', fillCol = 'red')


#| echo: False

default_width = 20
default_height = 7
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics


%%R -i deg_table_filtered
plotGenesInTerm_v3(ResMFAll$ResSel, ResMFAll$GOdata, DEGs = deg_table_filtered, nterms=15, ngenes=12,
                             fillCol='red')


%%R
saveGenesInTerm(ResMFAll$ResSel, ResMFAll$GOdata, nterms = 20, 
                path = './tables/genesInTerm_hiPSC_hEGCLCs_GO_MF_uphEGCLC.csv')


#| echo: False

default_width = 15
default_height = 9
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics


%%R -o ResCCAll -o ResSel
ResCCAll <- topGOResults_new(Genes=allGenes_v, GO2genes=ann_org_CC, ontology='CC', 
                         desc=NULL, nodeSize=15, algorithm='weight01', statistic='fisher', 
                         EnTh=params$GoEnTh, PvalTh=params$GoPvalTh, minTerms=15, geneTh=4,
                         saveRes=FALSE, outDir=paste0(OutputFolder), fileName='CCAll', maxAnn = 500)

ResSel <- ResCCAll$ResSel


results_table_py = ro.conversion.rpy2py(ResSel)
results_table_py['Statistics'] = results_table_py['Statistics'].astype('float64')
len(results_table_py)

6


results_table_py['-log10(pvalue)'] = - np.log10(results_table_py.Statistics)
results_table_py['Significant/Annotated'] = results_table_py['Significant'] / results_table_py['Annotated']


intTable(results_table_py, folder = tables_folder, fileName = 'GO_upregulated_hEGCLCs_vs_hiPSC_CC.xlsx', save = True)


%%R
bubbleplot(ResSel, Ont = 'CC', fillCol = 'red')


#| echo: False

default_width = 20
default_height = 7
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics


%%R -i deg_table_filtered
plotGenesInTerm_v3(ResCCAll$ResSel, ResCCAll$GOdata, DEGs = deg_table_filtered, nterms=15, ngenes=12,
                             fillCol='red')


%%R
saveGenesInTerm(ResCCAll$ResSel, ResCCAll$GOdata, nterms = 20, path = './tables/genesInTerm_hiPSC_hEGCLCs_GO_CC_uphEGCLC.csv')


#| echo: False

default_width = 15
default_height = 9
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics


msigdb = dc.get_resource('MSigDB')
msigdb


curated = msigdb[msigdb['collection'].isin(['reactome_pathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]

aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 500].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()


rank = pd.DataFrame(deg_table_filtered.logFC)
rank = rank[rank.logFC > 2]

rank_copy = rank.copy()
rank_copy['pval'] = deg_table_filtered.loc[rank.index].FDR


rank_copy


results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)

No significant term was found


intTable(results_table_py, folder = tables_folder , fileName = 'GO_up_hEGCLCs_vs_hiPSC_reactome.xlsx', save = True)


curated = msigdb[msigdb['collection'].isin(['kegg_pathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]

aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 500].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()


results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)

No significant term was found


intTable(results_table_py, folder = tables_folder , fileName = 'GO_up_hEGCLCs_vs_hiPSC_kegg.xlsx', save = True)


curated = msigdb[msigdb['collection'].isin(['wikipathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]

aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 500].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()


results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)

No significant term was found


intTable(results_table_py, folder = tables_folder , fileName = 'GO_up_hEGCLCs_vs_hiPSC_wikipathways.xlsx', save = True)


%%R -i allDownSelected -i allGenes

allGenes_v <- c(allDownSelected)
#print(allGenes_v)
names(allGenes_v) <- allGenes
allGenes_v <- unlist(allGenes_v)
allGenes_v <- as.factor(allGenes_v)


geneNames <- c(allGenes)

ann_org_BP <- topGO::annFUN.org(whichOnto='BP', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

ann_org_MF <- topGO::annFUN.org(whichOnto='MF', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

ann_org_CC <- topGO::annFUN.org(whichOnto='CC', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')


%%R -o ResBPAll -o ResSel
ResBPAll <- topGOResults_new(Genes=allGenes_v, GO2genes=ann_org_BP, ontology='BP', 
                         desc=NULL, nodeSize=15, algorithm='weight01', statistic='fisher', 
                         EnTh=params$GoEnTh, PvalTh=params$GoPvalTh, minTerms=15, geneTh=4,
                         saveRes=FALSE, outDir=paste0(OutputFolder), fileName='BPAll', maxAnn = 500)

ResSel <- ResBPAll$ResSel


results_table_py = ro.conversion.rpy2py(ResSel)
results_table_py['Statistics'] = results_table_py['Statistics'].astype('float64')
len(results_table_py)

15


results_table_py['-log10(pvalue)'] = - np.log10(results_table_py.Statistics)
results_table_py['Significant/Annotated'] = results_table_py['Significant'] / results_table_py['Annotated']


intTable(results_table_py, folder = tables_folder , fileName = 'GO_downregulated_hEGCLCs_vs_hiPSC_BP.xlsx', save = True)


%%R
bubbleplot(ResSel, Ont = 'BP', fillCol = 'blue')


#| echo: False

default_width = 20
default_height = 7
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics


%%R -i deg_table_filtered
plotGenesInTerm_v3(ResBPAll$ResSel, ResBPAll$GOdata, DEGs = deg_table_filtered, nterms=15, ngenes=12,
                             fillCol='blue')


%%R
saveGenesInTerm(ResBPAll$ResSel, ResBPAll$GOdata, nterms = 20, path = './tables/genesInTerm_iPSC_hEGCLCs_GO_BP_upEG.csv')


#| echo: False

default_width = 15
default_height = 9
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics


%%R -o ResMFAll -o ResSel
ResMFAll <- topGOResults_new(Genes=allGenes_v, GO2genes=ann_org_MF, ontology='MF', 
                         desc=NULL, nodeSize=15, algorithm='weight01', statistic='fisher', 
                         EnTh=params$GoEnTh, PvalTh=params$GoPvalTh, minTerms=15, geneTh=4,
                         saveRes=FALSE, outDir=paste0(OutputFolder), fileName='MFAll', maxAnn = 500)

ResSel <- ResMFAll$ResSel


results_table_py = ro.conversion.rpy2py(ResSel)
results_table_py['Statistics'] = results_table_py['Statistics'].astype('float64')
len(results_table_py)

2


results_table_py['-log10(pvalue)'] = - np.log10(results_table_py.Statistics)
results_table_py['Significant/Annotated'] = results_table_py['Significant'] / results_table_py['Annotated']


intTable(results_table_py, folder = tables_folder , fileName = 'GO_downregulated_hEGCLCs_vs_hiPSC_MF.xlsx', save = True)


%%R
bubbleplot(ResSel, Ont = 'MF', fillCol = 'blue')


#| echo: False

default_width = 20
default_height = 7
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics


%%R -i deg_table_filtered
plotGenesInTerm_v3(ResMFAll$ResSel, ResMFAll$GOdata, DEGs = deg_table_filtered, nterms=15, ngenes=12,
                             fillCol='blue')


%%R
saveGenesInTerm(ResSel, ResMFAll$GOdata, nterms = 20, path = './tables/genesInTerm_iPSC_hEGCLCs_GO_MF_upEG.csv', 
                )


#| echo: False

default_width = 15
default_height = 9
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics


%%R -o ResCCAll -o ResSel
ResCCAll <- topGOResults_new(Genes=allGenes_v, GO2genes=ann_org_CC, ontology='CC', 
                         desc=NULL, nodeSize=15, algorithm='weight01', statistic='fisher', 
                         EnTh=params$GoEnTh, PvalTh=params$GoPvalTh, minTerms=15, geneTh=4,
                         saveRes=FALSE, outDir=paste0(OutputFolder), fileName='CCAll', maxAnn = 500)

ResSel <- ResCCAll$ResSel


results_table_py = ro.conversion.rpy2py(ResSel)
results_table_py['Statistics'] = results_table_py['Statistics'].astype('float64')
len(results_table_py)

0


results_table_py['-log10(pvalue)'] = - np.log10(results_table_py.Statistics)
results_table_py['Significant/Annotated'] = results_table_py['Significant'] / results_table_py['Annotated']


intTable(results_table_py, folder = tables_folder , fileName = 'GO_downregulated_hEGCLCs_vs_hiPSC_CC.xlsx', save = True)


curated = msigdb[msigdb['collection'].isin(['reactome_pathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]

aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 500].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()


rank = pd.DataFrame(deg_table_filtered.logFC)
rank = rank[rank.logFC < 0]

rank_copy = rank.copy()
rank_copy['pval'] = deg_table_filtered.loc[rank.index].FDR


results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)

No significant term was found


intTable(results_table_py, folder = tables_folder , fileName = 'GO_down_hEGCLCs_vs_hiPSC_reactome.xlsx', save = True)


curated = msigdb[msigdb['collection'].isin(['kegg_pathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]

aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 500].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()


results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
results_table_py

No significant term was found


#results_table_py = getAnnGenes(results = results_table_py, GO2gene = GO2gene['kegg_pathways'], DEGs = rank_copy)
#intTable(results_table_py, folder = tables_folder , fileName = 'GO_down_hEGCLCs_vs_hiPSC_kegg.xlsx', save = True)


#results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
#_, df = plotGenesInTerm(results_table_py, GO2gene['kegg_pathways'], rank_copy, n_top_terms = 10, n_top_genes = 15, cmap = cmap_down)


#intTable(df, folder = tables_folder , fileName = 'genesInTerm_hiPSC_hEGCLCs_kegg_downhEGCLC.csv.xlsx', save = True)


curated = msigdb[msigdb['collection'].isin(['wikipathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]

aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 500].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()


results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
#results_table_py = getAnnGenes(results = results_table_py, GO2gene = GO2gene, DEGs = rank_copy)

No significant term was found


#intTable(results_table_py, folder = tables_folder , fileName = 'GO_down_hEGCLCs_vs_hiPSC_wikipathways.xlsx', save = True)


#results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
#_, df = plotGenesInTerm(results_table_py, GO2gene, rank_copy, n_top_terms = 10, n_top_genes = 15, cmap = cmap_down)


#intTable(df, folder = tables_folder , fileName = 'genesInTerm_hiPSC_hEGCLCs_wikipathways_downhEGCLC.csv.xlsx', save = True)


%%R -i allSelected -i allGenes

allGenes_v <- c(allSelected)
#print(allGenes_v)
names(allGenes_v) <- allGenes
allGenes_v <- unlist(allGenes_v)
allGenes_v <- as.factor(allGenes_v)

geneNames <- c(allGenes)

ann_org_BP <- topGO::annFUN.org(whichOnto='BP', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

ann_org_MF <- topGO::annFUN.org(whichOnto='MF', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

ann_org_CC <- topGO::annFUN.org(whichOnto='CC', feasibleGenes=names(allGenes_v), 
                           mapping='org.Hs.eg', ID='symbol')

selection <- function(allScores){return (as.logical(allScores))}


%%R -o ResBPAll -o ResSel
ResBPAll <- topGOResults_new(Genes=allGenes_v, GO2genes=ann_org_BP, ontology='BP', 
                         desc=NULL, nodeSize=15, algorithm='weight01', statistic='fisher', 
                         EnTh=params$GoEnTh, PvalTh=params$GoPvalTh, minTerms=15, geneTh=4,
                         saveRes=FALSE, outDir=paste0(OutputFolder), fileName='BPAll', maxAnn = 500)

ResSel <- ResBPAll$ResSel


results_table_py = ro.conversion.rpy2py(ResSel)
results_table_py['Statistics'] = results_table_py['Statistics'].astype('float64')
len(results_table_py)

15


results_table_py['-log10(pvalue)'] = - np.log10(results_table_py.Statistics)
results_table_py['Significant/Annotated'] = results_table_py['Significant'] / results_table_py['Annotated']


intTable(results_table_py, folder = tables_folder , fileName = 'GO_allSignificant_hEGCLCs_vs_hiPSC_BP.xlsx', save = True)


%%R

bubbleplot(ResSel, Ont = 'BP', fillCol = 'forestgreen')


#| echo: False

default_width = 20
default_height = 7
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics


%%R -i deg_table_filtered
plotGenesInTerm_v3(ResBPAll$ResSel, ResBPAll$GOdata, DEGs = deg_table_filtered, nterms=15, ngenes=12,
                             fillCol='forestgreen')


%%R
saveGenesInTerm(ResBPAll$ResSel, ResBPAll$GOdata, nterms = 20, path = './tables/genesInTerm_iPSC_hEGCLCs_GO_BP_allSignificant.csv', SE = deg_table_filtered)


#| echo: False

default_width = 15
default_height = 9
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics


%%R -o ResMFAll -o ResSel
ResMFAll <- topGOResults_new(Genes=allGenes_v, GO2genes=ann_org_MF, ontology='MF', 
                         desc=NULL, nodeSize=15, algorithm='weight01', statistic='fisher', 
                         EnTh=params$GoEnTh, PvalTh=params$GoPvalTh, minTerms=15, geneTh=4,
                         saveRes=FALSE, outDir=paste0(OutputFolder), fileName='MFAll', maxAnn = 500)

ResSel <- ResMFAll$ResSel


results_table_py = ro.conversion.rpy2py(ResSel)
results_table_py['Statistics'] = results_table_py['Statistics'].astype('float64')
len(results_table_py)

8


results_table_py['-log10(pvalue)'] = - np.log10(results_table_py.Statistics)
results_table_py['Significant/Annotated'] = results_table_py['Significant'] / results_table_py['Annotated']


intTable(results_table_py, folder = tables_folder , fileName = 'GO_allSignificant_hEGCLCs_vs_hiPSC_MF.xlsx', save = True)


%%R

bubbleplot(ResSel, Ont = 'MF', fillCol = 'forestgreen')


#| echo: False

default_width = 20
default_height = 7
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics


%%R -i deg_table_filtered
plotGenesInTerm_v3(ResSel, ResMFAll$GOdata, DEGs = deg_table_filtered, nterms=12, ngenes=12,
                             fillCol='forestgreen')


%%R
saveGenesInTerm(ResSel, ResMFAll$GOdata, nterms = 20, path = './tables/genesInTerm_iPSC_hEGCLCs_GO_MF_allSignificant.csv')


#| echo: False

default_width = 15
default_height = 9
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics


%%R -o ResCCAll -o ResSel
ResCCAll <- topGOResults_new(Genes=allGenes_v, GO2genes=ann_org_CC, ontology='CC', 
                         desc=NULL, nodeSize=15, algorithm='weight01', statistic='fisher', 
                         EnTh=params$GoEnTh, PvalTh=params$GoPvalTh, minTerms=15, geneTh=4,
                         saveRes=FALSE, outDir=paste0(OutputFolder), fileName='CCAll', maxAnn = 500)

ResSel <- ResCCAll$ResSel


results_table_py = ro.conversion.rpy2py(ResSel)
results_table_py['Statistics'] = results_table_py['Statistics'].astype('float64')
len(results_table_py)

10


results_table_py['-log10(pvalue)'] = - np.log10(results_table_py.Statistics)
results_table_py['Significant/Annotated'] = results_table_py['Significant'] / results_table_py['Annotated']


intTable(results_table_py, folder = tables_folder , fileName = 'GO_allSignificant_hEGCLCs_vs_hiPSC_BP.xlsx', save = True)


%%R

bubbleplot(ResSel, Ont = 'CC', fillCol = 'forestgreen')


#| echo: False

default_width = 20
default_height = 7
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics


%%R -i deg_table_filtered
plotGenesInTerm_v3(ResCCAll$ResSel, ResCCAll$GOdata, DEGs = deg_table_filtered, nterms=15, ngenes=12,
                             fillCol='forestgreen')


%%R
saveGenesInTerm(ResCCAll$ResSel, ResCCAll$GOdata, nterms = 20, path = './tables/genesInTerm_iPSC_hEGCLCs_GO_CC_allSignificant.csv', SE = deg_table_filtered)


#| echo: False

default_width = 15
default_height = 9
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics


curated = msigdb[msigdb['collection'].isin(['reactome_pathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]

aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 500].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()


rank = pd.DataFrame(deg_table_filtered.logFC)


rank_copy = rank.copy()
rank_copy['pval'] = deg_table_filtered.loc[rank.index].FDR


results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)

No significant term was found


intTable(results_table_py, folder = tables_folder , fileName = 'GO_all_hEGCLCs_vs_hiPSC_reactome.xlsx', save = True)


#_, df = plotGenesInTerm(results_table_py, GO2gene['reactome_pathways'], rank_copy, n_top_terms = 10, n_top_genes = 15, cmap = cmap_all)


#intTable(df, folder = tables_folder , fileName = 'genesInTerm_hiPSC_hEGCLCs_Reactome_allSignificant.xlsx', save = True)


curated = msigdb[msigdb['collection'].isin(['kegg_pathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]

aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 500].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()


results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)

No significant term was found


intTable(results_table_py, folder = tables_folder , fileName = 'GO_allSignificant_hEGCLCs_vs_hiPSC_kegg.xlsx', save = True)


#_, df = plotGenesInTerm(results_table_py, GO2gene['kegg_pathways'], rank_copy, n_top_terms = 10, n_top_genes = 15, cmap = cmap_all)


#intTable(df, folder = tables_folder , fileName = 'genesInTerm_hiPSC_hEGCLCs_kegg_allSignificant.csv.xlsx', save = True)


curated = msigdb[msigdb['collection'].isin(['wikipathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]

aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 500].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()


results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
results_table_py = getAnnGenes(results = results_table_py, GO2gene = GO2gene, DEGs = rank_copy)

results_table_py = results_table_py[results_table_py['pvals'] < 0.05]
results_table_py = results_table_py[results_table_py['n_gene_annotated'] < 500]
results_table_py = results_table_py[results_table_py['n_gene_annotated'] > 15]
results_table_py = results_table_py[results_table_py['n_gene_significant'] > 4]


intTable(results_table_py, folder = tables_folder , fileName = 'GO_allSignificant_hEGCLCs_vs_hiPSC_wikipathways.xlsx', save = True)


_, df = plotGenesInTerm(results_table_py, GO2gene, rank_copy, n_top_terms = 1, n_top_genes = 15, cmap = cmap_all)


intTable(df, folder = tables_folder , fileName = 'genesInTerm_hiPSC_hEGCLCs_wikipathways_allSignificant.csv.xlsx', save = True)

	EG2_hEGCLC	EG31_hEGCLC	EG9_hEGCLC	IEO_13_A_R_hiPSC	ht15_FIX_hiPSC
A2ML1	153.0	206.0	122.0	142.0	139.0
A4GALT	1347.0	1298.0	1009.0	991.0	1108.0
AAAS	5013.0	4689.0	5185.0	7215.0	9254.0
AACS	5741.0	4033.0	6260.0	5337.0	5726.0
AADACL3	175.0	128.0	205.0	35.0	16.0
...	...	...	...	...	...
ZXDC	2341.0	2347.0	2445.0	2548.0	3444.0
ZYG11A	165.0	231.0	99.0	517.0	754.0
ZYG11B	2746.0	2526.0	3095.0	2903.0	4624.0
ZYX	22777.0	16653.0	27183.0	12780.0	17644.0
ZZEF1	1280.0	1224.0	1550.0	1322.0	1814.0

	genesymbol	collection	geneset
0	MAFF	chemical_and_genetic_perturbations	BOYAULT_LIVER_CANCER_SUBCLASS_G56_DN
1	MAFF	chemical_and_genetic_perturbations	ELVIDGE_HYPOXIA_UP
2	MAFF	chemical_and_genetic_perturbations	NUYTTEN_NIPP1_TARGETS_DN
3	MAFF	immunesigdb	GSE17721_POLYIC_VS_GARDIQUIMOD_4H_BMDC_DN
4	MAFF	chemical_and_genetic_perturbations	SCHAEFFER_PROSTATE_DEVELOPMENT_12HR_UP
...	...	...	...
3838543	PRAMEF22	go_biological_process	GOBP_POSITIVE_REGULATION_OF_CELL_POPULATION_PR...
3838544	PRAMEF22	go_biological_process	GOBP_APOPTOTIC_PROCESS
3838545	PRAMEF22	go_biological_process	GOBP_REGULATION_OF_CELL_DEATH
3838546	PRAMEF22	go_biological_process	GOBP_NEGATIVE_REGULATION_OF_DEVELOPMENTAL_PROCESS
3838547	PRAMEF22	go_biological_process	GOBP_NEGATIVE_REGULATION_OF_CELL_DEATH

	logFC	pval
ZNF560	12.498048	1.275713e-42
TAF9B	5.954427	9.861447e-31
ZNF662	6.313032	1.226581e-29
CRISP2	10.095322	1.798185e-27
NKAPL	9.875911	2.885773e-26
...	...	...
CALHM5	2.473018	5.583475e-03
SCARA5	2.112675	7.137824e-03
MYOF	2.052413	7.162281e-03
TM4SF1	2.879223	8.117776e-03
SCIN	2.122390	9.619166e-03

Prepare environment¶

Set up folders¶

Load data¶

Subset on cell types of interest¶

Prepare pseudobulk¶

edgeR analysis¶

Prepare pseudobulk count and metadata¶

Correlation between celltypes expression¶

Set up model matrix¶

Run glm¶

Normalize counts¶

Get results¶

Save DEGs unfiltered¶

Filter results by logFC and LR¶

Select up and downregulated DEGs¶

Upregulated in hEGCLC¶

Downregulated in EGCLC (= up in iPSC)¶

All modulated genes¶

Heatmaps if DEGs¶

Upregulated in hEGCLCs - Downregulated in hiPSC¶

Single cell data¶

Pseudobulk¶

Downregulated in hEGCLCs - Upregulated in hiPSC¶

Single cell data¶

Pseudobulk¶

topGO¶

hEGCLC upregulated¶

Get annotations for upreg genes¶

Biological Process¶

Molecular Function¶

Cellular Components¶

Pathway annotation¶

Reactome¶

KEGG¶

Wikipathway¶

hEGCLCs downregulated¶

Get annotations for downreg genes¶

Biological Process¶

Molecular Function¶

Cellular Components¶

Pathway annotation¶

Reactome¶

KEGG¶

Wikipathway¶

All significant (up and down)¶

Get annotations¶

Biological Process¶

Molecular Function¶

Cellular Components¶

Pathway annotation¶

Reactome¶

KEGG¶

Wikipathway¶

Save¶