Load Python packages:
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import pandas as pd
import numpy as np
import random
import itertools
from tqdm import tqdm
import decoupler as dc
import sys
sys.path.append("./../../../../utilities_folder/")
from utilities import save_object, load_object, intTable, plotGenesInTerm, run_ora_catchErrors, getAnnGenes, filter_by_expr
Set R environment with rpy2:
import rpy2.rinterface_lib.callbacks
import anndata2ri
import logging
from rpy2.robjects import pandas2ri
import rpy2.robjects as ro
from scipy import stats
sc.settings.verbosity = 0
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython
Set up parameters for Python plots:
%matplotlib inline
sc.set_figure_params(dpi = 600, fontsize = 20)
plt.rcParams['pdf.fonttype'] = 'truetype'
cmap_up = sns.light_palette("red", as_cmap=True)
cmap_down = sns.light_palette("blue", as_cmap=True)
cmap_all = sns.light_palette("seagreen", as_cmap=True)
color_palette = {
'iMeLC': '#377eb8',
'hEGCLC': '#ff7f00',
'hPGCLC': '#4daf4a',
'hiPSC': '#dd1c77',
'HEP': '#a65628',
'MLC': '#f781bf',
'EndLC': '#999999',
'AmLC': '#80b1d3',
}
Import R libraries:
%%R
source('./../../../../utilities_folder/GO_helper.r')
loc <- './../../../../R_loc' # pointing to the renv environment
.libPaths(loc)
library('topGO')
library('edgeR')
library('org.Hs.eg.db')
library(dplyr)
library(ggplot2)
Import some R functions directly in Python:
edgeR_topTags = ro.r['topTags']
edgeR_glmLRT = ro.r['glmLRT']
edgeR_glmQLFTest = ro.r['glmQLFTest']
as_data_frame = ro.r['as.data.frame']
rprint = rpy2.robjects.globalenv.find("print")
Set up some parameters for R plots:
#| echo: false
default_units = 'in'
default_res = 300
default_width = 10
default_height = 9
import rpy2
old_setup_graphics = rpy2.ipython.rmagic.RMagics.setup_graphics
def new_setup_graphics(self, args):
if getattr(args, 'units') is not None:
if args.units != default_units: # a different units argument was passed, do not apply defaults
return old_setup_graphics(self, args)
args.units = default_units
if getattr(args, 'res') is None:
args.res = default_res
if getattr(args, 'width') is None:
args.width = default_width
if getattr(args, 'height') is None:
args.height = default_height
return old_setup_graphics(self, args)
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics
tables_folder = './tables/'
Here we load the anndata:
adata = sc.read("./../01_Annotation/2_All_WithCellCycle.h5ad")
adata
AnnData object with n_obs × n_vars = 27925 × 14582 obs: 'sample_id', 'run_id', 'probe_barcode_ids', 'subject', 'line', 'specimen', 'stage', 'condition', 'notes', 'seqRun', 'original_name', 'project', 'who', 'SC_derivation', 'micoplasma', 'mosaic', 'genSite', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'gene_UMI_ratio', 'log1p_gene_UMI_ratio', 'scDblFinder_score', 'scDblFinder_class', 'n_counts', 'n_genes', 'leiden', 'score_pluripotency', 'score_mesoderm', 'score_PS', 'score_EM', 'score_AM', 'score_Endo', 'score_Epi', 'score_HEP', 'score_NM', 'score_ExM', 'score_Ery', 'score_EAE', 'score_Amnion_LC', 'score_AxM', 'score_Amnion', 'score_NNE', 'score_PGC', 'score_hPGCLCs', 'score_DE1', 'score_DE2', 'score_Hypoblast', 'score_YS', 'score_EMPs', 'score_Endothelium', 'score_Myeloid_Progenitors', 'score_Megakaryocyte_Erythroid_progenitors', 'CellTypes', 'S_score', 'G2M_score', 'phase' var: 'n_cells', 'mito', 'ribo', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection', 'triku_distance', 'triku_distance_uncorrected', 'triku_highly_variable' uns: 'CellTypes_colors', 'dendrogram_CellTypes', 'dendrogram_leiden', 'dendrogram_specimen', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'phase_colors', 'rank_genes_groups', 'run_id_colors', 'sample_id_colors', 'specimen_colors', 'stage_colors', 'triku_params', 'umap' obsm: 'X_pca', 'X_pca_harmony', 'X_umap' varm: 'PCs' layers: 'raw' obsp: 'connectivities', 'distances'
Here we load the dictionary that associates to each GO term its genes:
GO2gene = load_object('./../../../../data/GO2gene.pickle')
Since we are interested in a comparison between hEGCLC and hiPSC, we build the pseudobulk on the subsetted AnnData from which we filter the genes that are not expressed in more than 100 cells.
adata_sub = adata[adata.obs.CellTypes.isin(['hEGCLC', 'hiPSC'])].copy()
adata_sub
AnnData object with n_obs × n_vars = 14633 × 14582 obs: 'sample_id', 'run_id', 'probe_barcode_ids', 'subject', 'line', 'specimen', 'stage', 'condition', 'notes', 'seqRun', 'original_name', 'project', 'who', 'SC_derivation', 'micoplasma', 'mosaic', 'genSite', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'gene_UMI_ratio', 'log1p_gene_UMI_ratio', 'scDblFinder_score', 'scDblFinder_class', 'n_counts', 'n_genes', 'leiden', 'score_pluripotency', 'score_mesoderm', 'score_PS', 'score_EM', 'score_AM', 'score_Endo', 'score_Epi', 'score_HEP', 'score_NM', 'score_ExM', 'score_Ery', 'score_EAE', 'score_Amnion_LC', 'score_AxM', 'score_Amnion', 'score_NNE', 'score_PGC', 'score_hPGCLCs', 'score_DE1', 'score_DE2', 'score_Hypoblast', 'score_YS', 'score_EMPs', 'score_Endothelium', 'score_Myeloid_Progenitors', 'score_Megakaryocyte_Erythroid_progenitors', 'CellTypes', 'S_score', 'G2M_score', 'phase' var: 'n_cells', 'mito', 'ribo', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection', 'triku_distance', 'triku_distance_uncorrected', 'triku_highly_variable' uns: 'CellTypes_colors', 'dendrogram_CellTypes', 'dendrogram_leiden', 'dendrogram_specimen', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'phase_colors', 'rank_genes_groups', 'run_id_colors', 'sample_id_colors', 'specimen_colors', 'stage_colors', 'triku_params', 'umap' obsm: 'X_pca', 'X_pca_harmony', 'X_umap' varm: 'PCs' layers: 'raw' obsp: 'connectivities', 'distances'
sc.pp.filter_genes(adata_sub, min_cells = 100)
adata_sub
AnnData object with n_obs × n_vars = 14633 × 13921 obs: 'sample_id', 'run_id', 'probe_barcode_ids', 'subject', 'line', 'specimen', 'stage', 'condition', 'notes', 'seqRun', 'original_name', 'project', 'who', 'SC_derivation', 'micoplasma', 'mosaic', 'genSite', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'gene_UMI_ratio', 'log1p_gene_UMI_ratio', 'scDblFinder_score', 'scDblFinder_class', 'n_counts', 'n_genes', 'leiden', 'score_pluripotency', 'score_mesoderm', 'score_PS', 'score_EM', 'score_AM', 'score_Endo', 'score_Epi', 'score_HEP', 'score_NM', 'score_ExM', 'score_Ery', 'score_EAE', 'score_Amnion_LC', 'score_AxM', 'score_Amnion', 'score_NNE', 'score_PGC', 'score_hPGCLCs', 'score_DE1', 'score_DE2', 'score_Hypoblast', 'score_YS', 'score_EMPs', 'score_Endothelium', 'score_Myeloid_Progenitors', 'score_Megakaryocyte_Erythroid_progenitors', 'CellTypes', 'S_score', 'G2M_score', 'phase' var: 'n_cells', 'mito', 'ribo', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection', 'triku_distance', 'triku_distance_uncorrected', 'triku_highly_variable' uns: 'CellTypes_colors', 'dendrogram_CellTypes', 'dendrogram_leiden', 'dendrogram_specimen', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'phase_colors', 'rank_genes_groups', 'run_id_colors', 'sample_id_colors', 'specimen_colors', 'stage_colors', 'triku_params', 'umap' obsm: 'X_pca', 'X_pca_harmony', 'X_umap' varm: 'PCs' layers: 'raw' obsp: 'connectivities', 'distances'
adata_bulk = dc.get_pseudobulk(adata_sub, sample_col='sample_id', groups_col='CellTypes', layer='raw', min_prop=0, min_smpls = 2)
adata_bulk.layers['raw'] = adata_bulk.X.copy()
adata_bulk
AnnData object with n_obs × n_vars = 5 × 13921 obs: 'sample_id', 'run_id', 'probe_barcode_ids', 'subject', 'line', 'specimen', 'stage', 'condition', 'notes', 'seqRun', 'original_name', 'project', 'who', 'SC_derivation', 'micoplasma', 'mosaic', 'genSite', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'scDblFinder_class', 'CellTypes' layers: 'raw'
I filter out genes based on their expression in each group, based on the function at https://decoupler-py.readthedocs.io/en/latest/generated/decoupler.filter_by_expr.html#decoupler.filter_by_expr
adata_bulk.obs['test'] = np.where(adata_bulk.obs.CellTypes.isin(['hiPSC', 'hEGCLC']), 1, 0)
adata_bulk = adata_bulk[adata_bulk.obs.test == 1]
genes_to_keep = filter_by_expr(adata_bulk, group = 'CellTypes', min_count = 10, min_total_count = 20)
adata_bulk = adata_bulk[:, genes_to_keep]
sc.pp.normalize_total(adata_bulk, target_sum = 1e6)
sc.pp.log1p(adata_bulk)
obs = adata_bulk.obs
adata_bulk
AnnData object with n_obs × n_vars = 5 × 13921 obs: 'sample_id', 'run_id', 'probe_barcode_ids', 'subject', 'line', 'specimen', 'stage', 'condition', 'notes', 'seqRun', 'original_name', 'project', 'who', 'SC_derivation', 'micoplasma', 'mosaic', 'genSite', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'scDblFinder_class', 'CellTypes', 'test' uns: 'log1p' layers: 'raw'
del adata_bulk.obs["notes"]
del adata_bulk.obs["mosaic"]
del adata_bulk.obs["total_counts_ribo"]
del adata_bulk.obs["log1p_total_counts_ribo"]
del adata_bulk.obs["pct_counts_ribo"]
adata_bulk.write('3_Pseudobulk_hiPSC_EG.h5ad')
len(genes_to_keep)
13921
adata_bulk.obs.CellTypes
EG2_hEGCLC hEGCLC EG31_hEGCLC hEGCLC EG9_hEGCLC hEGCLC IEO_13_A_R_hiPSC hiPSC ht15_FIX_hiPSC hiPSC Name: CellTypes, dtype: category Categories (2, object): ['hEGCLC', 'hiPSC']
total = pd.DataFrame(adata_bulk.layers['raw'].T, columns = adata_bulk.obs_names, index = adata_bulk.var_names)
total.index = adata_bulk.var_names
total.columns = adata_bulk.obs_names
total
EG2_hEGCLC | EG31_hEGCLC | EG9_hEGCLC | IEO_13_A_R_hiPSC | ht15_FIX_hiPSC | |
---|---|---|---|---|---|
A2ML1 | 153.0 | 206.0 | 122.0 | 142.0 | 139.0 |
A4GALT | 1347.0 | 1298.0 | 1009.0 | 991.0 | 1108.0 |
AAAS | 5013.0 | 4689.0 | 5185.0 | 7215.0 | 9254.0 |
AACS | 5741.0 | 4033.0 | 6260.0 | 5337.0 | 5726.0 |
AADACL3 | 175.0 | 128.0 | 205.0 | 35.0 | 16.0 |
... | ... | ... | ... | ... | ... |
ZXDC | 2341.0 | 2347.0 | 2445.0 | 2548.0 | 3444.0 |
ZYG11A | 165.0 | 231.0 | 99.0 | 517.0 | 754.0 |
ZYG11B | 2746.0 | 2526.0 | 3095.0 | 2903.0 | 4624.0 |
ZYX | 22777.0 | 16653.0 | 27183.0 | 12780.0 | 17644.0 |
ZZEF1 | 1280.0 | 1224.0 | 1550.0 | 1322.0 | 1814.0 |
13921 rows × 5 columns
ct_map = {'EG2':'hEGCLC',
'EG31':'hEGCLC',
'EG9':'hEGCLC',
'R':'hiPSC',
'FIX':'hiPSC'}
celltypes = total.columns.map(lambda x: x.split('_')[-1])
celltypes
Index(['hEGCLC', 'hEGCLC', 'hEGCLC', 'hiPSC', 'hiPSC'], dtype='object')
obs['CellTypes'] = obs['CellTypes'].astype('str')
for col in obs.columns:
if obs[col].dtype == 'object':
obs[col] = obs[col].astype('str')
sns.heatmap(total.corr(method = 'spearman'), vmin = 0, vmax = 1)
<AxesSubplot:>
%%R -i celltypes -i total
celltypes <- relevel(as.factor(celltypes), ref="hiPSC")
print(celltypes)
print(total[0:10, ])
mm <- model.matrix(~ celltypes, data = total)
colnames(mm)
[1] hEGCLC hEGCLC hEGCLC hiPSC hiPSC Levels: hiPSC hEGCLC EG2_hEGCLC EG31_hEGCLC EG9_hEGCLC IEO_13_A_R_hiPSC ht15_FIX_hiPSC A2ML1 153 206 122 142 139 A4GALT 1347 1298 1009 991 1108 AAAS 5013 4689 5185 7215 9254 AACS 5741 4033 6260 5337 5726 AADACL3 175 128 205 35 16 AADAT 772 563 847 767 1127 AAGAB 6122 4680 6805 5934 7537 AAK1 2643 2105 3045 1678 2407 AAMDC 3207 3509 2640 3747 4253 AAMP 17708 14792 17470 14798 22379 [1] "(Intercept)" "celltypeshEGCLC"
%%R
ncol(mm)
[1] 2
%%R -i total -o dds
row.names(mm) <- colnames(total)
dds <- calcNormFactors(DGEList(total))
dds <- estimateDisp(dds, mm)
dds <- glmFit(dds, mm)
%%R
names(dds)
[1] "coefficients" "fitted.values" "deviance" [4] "method" "counts" "unshrunk.coefficients" [7] "df.residual" "design" "offset" [10] "dispersion" "prior.count" "samples" [13] "prior.df" "AveLogCPM"
n_covariates = 2
%%R -o covar_names
covar_names <- colnames(mm)
table_list = []
for i in range(n_covariates):
deg_table = edgeR_topTags(edgeR_glmLRT(dds, coef=i+1), len(total))
deg_table = as_data_frame(deg_table)
#deg_table = rpy2.robjects.pandas2ri.rpy2py(deg_table)
table_list.append(deg_table)
n_significant = len(deg_table[(deg_table.FDR < 0.01)])
print(covar_names[i], n_significant)
(Intercept) 13921 celltypeshEGCLC 890
Formatting of the table containing all the degs:
deg_table['Gene'] = deg_table.index
deg_table['FDR'] = deg_table['FDR'].replace(0.0, deg_table['FDR'][deg_table['FDR'] != 0.0].min())
deg_table['-log10(FDR)'] = -np.log10(deg_table.FDR)
intTable(deg_table, folder = tables_folder, fileName = 'Bulk_hEGCLCs_vs_hiPSC_total.xlsx', save = True)
allGenes = deg_table.index.tolist()
AllFDRScores = deg_table.FDR.tolist()
Genes are filtered according to:
deg_table_filtered = deg_table[deg_table['FDR'] < 0.01]
deg_table_filtered = deg_table_filtered[np.abs(deg_table_filtered['logFC']) > 2]
len(deg_table_filtered)
116
#sns.color_palette(adata_bulk.uns['CellTypes_colors'])
adata_bulk.uns['CellTypes_colors'] = []
adata_bulk.uns['CellTypes_colors'].append(color_palette['hEGCLC'])
adata_bulk.uns['CellTypes_colors'].append(color_palette['hiPSC'])
sc.pl.heatmap(adata_bulk,
var_names = deg_table_filtered[deg_table_filtered.logFC > 0].index,
groupby = 'CellTypes',
cmap = 'PuBu',
swap_axes = True,
figsize = (5,5), standard_scale = 'var', save='_DEGs_up_hEGCL.pdf')
sc.pl.heatmap(adata_bulk,
var_names = deg_table_filtered[deg_table_filtered.logFC < -2].index,
groupby = 'CellTypes',
cmap = 'PuBu',
swap_axes = True,
figsize = (5,10), standard_scale = 'var', save='_DEGs_up_hiPSC_vs_hEGCL.pdf')
up_reg = deg_table_filtered[deg_table_filtered.logFC > 2]
up_regGenes = up_reg.index.tolist()
print(f'The number of upregulated genes is {len(up_regGenes)}')
The number of upregulated genes is 82
allUpSelected = deg_table.index.isin(up_regGenes).astype('int').tolist()
intTable(up_reg, folder = tables_folder, fileName = 'Bulk_hEGCLCs_vs_hiPSC_upregulated.xlsx', save = True)
down_reg = deg_table_filtered[deg_table_filtered.logFC < 0]
down_regGenes = down_reg.index.tolist()
print(f'The number of downregulated genes is {len(down_regGenes)}')
The number of downregulated genes is 34
intTable(down_reg, folder = tables_folder, fileName = 'Bulk_hEGCLCs_vs_hiPSC_downregulated.xlsx', save = True)
allDownSelected = deg_table.index.isin(down_regGenes).astype('int').tolist()
all_sign = up_regGenes + down_regGenes
allSelected = deg_table.index.isin(all_sign).astype('int').tolist()
print(f'The total number of modulated genes is {len(all_sign)}')
The total number of modulated genes is 116
intTable(deg_table_filtered, folder = tables_folder, fileName = 'Bulk_hEGCLCs_vs_hiPSC_filtered.xlsx', save = True)
df_ordered = pd.DataFrame(adata_bulk[:,up_regGenes].X[[0, 1, 2]]).T
df_ordered.index = adata_bulk[:,up_regGenes].var_names
df_ordered['Median'] = df_ordered.median(axis = 1)
df_ordered= df_ordered.sort_values(by = 'Median')
Scaled by obs
-> for each observation (in each column), each value have the minimum of the column subtracted, then the value is divided by the maximum of the column -> to compare the expression of the genes within a condition
sc.pl.heatmap(adata[adata.obs.CellTypes.isin(['hiPSC', 'hEGCLC'])], var_names = df_ordered.index,
groupby = 'CellTypes', cmap = 'PuBu', swap_axes = True, figsize = (10,10),
standard_scale='obs')
Scaled by var
-> for each gene (in each row), each value have the minimum of the row subtracted, then the value is divided by the maximum of the row -> to compare the expression of the genes across conditions
sc.pl.heatmap(adata[adata.obs.CellTypes.isin(['hiPSC', 'hEGCLC'])], var_names = df_ordered.index,
groupby = 'CellTypes', cmap = 'PuBu', swap_axes = True, figsize = (10,10),
standard_scale='var')
Scaled by obs
-> for each observation (in each column), each value have the minimum of the column subtracted, then the value is divided by the maximum of the column -> to compare the expression of the genes within a condition
sc.pl.heatmap(adata_bulk,
var_names = df_ordered.index,
groupby = 'CellTypes',
cmap = 'PuBu',
swap_axes = True,
figsize = (5,10), standard_scale = 'obs')
Scaled by var
-> for each gene (in each row), each value have the minimum of the row subtracted, then the value is divided by the maximum of the row -> to compare the expression of the genes across conditions
sc.pl.heatmap(adata_bulk,
var_names = df_ordered.index,
groupby = 'CellTypes',
cmap = 'PuBu',
swap_axes = True,
figsize = (5,10), standard_scale = 'var')
df_ordered = pd.DataFrame(adata_bulk[:,down_regGenes].X[[0, 1, 2]]).T
df_ordered.index = adata_bulk[:,down_regGenes].var_names
df_ordered['Median'] = df_ordered.median(axis = 1)
df_ordered= df_ordered.sort_values(by = 'Median')
Scaled by obs
-> for each observation (in each column), each value have the minimum of the column subtracted, then the value is divided by the maximum of the column -> to compare the expression of the genes within a condition
sc.pl.heatmap(adata[adata.obs.CellTypes.isin(['hiPSC', 'hEGCLC'])], var_names = df_ordered.index,
groupby = 'CellTypes', cmap = 'PuBu', swap_axes = True, figsize = (10,10),
standard_scale='obs')
Scaled by var
-> for each gene (in each row), each value have the minimum of the row subtracted, then the value is divided by the maximum of the row -> to compare the expression of the genes across conditions
sc.pl.heatmap(adata[adata.obs.CellTypes.isin(['hiPSC', 'hEGCLC'])], var_names = df_ordered.index,
groupby = 'CellTypes', cmap = 'PuBu', swap_axes = True, figsize = (10,10),
standard_scale='var')
Scaled by obs
-> for each observation (in each column), each value have the minimum of the column subtracted, then the value is divided by the maximum of the column -> to compare the expression of the genes within a condition
sc.pl.heatmap(adata_bulk,
var_names = df_ordered.index,
groupby = 'CellTypes',
cmap = 'PuBu',
swap_axes = True,
figsize = (5,10), standard_scale = 'obs')
Scaled by var
-> for each gene (in each row), each value have the minimum of the row subtracted, then the value is divided by the maximum of the row -> to compare the expression of the genes across conditions
sc.pl.heatmap(adata_bulk,
var_names = df_ordered.index,
groupby = 'CellTypes',
cmap = 'PuBu',
swap_axes = True,
figsize = (5,10), standard_scale = 'var')
print(f'The number of all filtered DEGs in hEGCLC vs hiPSC is {len(deg_table_filtered)}')
print(f'The number of upregulated genes in hEGCLC vs hiPSC is {len(up_regGenes)}')
print(f'The number of downregulated genes in hEGCLC vs hiPSC is {len(down_regGenes)}')
The number of all filtered DEGs in hEGCLC vs hiPSC is 116 The number of upregulated genes in hEGCLC vs hiPSC is 82 The number of downregulated genes in hEGCLC vs hiPSC is 34
%%R -i allUpSelected -i allGenes -i up_regGenes
#allGenes_v <- c(allUpSelected)
allGenes_v <- factor(as.integer(allGenes %in% up_regGenes))
#print(allGenes_v)
names(allGenes_v) <- allGenes
allGenes_v <- unlist(allGenes_v)
geneNames <- c(allGenes)
ann_org_BP <- topGO::annFUN.org(whichOnto='BP', feasibleGenes=names(allGenes_v),
mapping='org.Hs.eg', ID='symbol')
ann_org_MF <- topGO::annFUN.org(whichOnto='MF', feasibleGenes=names(allGenes_v),
mapping='org.Hs.eg', ID='symbol')
ann_org_CC <- topGO::annFUN.org(whichOnto='CC', feasibleGenes=names(allGenes_v),
mapping='org.Hs.eg', ID='symbol')
selection <- function(allScores){return (as.logical(allScores))}
params <- list(GoEnTh = 2, GoPvalTh = 0.01)
len(up_regGenes)
82
up_regGenes[0:10]
['ZNF560', 'TAF9B', 'ZNF662', 'CRISP2', 'NKAPL', 'PTPN20', 'CRYAB', 'HSPB2', 'CTSF', 'CYP4F22']
%%R
print(allGenes_v[0:10])
ZNF560 TAF9B ZNF662 CRISP2 TWIST2 NKAPL PTPN20 HIST1H1D 1 1 1 1 0 1 1 0 CRYAB HSPB2 1 1 Levels: 0 1
::: {.panel-tabset}
%%R -o ResBPAll -o ResSel
ResBPAll <- topGOResults_new(Genes=allGenes_v, GO2genes=ann_org_BP, ontology='BP',
desc=NULL, nodeSize=15, algorithm='weight01', statistic='fisher',
EnTh=params$GoEnTh, PvalTh=params$GoPvalTh, minTerms=15, geneTh=4,
saveRes=FALSE, maxAnn = 500)
ResSel <- ResBPAll$ResSel
%%R
ResBPAll$GOdata
------------------------- topGOdata object ------------------------- Description: - BP allGenes_v Ontology: - BP 13921 available genes (all genes from the array): - symbol: ZNF560 TAF9B ZNF662 CRISP2 TWIST2 ... - 82 significant genes. 12478 feasible genes (genes that can be used in the analysis): - symbol: ZNF560 TAF9B ZNF662 TWIST2 NKAPL ... - 71 significant genes. GO graph (nodes with at least 15 genes): - a graph with directed edges - number of nodes = 4938 - number of edges = 10661 ------------------------- topGOdata object -------------------------
len(up_regGenes)
82
up_regGenes[0:10]
['ZNF560', 'TAF9B', 'ZNF662', 'CRISP2', 'NKAPL', 'PTPN20', 'CRYAB', 'HSPB2', 'CTSF', 'CYP4F22']
%%R
print(allGenes_v[0:10])
ZNF560 TAF9B ZNF662 CRISP2 TWIST2 NKAPL PTPN20 HIST1H1D 1 1 1 1 0 1 1 0 CRYAB HSPB2 1 1 Levels: 0 1
results_table_py = ro.conversion.rpy2py(ResSel)
results_table_py['Statistics'] = results_table_py['Statistics'].astype('float64')
len(results_table_py)
15
results_table_py['-log10(pvalue)'] = - np.log10(results_table_py.Statistics)
results_table_py['Significant/Annotated'] = results_table_py['Significant'] / results_table_py['Annotated']
intTable(results_table_py, folder = tables_folder, fileName = 'GO_upregulated_hEGCLCs_vs_hiPSC_BP.xlsx', save = True)
#| echo: False
default_width = 8
default_height = 4
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics
%%R
image = bubbleplot(ResSel, Ont = 'BP', fillCol = 'red', terms = 5)
ggsave(file="./figures/GO_BP_up_hEGCLC_vs_hiPSC.pdf", plot=image, width=8, height=4)
bubbleplot(ResSel, Ont = 'BP', fillCol = 'red', terms = 5)
#| echo: False
default_width = 12
default_height = 4
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics
%%R -i deg_table_filtered
image = plotGenesInTerm_v3(ResBPAll$ResSel, ResBPAll$GOdata, DEGs = deg_table_filtered, nterms=15, ngenes=12,
fillCol='red')
ggsave(file="./figures/Genes_in_term_BP_up_hEGCLC_vs_hiPSC.pdf", plot=image, width=12, height=4)
plotGenesInTerm_v3(ResBPAll$ResSel, ResBPAll$GOdata, DEGs = deg_table_filtered, nterms=5, ngenes=12,
fillCol='red')
%%R -i tables_folder
saveGenesInTerm(ResBPAll$ResSel, ResBPAll$GOdata, nterms = 20,
path = './tables/genesInTerm_hiPSC_hEGCLCs_GO_BP_uphEGCLC.csv',
)
#| echo: False
default_width = 15
default_height = 9
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics
%%R -o ResMFAll -o ResSel
ResMFAll <- topGOResults_new(Genes=allGenes_v, GO2genes=ann_org_MF, ontology='MF',
desc=NULL, nodeSize=15, algorithm='weight01', statistic='fisher',
EnTh=params$GoEnTh, PvalTh=params$GoPvalTh, minTerms=15, geneTh=4,
saveRes=FALSE, outDir=paste0(OutputFolder), fileName='MFAll', maxAnn = 500)
ResSel <- ResMFAll$ResSel
results_table_py = ro.conversion.rpy2py(ResSel)
results_table_py['Statistics'] = results_table_py['Statistics'].astype('float64')
len(results_table_py)
9
results_table_py['-log10(pvalue)'] = - np.log10(results_table_py.Statistics)
results_table_py['Significant/Annotated'] = results_table_py['Significant'] / results_table_py['Annotated']
intTable(results_table_py, folder = tables_folder , fileName = 'GO_upregulated_hEGCLCs_vs_hiPSC_MF.xlsx', save = True)
%%R
bubbleplot(ResSel, Ont = 'MF', fillCol = 'red')
#| echo: False
default_width = 20
default_height = 7
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics
%%R -i deg_table_filtered
plotGenesInTerm_v3(ResMFAll$ResSel, ResMFAll$GOdata, DEGs = deg_table_filtered, nterms=15, ngenes=12,
fillCol='red')
%%R
saveGenesInTerm(ResMFAll$ResSel, ResMFAll$GOdata, nterms = 20,
path = './tables/genesInTerm_hiPSC_hEGCLCs_GO_MF_uphEGCLC.csv')
#| echo: False
default_width = 15
default_height = 9
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics
%%R -o ResCCAll -o ResSel
ResCCAll <- topGOResults_new(Genes=allGenes_v, GO2genes=ann_org_CC, ontology='CC',
desc=NULL, nodeSize=15, algorithm='weight01', statistic='fisher',
EnTh=params$GoEnTh, PvalTh=params$GoPvalTh, minTerms=15, geneTh=4,
saveRes=FALSE, outDir=paste0(OutputFolder), fileName='CCAll', maxAnn = 500)
ResSel <- ResCCAll$ResSel
results_table_py = ro.conversion.rpy2py(ResSel)
results_table_py['Statistics'] = results_table_py['Statistics'].astype('float64')
len(results_table_py)
6
results_table_py['-log10(pvalue)'] = - np.log10(results_table_py.Statistics)
results_table_py['Significant/Annotated'] = results_table_py['Significant'] / results_table_py['Annotated']
intTable(results_table_py, folder = tables_folder, fileName = 'GO_upregulated_hEGCLCs_vs_hiPSC_CC.xlsx', save = True)
%%R
bubbleplot(ResSel, Ont = 'CC', fillCol = 'red')
#| echo: False
default_width = 20
default_height = 7
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics
%%R -i deg_table_filtered
plotGenesInTerm_v3(ResCCAll$ResSel, ResCCAll$GOdata, DEGs = deg_table_filtered, nterms=15, ngenes=12,
fillCol='red')
%%R
saveGenesInTerm(ResCCAll$ResSel, ResCCAll$GOdata, nterms = 20, path = './tables/genesInTerm_hiPSC_hEGCLCs_GO_CC_uphEGCLC.csv')
#| echo: False
default_width = 15
default_height = 9
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics
msigdb = dc.get_resource('MSigDB')
msigdb
genesymbol | collection | geneset | |
---|---|---|---|
0 | MAFF | chemical_and_genetic_perturbations | BOYAULT_LIVER_CANCER_SUBCLASS_G56_DN |
1 | MAFF | chemical_and_genetic_perturbations | ELVIDGE_HYPOXIA_UP |
2 | MAFF | chemical_and_genetic_perturbations | NUYTTEN_NIPP1_TARGETS_DN |
3 | MAFF | immunesigdb | GSE17721_POLYIC_VS_GARDIQUIMOD_4H_BMDC_DN |
4 | MAFF | chemical_and_genetic_perturbations | SCHAEFFER_PROSTATE_DEVELOPMENT_12HR_UP |
... | ... | ... | ... |
3838543 | PRAMEF22 | go_biological_process | GOBP_POSITIVE_REGULATION_OF_CELL_POPULATION_PR... |
3838544 | PRAMEF22 | go_biological_process | GOBP_APOPTOTIC_PROCESS |
3838545 | PRAMEF22 | go_biological_process | GOBP_REGULATION_OF_CELL_DEATH |
3838546 | PRAMEF22 | go_biological_process | GOBP_NEGATIVE_REGULATION_OF_DEVELOPMENTAL_PROCESS |
3838547 | PRAMEF22 | go_biological_process | GOBP_NEGATIVE_REGULATION_OF_CELL_DEATH |
3838548 rows × 3 columns
curated = msigdb[msigdb['collection'].isin(['reactome_pathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]
aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 500].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()
rank = pd.DataFrame(deg_table_filtered.logFC)
rank = rank[rank.logFC > 2]
rank_copy = rank.copy()
rank_copy['pval'] = deg_table_filtered.loc[rank.index].FDR
rank_copy
logFC | pval | |
---|---|---|
ZNF560 | 12.498048 | 1.275713e-42 |
TAF9B | 5.954427 | 9.861447e-31 |
ZNF662 | 6.313032 | 1.226581e-29 |
CRISP2 | 10.095322 | 1.798185e-27 |
NKAPL | 9.875911 | 2.885773e-26 |
... | ... | ... |
CALHM5 | 2.473018 | 5.583475e-03 |
SCARA5 | 2.112675 | 7.137824e-03 |
MYOF | 2.052413 | 7.162281e-03 |
TM4SF1 | 2.879223 | 8.117776e-03 |
SCIN | 2.122390 | 9.619166e-03 |
82 rows × 2 columns
results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
No significant term was found
intTable(results_table_py, folder = tables_folder , fileName = 'GO_up_hEGCLCs_vs_hiPSC_reactome.xlsx', save = True)
curated = msigdb[msigdb['collection'].isin(['kegg_pathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]
aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 500].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()
results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
No significant term was found
intTable(results_table_py, folder = tables_folder , fileName = 'GO_up_hEGCLCs_vs_hiPSC_kegg.xlsx', save = True)
curated = msigdb[msigdb['collection'].isin(['wikipathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]
aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 500].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()
results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
No significant term was found
intTable(results_table_py, folder = tables_folder , fileName = 'GO_up_hEGCLCs_vs_hiPSC_wikipathways.xlsx', save = True)
:::
%%R -i allDownSelected -i allGenes
allGenes_v <- c(allDownSelected)
#print(allGenes_v)
names(allGenes_v) <- allGenes
allGenes_v <- unlist(allGenes_v)
allGenes_v <- as.factor(allGenes_v)
geneNames <- c(allGenes)
ann_org_BP <- topGO::annFUN.org(whichOnto='BP', feasibleGenes=names(allGenes_v),
mapping='org.Hs.eg', ID='symbol')
ann_org_MF <- topGO::annFUN.org(whichOnto='MF', feasibleGenes=names(allGenes_v),
mapping='org.Hs.eg', ID='symbol')
ann_org_CC <- topGO::annFUN.org(whichOnto='CC', feasibleGenes=names(allGenes_v),
mapping='org.Hs.eg', ID='symbol')
::: {.panel-tabset}
%%R -o ResBPAll -o ResSel
ResBPAll <- topGOResults_new(Genes=allGenes_v, GO2genes=ann_org_BP, ontology='BP',
desc=NULL, nodeSize=15, algorithm='weight01', statistic='fisher',
EnTh=params$GoEnTh, PvalTh=params$GoPvalTh, minTerms=15, geneTh=4,
saveRes=FALSE, outDir=paste0(OutputFolder), fileName='BPAll', maxAnn = 500)
ResSel <- ResBPAll$ResSel
results_table_py = ro.conversion.rpy2py(ResSel)
results_table_py['Statistics'] = results_table_py['Statistics'].astype('float64')
len(results_table_py)
15
results_table_py['-log10(pvalue)'] = - np.log10(results_table_py.Statistics)
results_table_py['Significant/Annotated'] = results_table_py['Significant'] / results_table_py['Annotated']
intTable(results_table_py, folder = tables_folder , fileName = 'GO_downregulated_hEGCLCs_vs_hiPSC_BP.xlsx', save = True)
%%R
bubbleplot(ResSel, Ont = 'BP', fillCol = 'blue')
#| echo: False
default_width = 20
default_height = 7
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics
%%R -i deg_table_filtered
plotGenesInTerm_v3(ResBPAll$ResSel, ResBPAll$GOdata, DEGs = deg_table_filtered, nterms=15, ngenes=12,
fillCol='blue')
%%R
saveGenesInTerm(ResBPAll$ResSel, ResBPAll$GOdata, nterms = 20, path = './tables/genesInTerm_iPSC_hEGCLCs_GO_BP_upEG.csv')
#| echo: False
default_width = 15
default_height = 9
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics
%%R -o ResMFAll -o ResSel
ResMFAll <- topGOResults_new(Genes=allGenes_v, GO2genes=ann_org_MF, ontology='MF',
desc=NULL, nodeSize=15, algorithm='weight01', statistic='fisher',
EnTh=params$GoEnTh, PvalTh=params$GoPvalTh, minTerms=15, geneTh=4,
saveRes=FALSE, outDir=paste0(OutputFolder), fileName='MFAll', maxAnn = 500)
ResSel <- ResMFAll$ResSel
results_table_py = ro.conversion.rpy2py(ResSel)
results_table_py['Statistics'] = results_table_py['Statistics'].astype('float64')
len(results_table_py)
2
results_table_py['-log10(pvalue)'] = - np.log10(results_table_py.Statistics)
results_table_py['Significant/Annotated'] = results_table_py['Significant'] / results_table_py['Annotated']
intTable(results_table_py, folder = tables_folder , fileName = 'GO_downregulated_hEGCLCs_vs_hiPSC_MF.xlsx', save = True)
%%R
bubbleplot(ResSel, Ont = 'MF', fillCol = 'blue')
#| echo: False
default_width = 20
default_height = 7
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics
%%R -i deg_table_filtered
plotGenesInTerm_v3(ResMFAll$ResSel, ResMFAll$GOdata, DEGs = deg_table_filtered, nterms=15, ngenes=12,
fillCol='blue')
%%R
saveGenesInTerm(ResSel, ResMFAll$GOdata, nterms = 20, path = './tables/genesInTerm_iPSC_hEGCLCs_GO_MF_upEG.csv',
)
#| echo: False
default_width = 15
default_height = 9
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics
%%R -o ResCCAll -o ResSel
ResCCAll <- topGOResults_new(Genes=allGenes_v, GO2genes=ann_org_CC, ontology='CC',
desc=NULL, nodeSize=15, algorithm='weight01', statistic='fisher',
EnTh=params$GoEnTh, PvalTh=params$GoPvalTh, minTerms=15, geneTh=4,
saveRes=FALSE, outDir=paste0(OutputFolder), fileName='CCAll', maxAnn = 500)
ResSel <- ResCCAll$ResSel
results_table_py = ro.conversion.rpy2py(ResSel)
results_table_py['Statistics'] = results_table_py['Statistics'].astype('float64')
len(results_table_py)
0
results_table_py['-log10(pvalue)'] = - np.log10(results_table_py.Statistics)
results_table_py['Significant/Annotated'] = results_table_py['Significant'] / results_table_py['Annotated']
intTable(results_table_py, folder = tables_folder , fileName = 'GO_downregulated_hEGCLCs_vs_hiPSC_CC.xlsx', save = True)
curated = msigdb[msigdb['collection'].isin(['reactome_pathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]
aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 500].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()
rank = pd.DataFrame(deg_table_filtered.logFC)
rank = rank[rank.logFC < 0]
rank_copy = rank.copy()
rank_copy['pval'] = deg_table_filtered.loc[rank.index].FDR
results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
No significant term was found
intTable(results_table_py, folder = tables_folder , fileName = 'GO_down_hEGCLCs_vs_hiPSC_reactome.xlsx', save = True)
curated = msigdb[msigdb['collection'].isin(['kegg_pathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]
aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 500].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()
results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
results_table_py
No significant term was found
#results_table_py = getAnnGenes(results = results_table_py, GO2gene = GO2gene['kegg_pathways'], DEGs = rank_copy)
#intTable(results_table_py, folder = tables_folder , fileName = 'GO_down_hEGCLCs_vs_hiPSC_kegg.xlsx', save = True)
#results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
#_, df = plotGenesInTerm(results_table_py, GO2gene['kegg_pathways'], rank_copy, n_top_terms = 10, n_top_genes = 15, cmap = cmap_down)
#intTable(df, folder = tables_folder , fileName = 'genesInTerm_hiPSC_hEGCLCs_kegg_downhEGCLC.csv.xlsx', save = True)
curated = msigdb[msigdb['collection'].isin(['wikipathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]
aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 500].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()
results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
#results_table_py = getAnnGenes(results = results_table_py, GO2gene = GO2gene, DEGs = rank_copy)
No significant term was found
#intTable(results_table_py, folder = tables_folder , fileName = 'GO_down_hEGCLCs_vs_hiPSC_wikipathways.xlsx', save = True)
#results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
#_, df = plotGenesInTerm(results_table_py, GO2gene, rank_copy, n_top_terms = 10, n_top_genes = 15, cmap = cmap_down)
#intTable(df, folder = tables_folder , fileName = 'genesInTerm_hiPSC_hEGCLCs_wikipathways_downhEGCLC.csv.xlsx', save = True)
:::
%%R -i allSelected -i allGenes
allGenes_v <- c(allSelected)
#print(allGenes_v)
names(allGenes_v) <- allGenes
allGenes_v <- unlist(allGenes_v)
allGenes_v <- as.factor(allGenes_v)
geneNames <- c(allGenes)
ann_org_BP <- topGO::annFUN.org(whichOnto='BP', feasibleGenes=names(allGenes_v),
mapping='org.Hs.eg', ID='symbol')
ann_org_MF <- topGO::annFUN.org(whichOnto='MF', feasibleGenes=names(allGenes_v),
mapping='org.Hs.eg', ID='symbol')
ann_org_CC <- topGO::annFUN.org(whichOnto='CC', feasibleGenes=names(allGenes_v),
mapping='org.Hs.eg', ID='symbol')
selection <- function(allScores){return (as.logical(allScores))}
::: {.panel-tabset}
%%R -o ResBPAll -o ResSel
ResBPAll <- topGOResults_new(Genes=allGenes_v, GO2genes=ann_org_BP, ontology='BP',
desc=NULL, nodeSize=15, algorithm='weight01', statistic='fisher',
EnTh=params$GoEnTh, PvalTh=params$GoPvalTh, minTerms=15, geneTh=4,
saveRes=FALSE, outDir=paste0(OutputFolder), fileName='BPAll', maxAnn = 500)
ResSel <- ResBPAll$ResSel
results_table_py = ro.conversion.rpy2py(ResSel)
results_table_py['Statistics'] = results_table_py['Statistics'].astype('float64')
len(results_table_py)
15
results_table_py['-log10(pvalue)'] = - np.log10(results_table_py.Statistics)
results_table_py['Significant/Annotated'] = results_table_py['Significant'] / results_table_py['Annotated']
intTable(results_table_py, folder = tables_folder , fileName = 'GO_allSignificant_hEGCLCs_vs_hiPSC_BP.xlsx', save = True)
%%R
bubbleplot(ResSel, Ont = 'BP', fillCol = 'forestgreen')
#| echo: False
default_width = 20
default_height = 7
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics
%%R -i deg_table_filtered
plotGenesInTerm_v3(ResBPAll$ResSel, ResBPAll$GOdata, DEGs = deg_table_filtered, nterms=15, ngenes=12,
fillCol='forestgreen')
%%R
saveGenesInTerm(ResBPAll$ResSel, ResBPAll$GOdata, nterms = 20, path = './tables/genesInTerm_iPSC_hEGCLCs_GO_BP_allSignificant.csv', SE = deg_table_filtered)
#| echo: False
default_width = 15
default_height = 9
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics
%%R -o ResMFAll -o ResSel
ResMFAll <- topGOResults_new(Genes=allGenes_v, GO2genes=ann_org_MF, ontology='MF',
desc=NULL, nodeSize=15, algorithm='weight01', statistic='fisher',
EnTh=params$GoEnTh, PvalTh=params$GoPvalTh, minTerms=15, geneTh=4,
saveRes=FALSE, outDir=paste0(OutputFolder), fileName='MFAll', maxAnn = 500)
ResSel <- ResMFAll$ResSel
results_table_py = ro.conversion.rpy2py(ResSel)
results_table_py['Statistics'] = results_table_py['Statistics'].astype('float64')
len(results_table_py)
8
results_table_py['-log10(pvalue)'] = - np.log10(results_table_py.Statistics)
results_table_py['Significant/Annotated'] = results_table_py['Significant'] / results_table_py['Annotated']
intTable(results_table_py, folder = tables_folder , fileName = 'GO_allSignificant_hEGCLCs_vs_hiPSC_MF.xlsx', save = True)
%%R
bubbleplot(ResSel, Ont = 'MF', fillCol = 'forestgreen')
#| echo: False
default_width = 20
default_height = 7
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics
%%R -i deg_table_filtered
plotGenesInTerm_v3(ResSel, ResMFAll$GOdata, DEGs = deg_table_filtered, nterms=12, ngenes=12,
fillCol='forestgreen')
%%R
saveGenesInTerm(ResSel, ResMFAll$GOdata, nterms = 20, path = './tables/genesInTerm_iPSC_hEGCLCs_GO_MF_allSignificant.csv')
#| echo: False
default_width = 15
default_height = 9
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics
%%R -o ResCCAll -o ResSel
ResCCAll <- topGOResults_new(Genes=allGenes_v, GO2genes=ann_org_CC, ontology='CC',
desc=NULL, nodeSize=15, algorithm='weight01', statistic='fisher',
EnTh=params$GoEnTh, PvalTh=params$GoPvalTh, minTerms=15, geneTh=4,
saveRes=FALSE, outDir=paste0(OutputFolder), fileName='CCAll', maxAnn = 500)
ResSel <- ResCCAll$ResSel
results_table_py = ro.conversion.rpy2py(ResSel)
results_table_py['Statistics'] = results_table_py['Statistics'].astype('float64')
len(results_table_py)
10
results_table_py['-log10(pvalue)'] = - np.log10(results_table_py.Statistics)
results_table_py['Significant/Annotated'] = results_table_py['Significant'] / results_table_py['Annotated']
intTable(results_table_py, folder = tables_folder , fileName = 'GO_allSignificant_hEGCLCs_vs_hiPSC_BP.xlsx', save = True)
%%R
bubbleplot(ResSel, Ont = 'CC', fillCol = 'forestgreen')
#| echo: False
default_width = 20
default_height = 7
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics
%%R -i deg_table_filtered
plotGenesInTerm_v3(ResCCAll$ResSel, ResCCAll$GOdata, DEGs = deg_table_filtered, nterms=15, ngenes=12,
fillCol='forestgreen')
%%R
saveGenesInTerm(ResCCAll$ResSel, ResCCAll$GOdata, nterms = 20, path = './tables/genesInTerm_iPSC_hEGCLCs_GO_CC_allSignificant.csv', SE = deg_table_filtered)
#| echo: False
default_width = 15
default_height = 9
rpy2.ipython.rmagic.RMagics.setup_graphics = new_setup_graphics
curated = msigdb[msigdb['collection'].isin(['reactome_pathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]
aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 500].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()
rank = pd.DataFrame(deg_table_filtered.logFC)
rank_copy = rank.copy()
rank_copy['pval'] = deg_table_filtered.loc[rank.index].FDR
results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
No significant term was found
intTable(results_table_py, folder = tables_folder , fileName = 'GO_all_hEGCLCs_vs_hiPSC_reactome.xlsx', save = True)
#_, df = plotGenesInTerm(results_table_py, GO2gene['reactome_pathways'], rank_copy, n_top_terms = 10, n_top_genes = 15, cmap = cmap_all)
#intTable(df, folder = tables_folder , fileName = 'genesInTerm_hiPSC_hEGCLCs_Reactome_allSignificant.xlsx', save = True)
curated = msigdb[msigdb['collection'].isin(['kegg_pathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]
aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 500].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()
results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
No significant term was found
intTable(results_table_py, folder = tables_folder , fileName = 'GO_allSignificant_hEGCLCs_vs_hiPSC_kegg.xlsx', save = True)
#_, df = plotGenesInTerm(results_table_py, GO2gene['kegg_pathways'], rank_copy, n_top_terms = 10, n_top_genes = 15, cmap = cmap_all)
#intTable(df, folder = tables_folder , fileName = 'genesInTerm_hiPSC_hEGCLCs_kegg_allSignificant.csv.xlsx', save = True)
curated = msigdb[msigdb['collection'].isin(['wikipathways'])]
curated = curated[~curated.duplicated(['geneset', 'genesymbol'])]
aggregated = curated[["geneset", "genesymbol"]].groupby("geneset").count().rename(columns={"genesymbol": "gene_count"})
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count > 500].index.tolist())].copy()
curated = curated[~curated.geneset.isin(aggregated[aggregated.gene_count < 15].index.tolist())].copy()
results_table_py = run_ora_catchErrors(mat=rank.T, net=curated, source='geneset', target='genesymbol', verbose=False, n_up=len(rank), n_bottom=0)
results_table_py = getAnnGenes(results = results_table_py, GO2gene = GO2gene, DEGs = rank_copy)
results_table_py = results_table_py[results_table_py['pvals'] < 0.05]
results_table_py = results_table_py[results_table_py['n_gene_annotated'] < 500]
results_table_py = results_table_py[results_table_py['n_gene_annotated'] > 15]
results_table_py = results_table_py[results_table_py['n_gene_significant'] > 4]
intTable(results_table_py, folder = tables_folder , fileName = 'GO_allSignificant_hEGCLCs_vs_hiPSC_wikipathways.xlsx', save = True)
_, df = plotGenesInTerm(results_table_py, GO2gene, rank_copy, n_top_terms = 1, n_top_genes = 15, cmap = cmap_all)
intTable(df, folder = tables_folder , fileName = 'genesInTerm_hiPSC_hEGCLCs_wikipathways_allSignificant.csv.xlsx', save = True)
:::