import pandas as pd
import numpy as np
np.random.seed(0)
import sklearn
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as mcolors
# %matplotlib inline
import os
import time
from IPython.core.interactiveshell import InteractiveShell
import scanpy as sc
from collections import Counter
from scipy.stats import spearmanr
from itertools import chain
import celloracle as co
from celloracle import motif_analysis as ma
from celloracle.utility import save_as_pickled_object
co.__version__
InteractiveShell.ast_node_interactivity = "all"
import gc
gc.collect()
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=150, dpi_save = 300, transparent = 'False', color_map = 'viridis')
sns.set_style("whitegrid")#, {"axes.facecolor": "0.93"})
sns.set_palette('Spectral')
sns.set_context('talk')
# visualization settings
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
plt.rcParams['figure.figsize'] = [6, 4.5]
plt.rcParams["savefig.dpi"] = 300
os.chdir('')
save_folder = "figures"
os.makedirs(save_folder, exist_ok=True)
earthy_pal_ordered = ["#2C5241","#528670","#81b29a","#a0c9e2","#789FBD",
"#5f7893","#3d405b","#634A5B","#88535a","#d26559",
"#ef8264","#F1A77A","#f2cc8f","#C8A6A3"]
sns.color_palette(earthy_pal_ordered)
earthy_pal = ["#ef8264", "#3d405b","#81b29a","#f2cc8f","#789FBD",
"#88535a","#C8A6A3","#F1A77A","#528670","#a0c9e2",
"#BB6157","#5f7893","#634A5B","#2C5241"]
sns.color_palette(earthy_pal)
# Works
# scanpy==1.10.1 anndata==0.10.7 umap==0.5.6 numpy==1.26.4 scipy==1.13.0 pandas==1.5.3 scikit-learn==1.4.2 statsmodels==0.14.2 igraph==0.11.4 louvain==0.8.2 pynndescent==0.5.12
Matplotlib is building the font cache; this may take a moment. 2024-06-24 10:50:24.674861: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 2024-06-24 10:51:06.933919: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. 2024-06-24 10:51:33.638026: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
scanpy==1.10.1 anndata==0.10.7 umap==0.5.6 numpy==1.26.4 scipy==1.13.0 pandas==1.5.3 scikit-learn==1.4.2 statsmodels==0.14.2 igraph==0.11.4 louvain==0.8.2 pynndescent==0.5.12
import os
homeDir = os.getenv("HOME")
import yaml
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
with open(homeDir+"/utils/rcParams.yaml") as f:
rcParamsDict = yaml.full_load(f)
for k in rcParamsDict["rcParams"]:
print("{} {}".format(k,rcParamsDict["rcParams"][k]))
plt.rcParams[k] = rcParamsDict["rcParams"][k]
for k1 in set(list(rcParamsDict)).difference(set(["rcParams"])):
print("{} {}".format(k1,rcParamsDict[k1]))
import matplotlib.pyplot as plt
scanpy==1.10.1 anndata==0.10.7 umap==0.5.6 numpy==1.26.4 scipy==1.13.0 pandas==1.5.3 scikit-learn==1.4.2 statsmodels==0.14.2 igraph==0.11.4 louvain==0.8.2 pynndescent==0.5.12 figure.dpi 100 savefig.dpi 300 figure.figsize [4, 4] axes.facecolor None figure.facecolor None font.size 10 image.cmap viridis dotSize 10
outdir = homeDir+"CellOracle"
figdir = "./figures"
resultsdir = "./results"
if not os.path.exists(outdir):
# Create a new directory because it does not exist
os.makedirs(outdir)
if not os.path.exists(outdir+"/h5ad"):
# Create a new directory because it does not exist
os.makedirs(outdir+"/h5ad")
with open(homeDir+"/utils/colorsMap.yaml", 'r') as f:
colorsMap = yaml.load(f, Loader=yaml.FullLoader)
if not os.path.exists(figdir):
# Create a new directory because it does not exist
os.makedirs(figdir)
if not os.path.exists(resultsdir):
# Create a new directory because it does not exist
os.makedirs(resultsdir)
expr_csv = pd.read_csv('2024_05_06_revised.CellTypes.expressed.csv')
sarah_genes_new = pd.read_excel('add_genes_to_GRN.xlsx')
sarah_genes_new_list = sarah_genes_new['Genes'].tolist()
with open('SarahGenes.txt', 'r') as f:
lines_sarah = set(f.readlines())
sarah_genes = pd.DataFrame(lines_sarah)
gene_extend = pd.concat([expr_csv, sarah_genes])
gene_extend_new = pd.concat([gene_extend, sarah_genes_new])
len(gene_extend_new)
14777
adata = sc.read_h5ad('PostPseudoTimeFull.h5ad')
print(f"Cell number is :{adata.shape[0]}")
print(f"Gene number is :{adata.shape[1]}")
# Base GRN
base_GRN = pd.read_parquet("20240124_tfi.celloracle.parquet", engine='pyarrow')
base_GRN.head()
# Check data in anndata
print("Metadata columns :", list(adata.obs.columns))
print("Dimensional reduction: ", list(adata.obsm.keys()))
#OPT : extract hvg
filter_result = sc.pp.filter_genes_dispersion(adata.X, # compute hvgs
flavor='cell_ranger',
n_top_genes=2500,
log=True)
hvgs = list(adata.var_names[filter_result.gene_subset])
# add wanted genes!
hvgs.extend(lines_sarah)
hvgs.extend(sarah_genes_new_list)
hvgs.extend(adata.var[adata.var.highly_variable].index.tolist())
hvgs = pd.Series(hvgs).unique()
len(hvgs)
hvgs = list(set(hvgs).intersection(set(adata.var_names)))
len(hvgs)
adata_raw = adata.raw.to_adata()
#### Subset the genes
adata_raw = adata_raw[:, hvgs].copy()
adata_raw.var_names
gc.collect()
Cell number is :19638 Gene number is :14582
peak_id | gene_short_name | 9430076C15RIK | AC002126.6 | AC012531.1 | AC226150.2 | AFP | AHR | AHRR | AIRE | ... | ZNF784 | ZNF8 | ZNF816 | ZNF85 | ZSCAN10 | ZSCAN16 | ZSCAN22 | ZSCAN26 | ZSCAN31 | ZSCAN4 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | chr10_100027007_100029007 | AVPI1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | chr10_100027007_100029007 | MORN4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | chr10_100027007_100029007 | PI4K2A | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | chr10_100170634_100172634 | HPS1 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | chr10_100170634_100172634 | HPSE2 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 1096 columns
Metadata columns : ['sample_id', 'run_id', 'probe_barcode_ids', 'subject', 'line', 'specimen', 'stage', 'condition', 'notes', 'seqRun', 'original_name', 'project', 'who', 'SC_derivation', 'micoplasma', 'mosaic', 'genSite', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'gene_UMI_ratio', 'log1p_gene_UMI_ratio', 'scDblFinder_score', 'scDblFinder_class', 'n_counts', 'n_genes', 'leiden', 'CellTypes', 'CellTypes_v2', 'S_score', 'G2M_score', 'phase', 'main', 'Pseudotime_main', 'Pseudotime'] Dimensional reduction: ['X_diffmap', 'X_draw_graph_fr', 'X_pca', 'X_pca_harmony', 'X_umap'] extracting highly variable genes finished (0:00:01)
3436
3280
Index(['PYROXD2', 'PAK4', 'PFN2', 'ZNF385D', 'ITGB4', 'ATM', 'C1GALT1C1', 'GUCA1B', 'TMEM51', 'GLYR1', ... 'SUSD2', 'CCNB1', 'CRHBP', 'F2', 'RBMX2', 'PGM5', 'LIPT2', 'CACNA2D2', 'MT2A', 'PPIC'], dtype='object', length=3280)
994
all_dict = {"housekeeping" : ['PSMB2', 'GPI', 'ACTB'],
"pluripotency" : ["SOX2", "KLF4", "KLF5", "SALL4", "LIN28A"],
"proliferation" : ["MKI67"],
"naive" : ["DPPA5", "TFCP2L1", "SUSD5"],
"naive_methylation" : ["DNMT3L"],
"primed_hESC" : ["DUSP6","FAT3","THY1","STC1","KLHL4","ZDHHC22","NEFM","HMX2","PTPRZ1","CYTL1"],
"ICM" : ['ZFP42', 'PRDM14', 'KLF2', 'TCL1B', 'ESRRB'],
"hiPSC_iMeLCs" : ["SOX2", "ZIC2", "ZIC5", "OTX2", "EPHA2", "TUBB2A", "TCF7L1", "SFRP2", "FZD2", "LHX1", 'MESP1', 'MESP2', 'CDH1', 'FGF8', 'FGF2'],
"mesodermal" : [ "HOXB3", "HOXB5", "HOXB6", "BMP4", "CDX2", "HAND1", "MSX1", "TBX3", "GATA2", "GATA3", "GATA6", "ISL1", "PITX2", "TBXT", "MIXL1", "GATA6", "FOXF1", "MESP2", "EOMES", "RUNX2", "EVX1", "SP5", "MSX2", "NODAL", 'LEF1', 'SNAI1', 'SNAI2'],
"PGC" : ["SOX17", "TFAP2C", "NANOS3", "KIT", "ALPL", "SOX15", "PRDM1", "WNT3", "DPPA3", "PRDM14", "TBXT", "KLF4", "TCL1A", "TFCP2L1", "PDPN", "CD38", "ITGA6", "EPCAM"],
"PGC_late" : ['DAZL', "TDRD6", "PIWIL2"],
"methylation" : ['DNMT1', 'DNMT3A', 'DNMT3B', 'PRMT5', 'UHRF1', 'TET1', 'TET2'],
"endoderm" : ["SOX17", "PRDM1", 'FOXA1', "FOXA2"],
"ectoderm" : ['DLX5', 'TFAP2A', 'GATA3'],
"trophoblast_trophoectoderm" : ["TEAD4", "TFAP2C", "GATA3", "GATA2", 'ITGA6', 'ITGA5', "KRT7", "ITGB6", "EPAS1", "TACSTD2", "KLF6", "VGLL1", 'KCNQ2'],
"amnnion" : ['TFAP2A', 'GATA3', 'CDX2', "BMP4"],
"endothelial_progenitors" : ['CD34', "CDH5", "NRP2", "EGFL7", "KDR", "NT5E", "APLNR"],
"early_neural" : ['SOX3']}
sc.tl.dendrogram(adata,'leiden')
sc.pl.dotplot(adata, all_dict , 'leiden', dendrogram=True,save="2024_05_09_markersDotplot_leiden",use_raw=False,log=True,standard_scale='var')
sc.tl.dendrogram(adata,'CellTypes')
sc.pl.dotplot(adata, all_dict , 'CellTypes', dendrogram=True,save="2024_05_09_markersDotplot_CellTypes",use_raw=False,log=True,standard_scale='var')
using 'X_pca' with n_pcs = 50 Storing dendrogram info using `.uns['dendrogram_leiden']` WARNING: Groups are not reordered because the `groupby` categories and the `var_group_labels` are different. categories: 0, 1, 3, etc. var_group_labels: housekeeping, pluripotency, proliferation, etc. WARNING: saving figure to file figures/dotplot_2024_05_09_markersDotplot_leiden.pdf
using 'X_pca' with n_pcs = 50 Storing dendrogram info using `.uns['dendrogram_CellTypes']` WARNING: Groups are not reordered because the `groupby` categories and the `var_group_labels` are different. categories: hEGCLC, hPGCLC, hiPSC, etc. var_group_labels: housekeeping, pluripotency, proliferation, etc. WARNING: saving figure to file figures/dotplot_2024_05_09_markersDotplot_CellTypes.pdf
all_dict_1 = {
"pluripotency" : ["SOX2", "KLF4", "KLF5", "SALL4", "LIN28A"],
"proliferation" : ["MKI67"],
"mesodermal" : [ "HOXB3", "HOXB5", "HOXB6", "BMP4", "CDX2", "HAND1", "MSX1", "TBX3", "GATA2", "GATA3", "GATA6", "ISL1", "PITX2", "TBXT", "MIXL1", "GATA6", "FOXF1", "MESP2", "EOMES", "RUNX2", "EVX1", "SP5", "MSX2", "NODAL", 'LEF1', 'SNAI1', 'SNAI2'],
"PGC" : ["SOX17", "TFAP2C", "NANOS3", "KIT", "ALPL", "SOX15", "PRDM1", "WNT3", "DPPA3", "PRDM14", "TBXT", "KLF4", "TCL1A", "TFCP2L1", "PDPN", "CD38", "ITGA6", "EPCAM"],
"PGC_late" : ['DAZL', "TDRD6", "PIWIL2"],
}
sc.tl.dendrogram(adata,'leiden')
sc.pl.dotplot(adata, all_dict_1 , 'leiden', dendrogram=True,save="2024_05_09_markersDotplot_leiden",use_raw=False,log=True,standard_scale='var')
sc.tl.dendrogram(adata,'CellTypes')
sc.pl.dotplot(adata, all_dict_1 , 'CellTypes', dendrogram=True,save="2024_05_09_markersDotplot_CellTypes",use_raw=False,log=True,standard_scale='var')
using 'X_pca' with n_pcs = 50 Storing dendrogram info using `.uns['dendrogram_leiden']` WARNING: Groups are not reordered because the `groupby` categories and the `var_group_labels` are different. categories: 0, 1, 3, etc. var_group_labels: pluripotency, proliferation, mesodermal, etc. WARNING: saving figure to file figures/dotplot_2024_05_09_markersDotplot_leiden.pdf
using 'X_pca' with n_pcs = 50 Storing dendrogram info using `.uns['dendrogram_CellTypes']` WARNING: Groups are not reordered because the `groupby` categories and the `var_group_labels` are different. categories: hEGCLC, hPGCLC, hiPSC, etc. var_group_labels: pluripotency, proliferation, mesodermal, etc. WARNING: saving figure to file figures/dotplot_2024_05_09_markersDotplot_CellTypes.pdf
all_dict_2 = {
"ID" : ["ID1", "ID2", "ID3", "ID4"],
"SOX" : ["SOX2", "SOX3", "SOX6", "SOX7", "SOX14", "SOX15", "SOX17", "SOX18", "SOX21"]
}
sc.tl.dendrogram(adata,'CellTypes')
sc.pl.dotplot(adata, all_dict_2 , 'CellTypes', dendrogram=True,use_raw=False,log=True,standard_scale='var')
using 'X_pca' with n_pcs = 50 Storing dendrogram info using `.uns['dendrogram_CellTypes']` WARNING: Groups are not reordered because the `groupby` categories and the `var_group_labels` are different. categories: hEGCLC, hPGCLC, hiPSC, etc. var_group_labels: ID, SOX
oracle = co.Oracle()
# In this notebook, we use the unscaled mRNA count for the input of Oracle object.
oracle.import_anndata_as_raw_count(adata=adata_raw,
cluster_column_name='CellTypes',
embedding_name="X_draw_graph_fr")
#oracle.import_anndata_as_normalized_count wants unscaled and uncentered counts
oracle.import_TF_data(TF_info_matrix=base_GRN)
oracle
WARNING: adata.X seems to be already log-transformed.
Oracle object Meta data celloracle version used for instantiation: 0.18.0 n_cells: 19638 n_genes: 3280 cluster_name: CellTypes dimensional_reduction_name: X_draw_graph_fr n_target_genes_in_TFdict: 21595 genes n_regulatory_in_TFdict: 1094 genes n_regulatory_in_both_TFdict_and_scRNA-seq: 255 genes n_target_genes_both_TFdict_and_scRNA-seq: 2897 genes k_for_knn_imputation: NA Status Gene expression matrix: Ready BaseGRN: Ready PCA calculation: Not finished Knn imputation: Not finished GRN calculation for simulation: Not finished
# Perform PCA
oracle.perform_PCA()
# Select important PCs
plt.plot(np.cumsum(oracle.pca.explained_variance_ratio_)[:100], color = earthy_pal[2])
n_comps = np.where(np.diff(np.diff(np.cumsum(oracle.pca.explained_variance_ratio_))>0.002))[0][0]
plt.axvline(n_comps, c="k")
plt.show()
print(n_comps)
n_comps = min(n_comps, 50)
n_cell = oracle.adata.shape[0]
print(f"cell number is :{n_cell}")
k = int(0.025*n_cell)
print(f"Auto-selected k is :{k}")
oracle.knn_imputation(n_pca_dims=n_comps, k=k, balanced=True, b_sight=k*8,
b_maxl=k*4, n_jobs=8)
[<matplotlib.lines.Line2D at 0x1551f865f370>]
<matplotlib.lines.Line2D at 0x15521aa94a00>
16 cell number is :19638 Auto-selected k is :490
oracle.to_hdf5("2024_05_09_PGCLC_CellTypes.celloracle.oracle")
oracle
# Calculate GRN for each cluster.
# This step may take some time.(~30 minutes) white
links = oracle.get_links(cluster_name_for_GRN_unit="CellTypes", alpha=10,
verbose_level=10, )
links.links_dict.keys()
Oracle object Meta data celloracle version used for instantiation: 0.18.0 n_cells: 19638 n_genes: 3280 cluster_name: CellTypes dimensional_reduction_name: X_draw_graph_fr n_target_genes_in_TFdict: 21595 genes n_regulatory_in_TFdict: 1094 genes n_regulatory_in_both_TFdict_and_scRNA-seq: 255 genes n_target_genes_both_TFdict_and_scRNA-seq: 2897 genes k_for_knn_imputation: 490 Status Gene expression matrix: Ready BaseGRN: Ready PCA calculation: Done Knn imputation: Done GRN calculation for simulation: Not finished
0%| | 0/4 [00:00<?, ?it/s]
Inferring GRN for hEGCLC...
0%| | 0/2897 [00:00<?, ?it/s]
Inferring GRN for hPGCLC...
0%| | 0/2897 [00:00<?, ?it/s]
Inferring GRN for hiPSC...
0%| | 0/2897 [00:00<?, ?it/s]
Inferring GRN for iMeLC...
0%| | 0/2897 [00:00<?, ?it/s]
dict_keys(['hEGCLC', 'hPGCLC', 'hiPSC', 'iMeLC'])
links.links_dict.keys()
dict_keys(['hEGCLC', 'hPGCLC', 'hiPSC', 'iMeLC'])
links.to_hdf5(file_path="2024_05_09_PGCLC_CellTypes_1.celloracle.links")
links.links_dict["hEGCLC"]
# Set cluster name
cluster = "hEGCLC"
# Save as csv
links.links_dict[cluster].to_csv("GRN_hEGCLC.csv")
links.links_dict["hPGCLC"]
# Set cluster name
cluster = "hPGCLC"
# Save as csv
links.links_dict[cluster].to_csv("GRN_hPGCLC.csv")
links.links_dict["hiPSC"]
# Set cluster name
cluster = "hiPSC"
# Save as csv
links.links_dict[cluster].to_csv("GRN_hiPSC.csv")
links.links_dict["iMeLC"]
# Set cluster name
cluster = "iMeLC"
# Save as csv
links.links_dict[cluster].to_csv("GRN_iMeLC.csv")
links.links_dict.keys()
links.to_hdf5(file_path="2024_05_09_PGCLC_CellTypes_2.celloracle.links")
source | target | coef_mean | coef_abs | p | -logp | |
---|---|---|---|---|---|---|
0 | E2F2 | A4GALT | -0.000139 | 0.000139 | 8.259854e-01 | 0.083028 |
1 | TFAP2B | A4GALT | 0.000000 | 0.000000 | NaN | -0.000000 |
2 | TBX2 | A4GALT | 0.002105 | 0.002105 | 7.263495e-04 | 3.138854 |
3 | MAF | A4GALT | -0.002493 | 0.002493 | 9.886276e-03 | 2.004967 |
4 | E2F1 | A4GALT | 0.012186 | 0.012186 | 2.743329e-10 | 9.561722 |
... | ... | ... | ... | ... | ... | ... |
350471 | SP3 | ZYG11A | -0.004039 | 0.004039 | 9.052903e-09 | 8.043212 |
350472 | BCL3 | ZYG11A | 0.004329 | 0.004329 | 4.223706e-07 | 6.374306 |
350473 | TP63 | ZYG11A | 0.000412 | 0.000412 | 4.806146e-06 | 5.318203 |
350474 | ZNF816 | ZYG11A | 0.000919 | 0.000919 | 5.322033e-04 | 3.273922 |
350475 | TCF21 | ZYG11A | -0.000537 | 0.000537 | 1.507636e-06 | 5.821704 |
350476 rows × 6 columns
source | target | coef_mean | coef_abs | p | -logp | |
---|---|---|---|---|---|---|
0 | E2F2 | A4GALT | -0.001583 | 0.001583 | 8.030973e-02 | 1.095232 |
1 | TFAP2B | A4GALT | -0.004802 | 0.004802 | 3.048033e-08 | 7.515980 |
2 | TBX2 | A4GALT | 0.000057 | 0.000057 | 7.949898e-01 | 0.099638 |
3 | MAF | A4GALT | -0.003483 | 0.003483 | 2.581541e-04 | 3.588121 |
4 | E2F1 | A4GALT | -0.011982 | 0.011982 | 1.966049e-09 | 8.706406 |
... | ... | ... | ... | ... | ... | ... |
350471 | SP3 | ZYG11A | 0.008460 | 0.008460 | 9.469434e-16 | 15.023676 |
350472 | BCL3 | ZYG11A | 0.004711 | 0.004711 | 9.831462e-06 | 5.007382 |
350473 | TP63 | ZYG11A | 0.000055 | 0.000055 | 7.759801e-01 | 0.110149 |
350474 | ZNF816 | ZYG11A | 0.010861 | 0.010861 | 1.269887e-14 | 13.896235 |
350475 | TCF21 | ZYG11A | 0.000069 | 0.000069 | 6.963938e-01 | 0.157145 |
350476 rows × 6 columns
source | target | coef_mean | coef_abs | p | -logp | |
---|---|---|---|---|---|---|
0 | E2F2 | A4GALT | 0.001962 | 0.001962 | 7.726034e-03 | 2.112043 |
1 | TFAP2B | A4GALT | 0.000000 | 0.000000 | NaN | -0.000000 |
2 | TBX2 | A4GALT | 0.000143 | 0.000143 | 6.566184e-01 | 0.182687 |
3 | MAF | A4GALT | -0.006736 | 0.006736 | 1.348427e-10 | 9.870173 |
4 | E2F1 | A4GALT | 0.006165 | 0.006165 | 1.770085e-10 | 9.752006 |
... | ... | ... | ... | ... | ... | ... |
350471 | SP3 | ZYG11A | -0.007594 | 0.007594 | 3.454300e-09 | 8.461640 |
350472 | BCL3 | ZYG11A | 0.016571 | 0.016571 | 5.683222e-15 | 14.245405 |
350473 | TP63 | ZYG11A | -0.000181 | 0.000181 | 4.617308e-01 | 0.335611 |
350474 | ZNF816 | ZYG11A | -0.003701 | 0.003701 | 3.364209e-07 | 6.473117 |
350475 | TCF21 | ZYG11A | 0.000705 | 0.000705 | 7.361616e-04 | 3.133027 |
350476 rows × 6 columns
source | target | coef_mean | coef_abs | p | -logp | |
---|---|---|---|---|---|---|
0 | E2F2 | A4GALT | 0.005337 | 0.005337 | 6.968550e-08 | 7.156858 |
1 | TFAP2B | A4GALT | 0.000581 | 0.000581 | 7.770968e-02 | 1.109525 |
2 | TBX2 | A4GALT | 0.001598 | 0.001598 | 5.916519e-05 | 4.227934 |
3 | MAF | A4GALT | 0.010437 | 0.010437 | 9.697370e-13 | 12.013346 |
4 | E2F1 | A4GALT | 0.009388 | 0.009388 | 4.290203e-08 | 7.367522 |
... | ... | ... | ... | ... | ... | ... |
350471 | SP3 | ZYG11A | 0.004214 | 0.004214 | 1.057876e-04 | 3.975565 |
350472 | BCL3 | ZYG11A | 0.003921 | 0.003921 | 8.687331e-04 | 3.061114 |
350473 | TP63 | ZYG11A | 0.004672 | 0.004672 | 6.817922e-13 | 12.166348 |
350474 | ZNF816 | ZYG11A | -0.003302 | 0.003302 | 7.424926e-08 | 7.129308 |
350475 | TCF21 | ZYG11A | 0.001390 | 0.001390 | 2.354117e-06 | 5.628172 |
350476 rows × 6 columns
dict_keys(['hEGCLC', 'hPGCLC', 'hiPSC', 'iMeLC'])
links = co.load_hdf5(file_path="2024_05_09_PGCLC_CellTypes_2.celloracle.links")
# filter by pvalue
# filter the top 3000 edges by strength
links.filter_links(p=0.001, weight="coef_abs", threshold_number=3000)
sns.set_context("notebook")
plt.rcParams["figure.figsize"] = [9, 4.5]
links.plot_degree_distributions(plot_model=True,
save=f"{save_folder}/degree_distribution/",
)
# Calculate network scores.
hEGCLC
hPGCLC
hiPSC
iMeLC
links.get_network_score()
links.merged_score.head()
# Save Links object.
links.to_hdf5(file_path="2024_05_09_PGCLC_CellTypes_merged.celloracle.links")
degree_all | degree_centrality_all | degree_in | degree_centrality_in | degree_out | degree_centrality_out | betweenness_centrality | eigenvector_centrality | cluster | |
---|---|---|---|---|---|---|---|---|---|
ZNF165 | 54 | 0.067248 | 1 | 0.001245 | 53 | 0.066002 | 40.0 | 0.208954 | hEGCLC |
HIST1H1C | 51 | 0.063512 | 51 | 0.063512 | 0 | 0.000000 | 0.0 | 0.401413 | hEGCLC |
SOX3 | 29 | 0.036115 | 0 | 0.000000 | 29 | 0.036115 | 0.0 | 0.231489 | hEGCLC |
HIST1H2BH | 49 | 0.061021 | 49 | 0.061021 | 0 | 0.000000 | 0.0 | 0.351283 | hEGCLC |
HDAC2 | 325 | 0.404732 | 3 | 0.003736 | 322 | 0.400996 | 4972.0 | 1.000000 | hEGCLC |
plt.rcParams["figure.figsize"] = [4.5, 8]
sns.set_context("notebook")
# Visualize top n-th genes with high scores.
links.plot_scores_as_rank(cluster="hEGCLC", n_gene=30, save=f"{save_folder}/ranked_score_hEGCLC")
sns.set_context("notebook")
# Visualize top n-th genes with high scores.
links.plot_scores_as_rank(cluster="hPGCLC", n_gene=30, save=f"{save_folder}/ranked_score_hPGCLC")
sns.set_context("notebook")
# Visualize top n-th genes with high scores.
links.plot_scores_as_rank(cluster="hiPSC", n_gene=30, save=f"{save_folder}/ranked_score_hiPSC")
sns.set_context("notebook")
# Visualize top n-th genes with high scores.
links.plot_scores_as_rank(cluster="iMeLC", n_gene=30, save=f"{save_folder}/ranked_score_iMeLC")
plt.rcParams["figure.figsize"] = [8, 4]
# Compare GRN score between two clusters
links.plot_score_comparison_2D(value="eigenvector_centrality",
cluster1="hEGCLC", cluster2="hPGCLC",
percentile=98,
save=f"{save_folder}/score_comparison_hEGCLC_hPGCLC")
plt.rcParams["figure.figsize"] = [8, 4]
# Compare GRN score between two clusters
links.plot_score_comparison_2D(value="eigenvector_centrality",
cluster1="hEGCLC", cluster2="hiPSC",
percentile=98,
save=f"{save_folder}/score_comparison_hEGCLC_hiPSC")
plt.rcParams["figure.figsize"] = [8, 4]
# Compare GRN score between two clusters
links.plot_score_comparison_2D(value="eigenvector_centrality",
cluster1="hEGCLC", cluster2="iMeLC",
percentile=98,
save=f"{save_folder}/score_comparison_hEGCLC_iMeLC")
plt.rcParams["figure.figsize"] = [8, 4]
# Compare GRN score between two clusters
links.plot_score_comparison_2D(value="eigenvector_centrality",
cluster1="hPGCLC", cluster2="hiPSC",
percentile=98,
save=f"{save_folder}/score_comparison_hPGCLC_hiPSC")
plt.rcParams["figure.figsize"] = [8, 4]
# Compare GRN score between two clusters
links.plot_score_comparison_2D(value="eigenvector_centrality",
cluster1="hPGCLC", cluster2="iMeLC",
percentile=98,
save=f"{save_folder}/score_comparison_hPGCLC_iMeLC")
plt.rcParams["figure.figsize"] = [8, 4]
# Compare GRN score between two clusters
links.plot_score_comparison_2D(value="eigenvector_centrality",
cluster1="hiPSC", cluster2="iMeLC",
percentile=98,
save=f"{save_folder}/score_comparison_hiPSC_iMeLC")
links.plot_score_per_cluster(goi="SOX2", save=f"{save_folder}/network_score_per_SOX2/")
links.plot_score_per_cluster(goi="NANOG", save=f"{save_folder}/network_score_per_NANOG/")
links.plot_score_per_cluster(goi="POU5F1", save=f"{save_folder}/network_score_per_POU5F1/")
links.plot_score_per_cluster(goi="EOMES", save=f"{save_folder}/network_score_per_EOMES/")
links.plot_score_per_cluster(goi="SP5", save=f"{save_folder}/network_score_per_SP5/")
links.plot_score_per_cluster(goi="TFAP2C", save=f"{save_folder}/network_score_per_TFAP2C/")
links.plot_score_per_cluster(goi="SOX17", save=f"{save_folder}/network_score_per_SOX17/")
links.plot_score_per_cluster(goi="NANOS3", save=f"{save_folder}/network_score_per_NANOS3/")
links.plot_score_per_cluster(goi="BLIMP1", save=f"{save_folder}/network_score_per_BLIMP1/")
links.plot_score_per_cluster(goi="PIWIL2", save=f"{save_folder}/network_score_per_PIWIL2/")
plt.rcParams["figure.figsize"] = [6, 5]
# Plot degree_centrality
plt.subplots_adjust(left=0.15, bottom=0.3)
plt.ylim([0,0.030])
links.plot_score_discributions(values=["degree_centrality_all"],
method="boxplot",
save=f"{save_folder}",
)
# Plot eigenvector_centrality
plt.subplots_adjust(left=0.15, bottom=0.3)
plt.ylim([0, 0.25])
links.plot_score_discributions(values=["eigenvector_centrality"],
method="boxplot",
save=f"{save_folder}")
plt.subplots_adjust(left=0.15, bottom=0.3)
links.plot_network_entropy_distributions(save=f"{save_folder}")
SOX2
NANOG
POU5F1
EOMES
SP5
TFAP2C
SOX17
NANOS3
BLIMP1
PIWIL2
(0.0, 0.03)
degree_centrality_all
(0.0, 0.25)
eigenvector_centrality
adata.var_names
Index(['SAMD11', 'NOC2L', 'KLHL17', 'PLEKHN1', 'HES4', 'ISG15', 'AGRN', 'C1orf159', 'TTLL10', 'TNFRSF18', ... 'MT-ND2', 'MT-CO2', 'MT-ATP6', 'MT-CO3', 'MT-ND3', 'MT-ND4L', 'MT-ND4', 'MT-ND5', 'MT-ND6', 'MT-CYB'], dtype='object', length=14582)
adata.shape
(19638, 14582)
links.plot_score_per_cluster(goi="WNT7A", save=f"{save_folder}/network_score_per_WNT7A/")
links.plot_score_per_cluster(goi="MSX1", save=f"{save_folder}/network_score_per_MSX1/")
links.plot_score_per_cluster(goi="MSX2", save=f"{save_folder}/network_score_per_MSX2/")
links.plot_score_per_cluster(goi="DNMT3L", save=f"{save_folder}/network_score_per_DNMT3L/")
links.plot_score_per_cluster(goi="GATA3", save=f"{save_folder}/network_score_per_GATA3/")
links.plot_score_per_cluster(goi="GATA2", save=f"{save_folder}/network_score_per_GATA2/")
WNT7A
MSX1
MSX2
DNMT3L
GATA3
GATA2
links.plot_score_per_cluster(goi="PRDM1", save=f"{save_folder}/network_score_per_PRDM1/")
links.plot_score_per_cluster(goi="LSD1", save=f"{save_folder}/network_score_per_LSD1/")
links.plot_score_per_cluster(goi="GTF2I", save=f"{save_folder}/network_score_per_GTF2I/")
links.plot_score_per_cluster(goi="ADNP", save=f"{save_folder}/network_score_per_ADNP/")
PRDM1
LSD1
GTF2I
ADNP