import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import pandas as pd
import numpy as np
import scipy.sparse as sparse
import decoupler as dc
import sys
sys.path.append("./../../../utilities_folder/")
from utilities import CleanList, PlotMarkers, computeGeneScores, convert_to_html, link_table, filter_by_expr, plot_box
Set figure parameters:
%matplotlib inline
sc.set_figure_params(color_map='PuBu', transparent=True, dpi = 600)
plt.rcParams['pdf.fonttype'] = 'truetype'
color_palette = {
'iMeLC': '#377eb8',
'hEGCLC': '#ff7f00',
'hPGCLC': '#4daf4a',
'hiPSC': '#dd1c77',
'HEP': '#a65628',
'MLC': '#f781bf',
'EndLC': '#999999',
'AmLC': '#80b1d3',
}
adata = sc.read("./../01_Annotation/1_All_Annotated_Triku.h5ad")
adata
AnnData object with n_obs × n_vars = 27925 × 14582 obs: 'sample_id', 'run_id', 'probe_barcode_ids', 'subject', 'line', 'specimen', 'stage', 'condition', 'notes', 'seqRun', 'original_name', 'project', 'who', 'SC_derivation', 'micoplasma', 'mosaic', 'genSite', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'gene_UMI_ratio', 'log1p_gene_UMI_ratio', 'scDblFinder_score', 'scDblFinder_class', 'n_counts', 'n_genes', 'leiden', 'score_pluripotency', 'score_mesoderm', 'score_PS', 'score_EM', 'score_AM', 'score_Endo', 'score_Epi', 'score_HEP', 'score_NM', 'score_ExM', 'score_Ery', 'score_EAE', 'score_Amnion_LC', 'score_AxM', 'score_Amnion', 'score_NNE', 'score_PGC', 'score_hPGCLCs', 'score_DE1', 'score_DE2', 'score_Hypoblast', 'score_YS', 'score_EMPs', 'score_Endothelium', 'score_Myeloid_Progenitors', 'score_Megakaryocyte_Erythroid_progenitors', 'CellTypes' var: 'n_cells', 'mito', 'ribo', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection', 'triku_distance', 'triku_distance_uncorrected', 'triku_highly_variable' uns: 'CellTypes_colors', 'dendrogram_CellTypes', 'dendrogram_leiden', 'dendrogram_specimen', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'rank_genes_groups', 'run_id_colors', 'sample_id_colors', 'specimen_colors', 'stage_colors', 'triku_params', 'umap' obsm: 'X_pca', 'X_pca_harmony', 'X_umap' varm: 'PCs' layers: 'raw' obsp: 'connectivities', 'distances'
adata.obs.CellTypes.value_counts()
hEGCLC 7733 MLC 7448 hiPSC 6900 iMeLC 3721 hPGCLC 1284 AmLC 671 EndLC 114 HEP 54 Name: CellTypes, dtype: int64
adata.obs.specimen.value_counts()
hPGCLC aggregate 9571 hEGCLC 7733 hiPSC 6900 iMeLC 3721 Name: specimen, dtype: int64
adata.obs.run_id.value_counts()
G01/ 18340 H01_WA/ 5430 A01/ 4155 Name: run_id, dtype: int64
# Get pseudo-bulk profile
padata = dc.get_pseudobulk(adata, sample_col='sample_id', groups_col='CellTypes', layer='raw', min_prop=0.1, min_smpls=1)
del padata.obs["notes"]
del padata.obs["mosaic"]
del padata.obs["total_counts_ribo"]
del padata.obs["log1p_total_counts_ribo"]
del padata.obs["pct_counts_ribo"]
padata.write("./3_PseudobulkRaw.h5ad")
padata.obs = padata.obs.drop(["genSite", "who", "project"], axis = 1)
padata.obs
sample_id | run_id | probe_barcode_ids | subject | line | specimen | stage | condition | seqRun | original_name | SC_derivation | micoplasma | scDblFinder_class | CellTypes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2EB_AmLC | 2EB | H01_WA/ | BC011 | CTL08 | CTL08A | hPGCLC aggregate | day_6 | CTL | H01 | 2EB | iPSC | NO | singlet | AmLC |
3EB_AmLC | 3EB | H01_WA/ | BC012 | CTL08 | CTL08A | hPGCLC aggregate | day_6 | CTL | H01 | 3EB | iPSC | NO | singlet | AmLC |
5EB_AmLC | 5EB | G01/ | BC012 | CTL08 | CTL08A | hPGCLC aggregate | day_6 | CTL | G01 | 5EB | iPSC | NO | singlet | AmLC |
5EB_EndLC | 5EB | G01/ | BC012 | CTL08 | CTL08A | hPGCLC aggregate | day_6 | CTL | G01 | 5EB | iPSC | NO | singlet | EndLC |
2EB_HEP | 2EB | H01_WA/ | BC011 | CTL08 | CTL08A | hPGCLC aggregate | day_6 | CTL | H01 | 2EB | iPSC | NO | singlet | HEP |
3EB_HEP | 3EB | H01_WA/ | BC012 | CTL08 | CTL08A | hPGCLC aggregate | day_6 | CTL | H01 | 3EB | iPSC | NO | singlet | HEP |
5EB_HEP | 5EB | G01/ | BC012 | CTL08 | CTL08A | hPGCLC aggregate | day_6 | CTL | G01 | 5EB | iPSC | NO | singlet | HEP |
2EB_MLC | 2EB | H01_WA/ | BC011 | CTL08 | CTL08A | hPGCLC aggregate | day_6 | CTL | H01 | 2EB | iPSC | NO | singlet | MLC |
3EB_MLC | 3EB | H01_WA/ | BC012 | CTL08 | CTL08A | hPGCLC aggregate | day_6 | CTL | H01 | 3EB | iPSC | NO | singlet | MLC |
5EB_MLC | 5EB | G01/ | BC012 | CTL08 | CTL08A | hPGCLC aggregate | day_6 | CTL | G01 | 5EB | iPSC | NO | singlet | MLC |
EG2_hEGCLC | EG2 | G01/ | BC014 | CTL08 | CTL08A | hEGCLC | day_0 | CTL | G01 | EG2 | iPSC | NO | singlet | hEGCLC |
EG31_hEGCLC | EG31 | G01/ | BC016 | CTL08 | CTL08A | hEGCLC | day_0 | CTL | G01 | EG31 | iPSC | NO | singlet | hEGCLC |
EG9_hEGCLC | EG9 | G01/ | BC015 | CTL08 | CTL08A | hEGCLC | day_0 | CTL | G01 | EG9 | iPSC | NO | singlet | hEGCLC |
2EB_hPGCLC | 2EB | H01_WA/ | BC011 | CTL08 | CTL08A | hPGCLC aggregate | day_6 | CTL | H01 | 2EB | iPSC | NO | singlet | hPGCLC |
3EB_hPGCLC | 3EB | H01_WA/ | BC012 | CTL08 | CTL08A | hPGCLC aggregate | day_6 | CTL | H01 | 3EB | iPSC | NO | singlet | hPGCLC |
5EB_hPGCLC | 5EB | G01/ | BC012 | CTL08 | CTL08A | hPGCLC aggregate | day_6 | CTL | G01 | 5EB | iPSC | NO | singlet | hPGCLC |
IEO_13_A_R_hiPSC | IEO_13_A_R | G01/ | BC011 | CTL08 | CTL08A | hiPSC | day_0 | CTL | G01 | IEO.13.A.R | iPSC | NO | singlet | hiPSC |
ht15_FIX_hiPSC | ht15_FIX | A01/ | BC014 | CTL08 | CTL08A | hiPSC | day_0 | CTL | A01 | 15 FIX | iPSC | NO | singlet | hiPSC |
5IM_iMeLC | 5IM | G01/ | BC013 | CTL08 | CTL08A | iMeLC | iMeLCs | CTL | G01 | 5IM | iPSC | NO | singlet | iMeLC |
# Find mito and ribo genes
mito_genes = padata.var_names.str.startswith("MT-")
# qc_vars wants a column of adataata.var containing T/F or 1/0 indicating the genes to be selected for sub-statistics
padata.var["mito"] = padata.var_names.str.startswith("MT-")
# Compute metrics (inplace=True to append to adata)
sc.pp.calculate_qc_metrics(
padata, log1p=True, qc_vars=["mito"], inplace=True, percent_top=None
)
n_cells = adata.obs[['CellTypes', 'sample_id']].value_counts().reset_index()
n_cells
CellTypes | sample_id | 0 | |
---|---|---|---|
0 | hiPSC | ht15_FIX | 4155 |
1 | iMeLC | 5IM | 3721 |
2 | MLC | 5EB | 3504 |
3 | hiPSC | IEO_13_A_R | 2745 |
4 | hEGCLC | EG9 | 2669 |
5 | hEGCLC | EG2 | 2544 |
6 | hEGCLC | EG31 | 2520 |
7 | MLC | 3EB | 2023 |
8 | MLC | 2EB | 1921 |
9 | hPGCLC | 2EB | 867 |
10 | AmLC | 5EB | 291 |
11 | AmLC | 2EB | 270 |
12 | hPGCLC | 5EB | 216 |
13 | hPGCLC | 3EB | 201 |
14 | AmLC | 3EB | 110 |
15 | EndLC | 5EB | 101 |
16 | HEP | 5EB | 29 |
17 | HEP | 2EB | 14 |
18 | HEP | 3EB | 11 |
19 | EndLC | 3EB | 9 |
20 | EndLC | 2EB | 4 |
padata.obs["lib_size"] = np.sum(padata.X, axis=1)
padata.obs["log_lib_size"] = np.log(padata.obs["lib_size"])
for ct in padata.obs.CellTypes.unique():
sub = n_cells[n_cells.CellTypes == ct]
sub.sample_id = sub.sample_id.astype('str')
sns.barplot(y = sub[0], x = sub.sample_id )
plt.title(ct)
plt.show()
genes_to_keep = filter_by_expr(padata, group = 'CellTypes', min_count = 10, min_total_count = 200, min_prop=0)
padata = padata[:, genes_to_keep]
padata.layers['raw'] = padata.X
sc.pp.normalize_total(padata, target_sum = 1e6)
sc.pp.log1p(padata)
obs = padata.obs
padata
AnnData object with n_obs × n_vars = 19 × 12733 obs: 'sample_id', 'run_id', 'probe_barcode_ids', 'subject', 'line', 'specimen', 'stage', 'condition', 'seqRun', 'original_name', 'SC_derivation', 'micoplasma', 'scDblFinder_class', 'CellTypes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'lib_size', 'log_lib_size' var: 'mito', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts' uns: 'log1p' layers: 'raw'
sc.tl.pca(padata)
sc.pl.pca(padata, color=padata.obs, ncols=2, size=300, components = ['1,2', '2,3'], wspace = 0.4)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) /usr/local/lib/python3.8/dist-packages/IPython/core/formatters.py in __call__(self, obj) 339 pass 340 else: --> 341 return printer(obj) 342 # Finally look for special method names 343 method = get_real_method(obj, self.print_method) /usr/local/lib/python3.8/dist-packages/IPython/core/pylabtools.py in <lambda>(fig) 253 png_formatter.for_type(Figure, lambda fig: print_figure(fig, 'png', **kwargs)) 254 if 'retina' in formats or 'png2x' in formats: --> 255 png_formatter.for_type(Figure, lambda fig: retina_figure(fig, **kwargs)) 256 if 'jpg' in formats or 'jpeg' in formats: 257 jpg_formatter.for_type(Figure, lambda fig: print_figure(fig, 'jpg', **kwargs)) /usr/local/lib/python3.8/dist-packages/IPython/core/pylabtools.py in retina_figure(fig, **kwargs) 143 def retina_figure(fig, **kwargs): 144 """format a figure as a pixel-doubled (retina) PNG""" --> 145 pngdata = print_figure(fig, fmt='retina', **kwargs) 146 # Make sure that retina_figure acts just like print_figure and returns 147 # None when the figure is empty. /usr/local/lib/python3.8/dist-packages/IPython/core/pylabtools.py in print_figure(fig, fmt, bbox_inches, **kwargs) 135 FigureCanvasBase(fig) 136 --> 137 fig.canvas.print_figure(bytes_io, **kw) 138 data = bytes_io.getvalue() 139 if fmt == 'svg': /usr/local/lib/python3.8/dist-packages/matplotlib/backend_bases.py in print_figure(self, filename, dpi, facecolor, edgecolor, orientation, format, bbox_inches, pad_inches, bbox_extra_artists, backend, **kwargs) 2219 # CL works. "tight" also needs a draw to get the right 2220 # locations: -> 2221 renderer = _get_renderer( 2222 self.figure, 2223 functools.partial( /usr/local/lib/python3.8/dist-packages/matplotlib/backend_bases.py in _get_renderer(figure, print_method) 1574 figure.canvas._get_output_canvas(None, fmt), f"print_{fmt}") 1575 try: -> 1576 print_method(io.BytesIO()) 1577 except Done as exc: 1578 renderer, = figure._cachedRenderer, = exc.args /usr/local/lib/python3.8/dist-packages/matplotlib/backend_bases.py in wrapper(*args, **kwargs) 1667 kwargs.pop(arg) 1668 -> 1669 return func(*args, **kwargs) 1670 1671 return wrapper /usr/local/lib/python3.8/dist-packages/matplotlib/backends/backend_agg.py in print_png(self, filename_or_obj, metadata, pil_kwargs, *args) 506 *metadata*, including the default 'Software' key. 507 """ --> 508 FigureCanvasAgg.draw(self) 509 mpl.image.imsave( 510 filename_or_obj, self.buffer_rgba(), format="png", origin="upper", /usr/local/lib/python3.8/dist-packages/matplotlib/backends/backend_agg.py in draw(self) 399 def draw(self): 400 # docstring inherited --> 401 self.renderer = self.get_renderer(cleared=True) 402 # Acquire a lock on the shared font cache. 403 with RendererAgg.lock, \ /usr/local/lib/python3.8/dist-packages/matplotlib/backends/backend_agg.py in get_renderer(self, cleared) 415 and getattr(self, "_lastKey", None) == key) 416 if not reuse_renderer: --> 417 self.renderer = RendererAgg(w, h, self.figure.dpi) 418 self._lastKey = key 419 elif cleared: /usr/local/lib/python3.8/dist-packages/matplotlib/backends/backend_agg.py in __init__(self, width, height, dpi) 89 self.width = width 90 self.height = height ---> 91 self._renderer = _RendererAgg(int(width), int(height), dpi) 92 self._filter_renderers = [] 93 ValueError: Image size of 13440x110400 pixels is too large. It must be less than 2^16 in each direction.
<Figure size 6720x55200 with 64 Axes>
sc.settings._vector_friendly=False
sc.pl.pca(padata, color=['CellTypes'], size=300, palette=color_palette, save = '_pseudobulk.pdf')
sc.settings._vector_friendly=True
WARNING: saving figure to file figures/pca_pseudobulk.pdf
padata.write( './3_Pseudobulk_total.h5ad')
padata.layers['normalized'] = padata.X
padata.X = padata.layers['raw']
padata
AnnData object with n_obs × n_vars = 19 × 12733 obs: 'sample_id', 'run_id', 'probe_barcode_ids', 'subject', 'line', 'specimen', 'stage', 'condition', 'seqRun', 'original_name', 'SC_derivation', 'micoplasma', 'scDblFinder_class', 'CellTypes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'lib_size', 'log_lib_size' var: 'mito', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts' uns: 'log1p', 'pca', 'sample_id_colors', 'run_id_colors', 'probe_barcode_ids_colors', 'subject_colors', 'line_colors', 'specimen_colors', 'stage_colors', 'condition_colors', 'seqRun_colors', 'original_name_colors', 'SC_derivation_colors', 'micoplasma_colors', 'scDblFinder_class_colors', 'CellTypes_colors' obsm: 'X_pca' varm: 'PCs' layers: 'raw', 'normalized'
total = pd.DataFrame(padata.layers['normalized'].T, columns = padata.obs_names, index = padata.var_names)
total.index = padata.var_names
total.columns = padata.obs_names
sns.clustermap(total.corr(method = 'spearman'), vmin = 0, vmax = 1, cmap = 'PuBu')
<seaborn.matrix.ClusterGrid at 0x7f4e785a8700>
spearman_corr = total.corr(method='spearman')
cbar_kws = {"shrink": 0.75, "label": "Spearman Correlation coefficient"}
plt.tight_layout(pad=0.7, w_pad=2, h_pad=4)
clustermap = sns.clustermap(spearman_corr, vmin=0, vmax=1, cmap='PuBu', annot=False, fmt=".2f",
cbar_kws=cbar_kws, figsize=(20, 20))
clustermap.fig.suptitle('Spearman Correlation Coefficient between Cell Types', fontsize=30, y=1.01)
plt.savefig('./figures/Spearman_corr_celltypes.pdf')
plt.show()
<Figure size 2400x2400 with 0 Axes>
For fig. 2B and 2L of the paper
padata.X = padata.layers['normalized']
padata.X = sparse.csr_matrix(padata.X)
plot_box(
padata,
['NANOS3', 'SOX17', 'CD38', 'TFAP2C', 'DNMT3L'],
"CellTypes",
order = ['hiPSC', 'iMeLC', 'hPGCLC', 'hEGCLC'],
ncols=5,
row_height = 4,
save = './figures/boxplots_hPGCLC_up_markers.pdf'
)
plot_box(
padata,
['SOX2', 'SOX3', 'ZIC5', 'SALL1', 'DNMT3B'],
"CellTypes",
order = ['hiPSC', 'iMeLC', 'hPGCLC', 'hEGCLC'],
ncols=5,
row_height = 4,
save = './figures/boxplots_hPGCLC_down_markers.pdf'
)
plot_box(
padata,
['PIWIL2'],
"CellTypes",
order = ['hiPSC', 'iMeLC', 'hPGCLC', 'hEGCLC'],
ncols=4,
row_height = 4,
save = './figures/boxplots_PIWIL2.pdf'
)
For Fig. 4G and Supp. Fig. 4B:
selected_cell_types = ["hiPSC", "iMeLC", "hPGCLC", "hEGCLC"]
selected = padata[padata.obs['CellTypes'].isin(selected_cell_types)].copy()
selected.obs['CellTypes'] = selected.obs['CellTypes'].astype('category')
selected.obs['CellTypes'].cat.reorder_categories(selected_cell_types, inplace=True)
TF_DM_DE = ['ELF3', 'ELK3', 'FOXP1', 'GATA3', 'LMO2', 'MSX2',
'OVOL2', 'PRRX2', 'RARB', 'TCF7', 'ZIC5']
meth = ["DNMT3A", "DNMT3B", "DNMT1", "UHRF1", "DNMT3L", "TET1", "TET2", "TET3", "ZFP57", "ZNF445"]
TF_DM_DE = CleanList(TF_DM_DE, padata)
meth = CleanList(meth, padata)
ax = sc.pl.heatmap(
selected,
TF_DM_DE,
groupby = 'CellTypes',
cmap = 'PuBu',
swap_axes = True,
figsize = (3,4),
standard_scale = 'var', show = False
)
plt.savefig('TF_DM_DE_heatmap.pdf', dpi=600, bbox_inches = 'tight')
plt.show()
ax = sc.pl.heatmap(
selected,
meth,
groupby = 'CellTypes',
cmap = 'PuBu',
swap_axes = True,
figsize = (3,4),
standard_scale = 'var', show = False
)
plt.savefig('meth_heatmap.pdf', dpi=600, bbox_inches = 'tight')
plt.show()
padata_sub = padata[padata.obs.specimen.isin(['hEGCLC', 'hiPSC']), :].copy()
padata_sub.X = padata_sub.layers['raw'].copy()
sc.pp.normalize_total(padata_sub, target_sum=1e6)
sc.pp.log1p(padata_sub)
sc.tl.pca(padata_sub)
WARNING: adata.X seems to be already log-transformed.
sc.pl.pca(padata_sub, color=['CellTypes'], ncols=2, size=300, components = ['1,2', '2,3'], wspace = 0.4)
sc.pl.pca(padata_sub, color=['specimen'], ncols=2, size=300, components = ['1,2', '2,3', '3,4'], wspace = 0.4)
sc.pl.pca_loadings(padata_sub)
sc.pl.pca_variance_ratio(padata_sub)
Genes more positively associated with hEGCLCs:
pcs_genes = pd.DataFrame(padata_sub.varm['PCs'][:,0])
pcs_genes.index = padata_sub.var_names
pcs_genes.columns = ['PC1']
pcs_genes.sort_values(by = 'PC1', ascending = True).head(20)
PC1 | |
---|---|
SERPINE1 | -0.065397 |
PIWIL2 | -0.059823 |
MAGEA4 | -0.059599 |
PPP1R17 | -0.057174 |
TMSB4Y | -0.056529 |
ZFP57 | -0.055025 |
RBM46 | -0.054333 |
MT1H | -0.053016 |
ZBED6CL | -0.052213 |
CRYAB | -0.051795 |
DPF1 | -0.051680 |
BTBD17 | -0.050979 |
SPIB | -0.050698 |
HSPB2 | -0.048831 |
INPP5J | -0.048220 |
GNG3 | -0.048191 |
CACNB3 | -0.048130 |
AHNAK | -0.047788 |
UTP14A | -0.047682 |
NTNG1 | -0.047636 |