import warnings

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import pandas as pd
import numpy as np
import scipy.sparse as sparse

import decoupler as dc
import sys

sys.path.append("./../../../utilities_folder/")

from utilities import CleanList, PlotMarkers, computeGeneScores, convert_to_html, link_table, filter_by_expr, plot_box


%matplotlib inline
sc.set_figure_params(color_map='PuBu', transparent=True, dpi = 600)

plt.rcParams['pdf.fonttype'] = 'truetype'

color_palette = {
    'iMeLC': '#377eb8',
    'hEGCLC': '#ff7f00',
    'hPGCLC': '#4daf4a',
    'hiPSC': '#dd1c77',
    'HEP': '#a65628',
    'MLC': '#f781bf',
    'EndLC': '#999999',
     'AmLC': '#80b1d3',
}


adata = sc.read("./../01_Annotation/1_All_Annotated_Triku.h5ad")
adata

AnnData object with n_obs × n_vars = 27925 × 14582
    obs: 'sample_id', 'run_id', 'probe_barcode_ids', 'subject', 'line', 'specimen', 'stage', 'condition', 'notes', 'seqRun', 'original_name', 'project', 'who', 'SC_derivation', 'micoplasma', 'mosaic', 'genSite', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'gene_UMI_ratio', 'log1p_gene_UMI_ratio', 'scDblFinder_score', 'scDblFinder_class', 'n_counts', 'n_genes', 'leiden', 'score_pluripotency', 'score_mesoderm', 'score_PS', 'score_EM', 'score_AM', 'score_Endo', 'score_Epi', 'score_HEP', 'score_NM', 'score_ExM', 'score_Ery', 'score_EAE', 'score_Amnion_LC', 'score_AxM', 'score_Amnion', 'score_NNE', 'score_PGC', 'score_hPGCLCs', 'score_DE1', 'score_DE2', 'score_Hypoblast', 'score_YS', 'score_EMPs', 'score_Endothelium', 'score_Myeloid_Progenitors', 'score_Megakaryocyte_Erythroid_progenitors', 'CellTypes'
    var: 'n_cells', 'mito', 'ribo', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection', 'triku_distance', 'triku_distance_uncorrected', 'triku_highly_variable'
    uns: 'CellTypes_colors', 'dendrogram_CellTypes', 'dendrogram_leiden', 'dendrogram_specimen', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'rank_genes_groups', 'run_id_colors', 'sample_id_colors', 'specimen_colors', 'stage_colors', 'triku_params', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    varm: 'PCs'
    layers: 'raw'
    obsp: 'connectivities', 'distances'


adata.obs.CellTypes.value_counts()

hEGCLC    7733
MLC       7448
hiPSC     6900
iMeLC     3721
hPGCLC    1284
AmLC       671
EndLC      114
HEP         54
Name: CellTypes, dtype: int64


adata.obs.specimen.value_counts()

hPGCLC aggregate    9571
hEGCLC              7733
hiPSC               6900
iMeLC               3721
Name: specimen, dtype: int64


adata.obs.run_id.value_counts()

G01/       18340
H01_WA/     5430
A01/        4155
Name: run_id, dtype: int64


# Get pseudo-bulk profile
padata = dc.get_pseudobulk(adata, sample_col='sample_id', groups_col='CellTypes', layer='raw', min_prop=0.1, min_smpls=1)


del padata.obs["notes"]
del padata.obs["mosaic"]
del padata.obs["total_counts_ribo"]
del padata.obs["log1p_total_counts_ribo"]
del padata.obs["pct_counts_ribo"]
padata.write("./3_PseudobulkRaw.h5ad")


padata.obs = padata.obs.drop(["genSite", "who", "project"], axis = 1)
padata.obs


# Find mito and ribo genes
mito_genes = padata.var_names.str.startswith("MT-")

# qc_vars wants a column of adataata.var containing T/F or 1/0 indicating the genes to be selected for sub-statistics
padata.var["mito"] = padata.var_names.str.startswith("MT-")

# Compute metrics (inplace=True to append to adata)
sc.pp.calculate_qc_metrics(
    padata, log1p=True, qc_vars=["mito"], inplace=True, percent_top=None
)


n_cells = adata.obs[['CellTypes', 'sample_id']].value_counts().reset_index()
n_cells


padata.obs["lib_size"] = np.sum(padata.X, axis=1)
padata.obs["log_lib_size"] = np.log(padata.obs["lib_size"])


for ct in padata.obs.CellTypes.unique():
    sub = n_cells[n_cells.CellTypes == ct]
    sub.sample_id = sub.sample_id.astype('str')
    sns.barplot(y = sub[0], x = sub.sample_id )
    plt.title(ct)
    plt.show()


genes_to_keep = filter_by_expr(padata, group = 'CellTypes', min_count = 10, min_total_count = 200, min_prop=0)
padata = padata[:, genes_to_keep]

padata.layers['raw'] = padata.X

sc.pp.normalize_total(padata, target_sum = 1e6)
sc.pp.log1p(padata)

obs = padata.obs


padata

AnnData object with n_obs × n_vars = 19 × 12733
    obs: 'sample_id', 'run_id', 'probe_barcode_ids', 'subject', 'line', 'specimen', 'stage', 'condition', 'seqRun', 'original_name', 'SC_derivation', 'micoplasma', 'scDblFinder_class', 'CellTypes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'lib_size', 'log_lib_size'
    var: 'mito', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    uns: 'log1p'
    layers: 'raw'


sc.tl.pca(padata)


sc.pl.pca(padata, color=padata.obs, ncols=2, size=300, components = ['1,2', '2,3'], wspace = 0.4)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/usr/local/lib/python3.8/dist-packages/IPython/core/formatters.py in __call__(self, obj)
    339                 pass
    340             else:
--> 341                 return printer(obj)
    342             # Finally look for special method names
    343             method = get_real_method(obj, self.print_method)

/usr/local/lib/python3.8/dist-packages/IPython/core/pylabtools.py in <lambda>(fig)
    253         png_formatter.for_type(Figure, lambda fig: print_figure(fig, 'png', **kwargs))
    254     if 'retina' in formats or 'png2x' in formats:
--> 255         png_formatter.for_type(Figure, lambda fig: retina_figure(fig, **kwargs))
    256     if 'jpg' in formats or 'jpeg' in formats:
    257         jpg_formatter.for_type(Figure, lambda fig: print_figure(fig, 'jpg', **kwargs))

/usr/local/lib/python3.8/dist-packages/IPython/core/pylabtools.py in retina_figure(fig, **kwargs)
    143 def retina_figure(fig, **kwargs):
    144     """format a figure as a pixel-doubled (retina) PNG"""
--> 145     pngdata = print_figure(fig, fmt='retina', **kwargs)
    146     # Make sure that retina_figure acts just like print_figure and returns
    147     # None when the figure is empty.

/usr/local/lib/python3.8/dist-packages/IPython/core/pylabtools.py in print_figure(fig, fmt, bbox_inches, **kwargs)
    135         FigureCanvasBase(fig)
    136 
--> 137     fig.canvas.print_figure(bytes_io, **kw)
    138     data = bytes_io.getvalue()
    139     if fmt == 'svg':

/usr/local/lib/python3.8/dist-packages/matplotlib/backend_bases.py in print_figure(self, filename, dpi, facecolor, edgecolor, orientation, format, bbox_inches, pad_inches, bbox_extra_artists, backend, **kwargs)
   2219                 # CL works.  "tight" also needs a draw to get the right
   2220                 # locations:
-> 2221                 renderer = _get_renderer(
   2222                     self.figure,
   2223                     functools.partial(

/usr/local/lib/python3.8/dist-packages/matplotlib/backend_bases.py in _get_renderer(figure, print_method)
   1574                 figure.canvas._get_output_canvas(None, fmt), f"print_{fmt}")
   1575         try:
-> 1576             print_method(io.BytesIO())
   1577         except Done as exc:
   1578             renderer, = figure._cachedRenderer, = exc.args

/usr/local/lib/python3.8/dist-packages/matplotlib/backend_bases.py in wrapper(*args, **kwargs)
   1667             kwargs.pop(arg)
   1668 
-> 1669         return func(*args, **kwargs)
   1670 
   1671     return wrapper

/usr/local/lib/python3.8/dist-packages/matplotlib/backends/backend_agg.py in print_png(self, filename_or_obj, metadata, pil_kwargs, *args)
    506             *metadata*, including the default 'Software' key.
    507         """
--> 508         FigureCanvasAgg.draw(self)
    509         mpl.image.imsave(
    510             filename_or_obj, self.buffer_rgba(), format="png", origin="upper",

/usr/local/lib/python3.8/dist-packages/matplotlib/backends/backend_agg.py in draw(self)
    399     def draw(self):
    400         # docstring inherited
--> 401         self.renderer = self.get_renderer(cleared=True)
    402         # Acquire a lock on the shared font cache.
    403         with RendererAgg.lock, \

/usr/local/lib/python3.8/dist-packages/matplotlib/backends/backend_agg.py in get_renderer(self, cleared)
    415                           and getattr(self, "_lastKey", None) == key)
    416         if not reuse_renderer:
--> 417             self.renderer = RendererAgg(w, h, self.figure.dpi)
    418             self._lastKey = key
    419         elif cleared:

/usr/local/lib/python3.8/dist-packages/matplotlib/backends/backend_agg.py in __init__(self, width, height, dpi)
     89         self.width = width
     90         self.height = height
---> 91         self._renderer = _RendererAgg(int(width), int(height), dpi)
     92         self._filter_renderers = []
     93 

ValueError: Image size of 13440x110400 pixels is too large. It must be less than 2^16 in each direction.

<Figure size 6720x55200 with 64 Axes>


sc.settings._vector_friendly=False
sc.pl.pca(padata, color=['CellTypes'], size=300, palette=color_palette, save = '_pseudobulk.pdf')
sc.settings._vector_friendly=True

WARNING: saving figure to file figures/pca_pseudobulk.pdf


padata.write( './3_Pseudobulk_total.h5ad')


padata.layers['normalized'] = padata.X
padata.X = padata.layers['raw']


padata

AnnData object with n_obs × n_vars = 19 × 12733
    obs: 'sample_id', 'run_id', 'probe_barcode_ids', 'subject', 'line', 'specimen', 'stage', 'condition', 'seqRun', 'original_name', 'SC_derivation', 'micoplasma', 'scDblFinder_class', 'CellTypes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'lib_size', 'log_lib_size'
    var: 'mito', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    uns: 'log1p', 'pca', 'sample_id_colors', 'run_id_colors', 'probe_barcode_ids_colors', 'subject_colors', 'line_colors', 'specimen_colors', 'stage_colors', 'condition_colors', 'seqRun_colors', 'original_name_colors', 'SC_derivation_colors', 'micoplasma_colors', 'scDblFinder_class_colors', 'CellTypes_colors'
    obsm: 'X_pca'
    varm: 'PCs'
    layers: 'raw', 'normalized'


total = pd.DataFrame(padata.layers['normalized'].T, columns = padata.obs_names, index = padata.var_names)

total.index = padata.var_names
total.columns = padata.obs_names


sns.clustermap(total.corr(method = 'spearman'), vmin = 0, vmax = 1, cmap = 'PuBu')

<seaborn.matrix.ClusterGrid at 0x7f4e785a8700>


spearman_corr = total.corr(method='spearman')

cbar_kws = {"shrink": 0.75, "label": "Spearman Correlation coefficient"}
plt.tight_layout(pad=0.7, w_pad=2, h_pad=4)

clustermap = sns.clustermap(spearman_corr, vmin=0, vmax=1, cmap='PuBu', annot=False, fmt=".2f", 
                            cbar_kws=cbar_kws, figsize=(20, 20))

clustermap.fig.suptitle('Spearman Correlation Coefficient between Cell Types', fontsize=30, y=1.01)

plt.savefig('./figures/Spearman_corr_celltypes.pdf')
plt.show()

<Figure size 2400x2400 with 0 Axes>


padata.X = padata.layers['normalized']
padata.X = sparse.csr_matrix(padata.X)


plot_box(
    padata,
   ['NANOS3', 'SOX17', 'CD38', 'TFAP2C', 'DNMT3L'],
    "CellTypes",
    order = ['hiPSC', 'iMeLC', 'hPGCLC', 'hEGCLC'],
    ncols=5,
    row_height = 4,
    save = './figures/boxplots_hPGCLC_up_markers.pdf'
)


plot_box(
    padata,
   ['SOX2', 'SOX3', 'ZIC5', 'SALL1', 'DNMT3B'],
    "CellTypes",
    order = ['hiPSC', 'iMeLC', 'hPGCLC', 'hEGCLC'],
    ncols=5,
    row_height = 4,
    save = './figures/boxplots_hPGCLC_down_markers.pdf'
)


plot_box(
    padata,
   ['PIWIL2'],
    "CellTypes",
    order = ['hiPSC', 'iMeLC', 'hPGCLC', 'hEGCLC'],
    ncols=4,
    row_height = 4,
    save = './figures/boxplots_PIWIL2.pdf'
)


selected_cell_types = ["hiPSC", "iMeLC", "hPGCLC", "hEGCLC"]  
selected = padata[padata.obs['CellTypes'].isin(selected_cell_types)].copy()
selected.obs['CellTypes'] = selected.obs['CellTypes'].astype('category')
selected.obs['CellTypes'].cat.reorder_categories(selected_cell_types, inplace=True)

TF_DM_DE = ['ELF3', 'ELK3',  'FOXP1', 'GATA3', 'LMO2', 'MSX2', 
            'OVOL2', 'PRRX2', 'RARB', 'TCF7', 'ZIC5']

meth = ["DNMT3A", "DNMT3B", "DNMT1", "UHRF1", "DNMT3L", "TET1", "TET2", "TET3", "ZFP57", "ZNF445"]

TF_DM_DE = CleanList(TF_DM_DE, padata)
meth = CleanList(meth, padata)


ax = sc.pl.heatmap(
              selected,
              TF_DM_DE,
              groupby = 'CellTypes', 
              cmap = 'PuBu', 
              swap_axes = True, 
              figsize = (3,4), 
              standard_scale = 'var', show = False
)

plt.savefig('TF_DM_DE_heatmap.pdf', dpi=600, bbox_inches = 'tight')
plt.show()


ax = sc.pl.heatmap(
              selected,
              meth,
              groupby = 'CellTypes', 
              cmap = 'PuBu', 
              swap_axes = True, 
              figsize = (3,4), 
              standard_scale = 'var', show = False
)

plt.savefig('meth_heatmap.pdf', dpi=600, bbox_inches = 'tight')
plt.show()


padata_sub = padata[padata.obs.specimen.isin(['hEGCLC', 'hiPSC']), :].copy()
padata_sub.X = padata_sub.layers['raw'].copy()


sc.pp.normalize_total(padata_sub, target_sum=1e6)
sc.pp.log1p(padata_sub)
sc.tl.pca(padata_sub)

WARNING: adata.X seems to be already log-transformed.


sc.pl.pca(padata_sub, color=['CellTypes'], ncols=2, size=300, components = ['1,2', '2,3'], wspace = 0.4)


sc.pl.pca(padata_sub, color=['specimen'], ncols=2, size=300, components = ['1,2', '2,3', '3,4'], wspace = 0.4)


sc.pl.pca_loadings(padata_sub)


sc.pl.pca_variance_ratio(padata_sub)


pcs_genes = pd.DataFrame(padata_sub.varm['PCs'][:,0])
pcs_genes.index = padata_sub.var_names
pcs_genes.columns = ['PC1']
pcs_genes.sort_values(by = 'PC1', ascending = True).head(20)

	sample_id	run_id	probe_barcode_ids	subject	line	specimen	stage	condition	seqRun	original_name	SC_derivation	micoplasma	scDblFinder_class	CellTypes
2EB_AmLC	2EB	H01_WA/	BC011	CTL08	CTL08A	hPGCLC aggregate	day_6	CTL	H01	2EB	iPSC	NO	singlet	AmLC
3EB_AmLC	3EB	H01_WA/	BC012	CTL08	CTL08A	hPGCLC aggregate	day_6	CTL	H01	3EB	iPSC	NO	singlet	AmLC
5EB_AmLC	5EB	G01/	BC012	CTL08	CTL08A	hPGCLC aggregate	day_6	CTL	G01	5EB	iPSC	NO	singlet	AmLC
5EB_EndLC	5EB	G01/	BC012	CTL08	CTL08A	hPGCLC aggregate	day_6	CTL	G01	5EB	iPSC	NO	singlet	EndLC
2EB_HEP	2EB	H01_WA/	BC011	CTL08	CTL08A	hPGCLC aggregate	day_6	CTL	H01	2EB	iPSC	NO	singlet	HEP
3EB_HEP	3EB	H01_WA/	BC012	CTL08	CTL08A	hPGCLC aggregate	day_6	CTL	H01	3EB	iPSC	NO	singlet	HEP
5EB_HEP	5EB	G01/	BC012	CTL08	CTL08A	hPGCLC aggregate	day_6	CTL	G01	5EB	iPSC	NO	singlet	HEP
2EB_MLC	2EB	H01_WA/	BC011	CTL08	CTL08A	hPGCLC aggregate	day_6	CTL	H01	2EB	iPSC	NO	singlet	MLC
3EB_MLC	3EB	H01_WA/	BC012	CTL08	CTL08A	hPGCLC aggregate	day_6	CTL	H01	3EB	iPSC	NO	singlet	MLC
5EB_MLC	5EB	G01/	BC012	CTL08	CTL08A	hPGCLC aggregate	day_6	CTL	G01	5EB	iPSC	NO	singlet	MLC
EG2_hEGCLC	EG2	G01/	BC014	CTL08	CTL08A	hEGCLC	day_0	CTL	G01	EG2	iPSC	NO	singlet	hEGCLC
EG31_hEGCLC	EG31	G01/	BC016	CTL08	CTL08A	hEGCLC	day_0	CTL	G01	EG31	iPSC	NO	singlet	hEGCLC
EG9_hEGCLC	EG9	G01/	BC015	CTL08	CTL08A	hEGCLC	day_0	CTL	G01	EG9	iPSC	NO	singlet	hEGCLC
2EB_hPGCLC	2EB	H01_WA/	BC011	CTL08	CTL08A	hPGCLC aggregate	day_6	CTL	H01	2EB	iPSC	NO	singlet	hPGCLC
3EB_hPGCLC	3EB	H01_WA/	BC012	CTL08	CTL08A	hPGCLC aggregate	day_6	CTL	H01	3EB	iPSC	NO	singlet	hPGCLC
5EB_hPGCLC	5EB	G01/	BC012	CTL08	CTL08A	hPGCLC aggregate	day_6	CTL	G01	5EB	iPSC	NO	singlet	hPGCLC
IEO_13_A_R_hiPSC	IEO_13_A_R	G01/	BC011	CTL08	CTL08A	hiPSC	day_0	CTL	G01	IEO.13.A.R	iPSC	NO	singlet	hiPSC
ht15_FIX_hiPSC	ht15_FIX	A01/	BC014	CTL08	CTL08A	hiPSC	day_0	CTL	A01	15 FIX	iPSC	NO	singlet	hiPSC
5IM_iMeLC	5IM	G01/	BC013	CTL08	CTL08A	iMeLC	iMeLCs	CTL	G01	5IM	iPSC	NO	singlet	iMeLC

	PC1
SERPINE1	-0.065397
PIWIL2	-0.059823
MAGEA4	-0.059599
PPP1R17	-0.057174
TMSB4Y	-0.056529
ZFP57	-0.055025
RBM46	-0.054333
MT1H	-0.053016
ZBED6CL	-0.052213
CRYAB	-0.051795
DPF1	-0.051680
BTBD17	-0.050979
SPIB	-0.050698
HSPB2	-0.048831
INPP5J	-0.048220
GNG3	-0.048191
CACNB3	-0.048130
AHNAK	-0.047788
UTP14A	-0.047682
NTNG1	-0.047636

Prepare environment¶

Load data¶

Pseudobulk¶

Quality Check¶

Filter genes¶

Correlation between celltypes expression¶

Boxplots of markers¶

Heatmaps of markers¶

Pseudobulk subset - EG vs iPSC¶

	CellTypes	sample_id	0
0	hiPSC	ht15_FIX	4155
1	iMeLC	5IM	3721
2	MLC	5EB	3504
3	hiPSC	IEO_13_A_R	2745
4	hEGCLC	EG9	2669
5	hEGCLC	EG2	2544
6	hEGCLC	EG31	2520
7	MLC	3EB	2023
8	MLC	2EB	1921
9	hPGCLC	2EB	867
10	AmLC	5EB	291
11	AmLC	2EB	270
12	hPGCLC	5EB	216
13	hPGCLC	3EB	201
14	AmLC	3EB	110
15	EndLC	5EB	101
16	HEP	5EB	29
17	HEP	2EB	14
18	HEP	3EB	11
19	EndLC	3EB	9
20	EndLC	2EB	4