In [1]:
import scanpy as sc
import pandas as pd
from matplotlib import pylab
import random
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import os
import itertools
import plotly.express as px
import yaml


import numpy as np
import random
import anndata as ad
from scipy.sparse import  csr_matrix, issparse
from scipy import sparse

from matplotlib.colors import TwoSlopeNorm

import scanpy.external as sce
import sys

import scvelo as scv
In [2]:
import anndata
In [3]:
sc.settings.set_figure_params(dpi=80, facecolor='white', dpi_save=500)
pylab.rcParams['figure.figsize'] = (6, 6)
homeDir = os.getenv("HOME")
sys.path.insert(1, homeDir+"/utils/")

from PlotPCA_components import *
from AtlasClasses import *
DS="hormones_substudy"
ReferenceTissue="cortex"

with open(homeDir+"/utils/ReferenceDict.yaml", 'r') as file:
    ReferencePaths = yaml.safe_load(file)
    for k in list(ReferencePaths.keys()):
        ReferencePaths[k]["adataPath"] = "/group/testa/Users/davide.castaldi/Polaroids_spinoff"+ReferencePaths[k]["adataPath"]
        #ReferencePaths[k]["signaturePath"] = homeDir+ReferencePaths[k]["signaturePath"]
        ReferencePaths[k]["signaturePath"] = "/group/testa/Users/davide.castaldi/Polaroids_spinoff"+ReferencePaths[k]["signaturePath"]
        ReferencePaths[k]["signaturePurityPath"] = "/group/testa/Users/davide.castaldi/Polaroids_spinoff"+ReferencePaths[k]["signaturePurityPath"]


ReferencePaths
Out[3]:
{'cortex': {'dsname': 'Poliudakis2019_cortex',
  'adataPath': '/group/testa/Users/davide.castaldi/Polaroids_spinoff/2_GenerateReferences/cortex.Reference.h5ad',
  'signaturePath': '/group/testa/Users/davide.castaldi/Polaroids_spinoff/2_GenerateReferences/cortex.SignatureGenes.tsv',
  'signaturePurityPath': '/group/testa/Users/davide.castaldi/Polaroids_spinoff/2_GenerateReferences/cortex.SignaturesPurity.tsv',
  'LabelTtransferAggregation': {'oRG': 'Glia',
   'IP': 'IP',
   'ExcitatoryDeepLayer': 'ExcitatoryNeuron',
   'Cycling': 'Cycling',
   'ExcitatoryMigrating': 'ExcitatoryNeuron',
   'Excitatory': 'ExcitatoryNeuron',
   'Endothelium': 'Vascular',
   'vRG': 'Glia',
   'Inhibitory': 'ForebrainInhibitory',
   'OPC': 'OPC',
   'Mic': 'Mic'},
  'RelevantContrasts': {'Inh_vs_Exc': ['cortex_Inhibitory',
    'cortex_Excitatory'],
   'Exc_vs_cycling': ['cortex_Excitatory', 'cortex_CyclingProgenitors']}},
 'cerebellum': {'dsname': 'Aldinger2021_cerebellum',
  'adataPath': '/group/testa/Users/davide.castaldi/Polaroids_spinoff/2_GenerateReferences/cerebellum.Reference.h5ad',
  'signaturePath': '/group/testa/Users/davide.castaldi/Polaroids_spinoff/2_GenerateReferences/cerebellum.SignatureGenes.tsv',
  'signaturePurityPath': '/group/testa/Users/davide.castaldi/Polaroids_spinoff/2_GenerateReferences/cerebellum.SignaturesPurity.tsv',
  'LabelTtransferAggregation': {'Astrocytes': 'Glia',
   'Purkinje': 'CerebellarInhibitory',
   'InhibitoryProgenitors': 'CerebellarInhibitory',
   'Inhibitory': 'CerebellarInhibitory',
   'Glia': 'Glia',
   'GranuleNeurons': 'ExcitatoryNeuron',
   'Endothelium': 'Vascular',
   'ExcitatoryInterneuron': 'ExcitatoryNeuron',
   'OPC': 'OPC',
   'Mic': 'Mic',
   'CyclingGranulePrecursos': 'RLCycling',
   'ML_gabaergic': 'CerebellarInhibitory',
   'MLgabaergic': 'CerebellarInhibitory',
   'ML_gabaergic ': 'CerebellarInhibitory'},
  'RelevantContrasts': {'Inh_vs_Exc': ['cerebellum_Inhibitory',
    'cerebellum_Excitatory'],
   'ExcInt_vs_Glia': ['cerebellum_Inhibitory', 'cerebellum_GliaProgenitors']}},
 'subpallium': {'dsname': 'Yu2021_subpallium',
  'adataPath': '/group/testa/Users/davide.castaldi/Polaroids_spinoff/2_GenerateReferences/subpallium.Reference.h5ad',
  'signaturePath': '/group/testa/Users/davide.castaldi/Polaroids_spinoff/2_GenerateReferences/subpallium.SignatureGenes.tsv',
  'signaturePurityPath': '/group/testa/Users/davide.castaldi/Polaroids_spinoff/2_GenerateReferences/subpallium.SignaturesPurity.tsv',
  'LabelTtransferAggregation': {'Cycling': 'Cycling',
   'Inhibitory': 'Inhibitory',
   'IP': 'IP',
   'Excitatory': 'ExcitatoryNeuron',
   'Endothelium': 'Vascular',
   'Mic': 'Mic',
   'OPC': 'OPC'},
  'RelevantContrasts': {'Cycling_vs_Inh': ['subpallium_CyclingProgenitors',
    'subpallium_Inhibitory']}},
 'thalamus': {'dsname': 'KimSecondTri2023_Thalamus',
  'adataPath': '/group/testa/Users/davide.castaldi/Polaroids_spinoff/2_GenerateReferences/thalamus.Reference.h5ad',
  'signaturePath': '/group/testa/Users/davide.castaldi/Polaroids_spinoff/2_GenerateReferences/thalamus.SignatureGenes.tsv',
  'signaturePurityPath': '/group/testa/Users/davide.castaldi/Polaroids_spinoff/2_GenerateReferences/thalamus.SignaturesPurity.tsv',
  'LabelTtransferAggregation': {'Cycling': 'Cycling',
   'Inhibitory': 'Inhibitory',
   'IP': 'IP',
   'Excitatory': 'ExcitatoryNeuron',
   'Endothelium': 'Vascular',
   'Mic': 'Mic',
   'OPC': 'OPC'},
  'RelevantContrasts': {'Exc_vs_Astro': ['thalamus_Excitatory',
    'thalamus_GliaProgenitors']}}}

Import¶

In [4]:
adata = sc.read_h5ad("./adatas/4.1.{}.labeltransfer.h5ad".format(DS))
adata.layers["logNorm"] = adata.X.copy()

Scaled cell cycle score¶

In [5]:
sc.pp.scale(adata)
In [6]:
scv.tl.score_genes_cell_cycle(adata)
calculating cell cycle phase
-->     'S_score' and 'G2M_score', scores of cell cycle phases (adata.obs)
In [7]:
Compositions = adata.obs.groupby(["sample_id","phase"], as_index=False).size()
Compositions["SampleNormSize"] = Compositions["size"].divide(Compositions.groupby("sample_id")["size"].sum().loc[Compositions.sample_id].to_numpy())
Compositions["Condition"] = pd.crosstab(adata.obs["sample_id"], adata.obs["condition"]).idxmax(axis=1)[Compositions["sample_id"]].tolist()
Compositions["line"] = pd.crosstab(adata.obs["sample_id"], adata.obs["line"]).idxmax(axis=1)[Compositions["sample_id"]].tolist()
Compositions
Out[7]:
sample_id phase size SampleNormSize Condition line
0 1_CTL04E_491 G1 1181 0.428364 ESTROGEN_AGONIST CTL04E
1 1_CTL04E_491 G2M 695 0.252086 ESTROGEN_AGONIST CTL04E
2 1_CTL04E_491 S 881 0.319550 ESTROGEN_AGONIST CTL04E
3 1_CTL04E_492 G1 575 0.268441 ESTROGEN_ANTAGONIST CTL04E
4 1_CTL04E_492 G2M 935 0.436508 ESTROGEN_ANTAGONIST CTL04E
... ... ... ... ... ... ...
250 3_CTL08A_505 G2M 349 0.174152 DMSO CTL08A
251 3_CTL08A_505 S 580 0.289421 DMSO CTL08A
252 3_CTL08A_509 G1 925 0.396315 RET_ANTAGONIST CTL08A
253 3_CTL08A_509 G2M 749 0.320908 RET_ANTAGONIST CTL08A
254 3_CTL08A_509 S 660 0.282776 RET_ANTAGONIST CTL08A

255 rows × 6 columns

In [8]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import matplotlib.patches as mpatches

# Sort the DataFrame by 'Condition' and 'line'
Compositions_sorted = Compositions.sort_values(["Condition", "line"])

# Ensure the sample order is preserved as a categorical type
Compositions_sorted["sample_id"] = pd.Categorical(
    Compositions_sorted["sample_id"],
    Compositions_sorted["sample_id"].unique().tolist()
)

# Plot with sorted x-axis
fig, ax = plt.subplots(1, 1, figsize=(30, 7), dpi=100)
sns.histplot(
    data=Compositions_sorted, 
    x="sample_id", 
    hue="phase", 
    weights="SampleNormSize",
    multiple="stack", 
    shrink=0.8,
    ax=ax
)

ax.set_ylabel("Cells Fraction")
ax.set(ylim=(0, 1))
sns.despine(ax=ax)

# Try to retrieve the auto-generated legend handles/labels for phase
macro_handles, macro_labels = ax.get_legend_handles_labels()
# Remove the auto legend to avoid conflicts.
if ax.get_legend() is not None:
    ax.get_legend().remove()

# If macro_handles is empty, build the legend manually.
if len(macro_handles) == 0:
    macro_unique = Compositions_sorted["phase"].unique()
    # Use a palette (here we use "deep" which is commonly used by seaborn)
    macro_palette = sns.color_palette("deep", n_colors=len(macro_unique))
    macro_dict = {macro: color for macro, color in zip(macro_unique, macro_palette)}
    macro_handles = [mpatches.Patch(color=color, label=macro) for macro, color in macro_dict.items()]
    macro_labels = list(macro_dict.keys())

# Group info by sample_id for Condition and line
group_info = Compositions_sorted.groupby("sample_id").agg({"Condition": "first", "line": "first"})

# Get x positions for categorical data
xticks = ax.get_xticks()
samples_order = list(Compositions_sorted["sample_id"].cat.categories)
sample_to_x = {sample: xtick for sample, xtick in zip(samples_order, xticks)}

# Assign colors for Conditions and lines
unique_conditions = group_info["Condition"].unique()
unique_lines = group_info["line"].unique()
condition_colors = {
    cond: color for cond, color in zip(unique_conditions, sns.color_palette("tab10", n_colors=len(unique_conditions)))
}
line_colors = {
    line: color for line, color in zip(unique_lines, sns.color_palette("pastel", n_colors=len(unique_lines)))
}

# Use a transform that uses data coordinates for x and axes coordinates for y
trans = ax.get_xaxis_transform()

# Add colored bars above the main plot for each sample
for sample, row in group_info.iterrows():
    x_center = sample_to_x[sample]
    rect_width = 1  # each category is assumed to be width 1
    
    ax.add_patch(mpatches.Rectangle((x_center - 0.5, 1.02), rect_width, 0.02,
                                    color=condition_colors[row["Condition"]],
                                    transform=trans,
                                    clip_on=False))
    ax.add_patch(mpatches.Rectangle((x_center - 0.5, 1.05), rect_width, 0.02,
                                    color=line_colors[row["line"]],
                                    transform=trans,
                                    clip_on=False))

# Create custom legends for Condition and line groupings
condition_legend = [mpatches.Patch(color=color, label=cond) for cond, color in condition_colors.items()]
line_legend = [mpatches.Patch(color=color, label=line) for line, color in line_colors.items()]

# Adjust the right margin to make room for legends
fig.subplots_adjust(right=1.5)

# Place legends outside the main axes
legend_macro = ax.legend(handles=macro_handles, labels=macro_labels, title="Phase", loc="upper left", bbox_to_anchor=(0, 1))
ax.add_artist(legend_macro)
legend_condition = ax.legend(handles=condition_legend, title="Condition", loc="upper left", bbox_to_anchor=(.955, .9))
ax.add_artist(legend_condition)
legend_line = ax.legend(handles=line_legend, title="Line", loc="upper left", bbox_to_anchor=(.955, 1.1))
ax.add_artist(legend_line)
ax.grid(False)


plt.xticks(rotation=90)
plt.savefig(f"./figures/{DS}.Prefilt.scaled.CyclingRate.svg", bbox_inches="tight")
plt.show()

Go back to log counts¶

In [9]:
adata.X = adata.layers["logNorm"].copy()
del adata.layers["logNorm"]
In [10]:
# Store consensus call info
adata.obs["Consensus_nIntersection"] = adata.obs[["ingestdLabels","scANVILabels","harmonyLabels"]].apply(lambda rows: rows.value_counts().max(), axis = 1)
adata.obs["Consensus"] = np.where(adata.obs["Consensus_nIntersection"] >= 2, True, False)
adata.obs["Consensus_call"] = adata.obs[["ingestdLabels","scANVILabels","harmonyLabels"]].apply(lambda rows: rows.value_counts().idxmax(), axis = 1)


sc.pp.highly_variable_genes(adata, batch_key="groupCov", flavor="seurat", n_top_genes=4000)
CommonHVGs = adata.var_names[adata.var["highly_variable_nbatches"] == len(adata.obs["groupCov"].unique())].tolist()
print(len(CommonHVGs))
adata.var["highly_variable"] = adata.var_names.isin(CommonHVGs)
adata.var["highly_variable"].sum()
sc.tl.pca(adata)
plotPCA_components(adata, color="line")
203
PlottingParams: figsize:(10, 10) dpi:100 dotsize:10 legend_loc:on data fontsize:8
In [11]:
sc.pp.neighbors(adata, n_pcs=10, n_neighbors=30)
sc.tl.umap(adata)
sc.pl.umap(adata, color=["line","TOP2A","STMN2","VIM","S100B"], size=10, vmin='p1', vmax='p99')

Check additional filters mapping¶

In [12]:
AddedFilters = pd.read_csv("./adatas/{}_AdditionalFilters.tsv".format(DS), sep="\t", index_col=0)

if pd.concat([adata.obs, AddedFilters], axis = 1).loc[adata.obs_names].isnull().any(axis=1).sum() == (adata.obs.shape[0] - AddedFilters.shape[0]):
    print("(Mis)match between Additional filters tsv and anndata as expected, proceeding to merge")
    adata.obs = pd.concat([adata.obs, AddedFilters], axis = 1).loc[adata.obs_names]
    for col in AddedFilters.columns:
        adata.obs[col] = adata.obs[col].fillna(False)
else:
    print("Unexpected dimension mismatch between Additional filters tsv and anndata")
(Mis)match between Additional filters tsv and anndata as expected, proceeding to merge
In [13]:
plotSankey(adata.obs, covs=["Consensus_call"]+AddedFilters.columns.tolist())
In [14]:
plotObs = adata.obs.copy()
plotObs["Consensus_nIntersection"] = plotObs["Consensus_nIntersection"].astype(str)
plotSankey(plotObs, covs=["Consensus_nIntersection"]+AddedFilters.columns.tolist())
In [15]:
# Filter out cells with high criticasl signature scores
adata = adata[adata.obs[[i for i in adata.obs.columns if "PassedQCfilt_ScoreSignature_" in i]].sum(axis = 1) >= 3].copy()
In [17]:
Compositions = adata.obs.groupby(["sample_id","phase"], as_index=False).size()
Compositions["SampleNormSize"] = Compositions["size"].divide(Compositions.groupby("sample_id")["size"].sum().loc[Compositions.sample_id].to_numpy())
Compositions["Condition"] = pd.crosstab(adata.obs["sample_id"], adata.obs["condition"]).idxmax(axis=1)[Compositions["sample_id"]].tolist()
Compositions["line"] = pd.crosstab(adata.obs["sample_id"], adata.obs["line"]).idxmax(axis=1)[Compositions["sample_id"]].tolist()
Compositions
Out[17]:
sample_id phase size SampleNormSize Condition line
0 1_CTL04E_491 G1 1128 0.433346 ESTROGEN_AGONIST CTL04E
1 1_CTL04E_491 G2M 669 0.257011 ESTROGEN_AGONIST CTL04E
2 1_CTL04E_491 S 806 0.309643 ESTROGEN_AGONIST CTL04E
3 1_CTL04E_492 G1 555 0.272192 ESTROGEN_ANTAGONIST CTL04E
4 1_CTL04E_492 G2M 890 0.436488 ESTROGEN_ANTAGONIST CTL04E
... ... ... ... ... ... ...
250 3_CTL08A_505 G2M 342 0.173604 DMSO CTL08A
251 3_CTL08A_505 S 569 0.288832 DMSO CTL08A
252 3_CTL08A_509 G1 908 0.396853 RET_ANTAGONIST CTL08A
253 3_CTL08A_509 G2M 735 0.321241 RET_ANTAGONIST CTL08A
254 3_CTL08A_509 S 645 0.281906 RET_ANTAGONIST CTL08A

255 rows × 6 columns

In [18]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import matplotlib.patches as mpatches

# Sort the DataFrame by 'Condition' and 'line'
Compositions_sorted = Compositions.sort_values(["Condition", "line"])

# Ensure the sample order is preserved as a categorical type
Compositions_sorted["sample_id"] = pd.Categorical(
    Compositions_sorted["sample_id"],
    Compositions_sorted["sample_id"].unique().tolist()
)

# Plot with sorted x-axis
fig, ax = plt.subplots(1, 1, figsize=(30, 7), dpi=100)
sns.histplot(
    data=Compositions_sorted, 
    x="sample_id", 
    hue="phase", 
    weights="SampleNormSize",
    multiple="stack", 
    shrink=0.8,
    ax=ax
)

ax.set_ylabel("Cells Fraction")
ax.set(ylim=(0, 1))
sns.despine(ax=ax)

# Try to retrieve the auto-generated legend handles/labels for phase
macro_handles, macro_labels = ax.get_legend_handles_labels()
# Remove the auto legend to avoid conflicts.
if ax.get_legend() is not None:
    ax.get_legend().remove()

# If macro_handles is empty, build the legend manually.
if len(macro_handles) == 0:
    macro_unique = Compositions_sorted["phase"].unique()
    # Use a palette (here we use "deep" which is commonly used by seaborn)
    macro_palette = sns.color_palette("deep", n_colors=len(macro_unique))
    macro_dict = {macro: color for macro, color in zip(macro_unique, macro_palette)}
    macro_handles = [mpatches.Patch(color=color, label=macro) for macro, color in macro_dict.items()]
    macro_labels = list(macro_dict.keys())

# Group info by sample_id for Condition and line
group_info = Compositions_sorted.groupby("sample_id").agg({"Condition": "first", "line": "first"})

# Get x positions for categorical data
xticks = ax.get_xticks()
samples_order = list(Compositions_sorted["sample_id"].cat.categories)
sample_to_x = {sample: xtick for sample, xtick in zip(samples_order, xticks)}

# Assign colors for Conditions and lines
unique_conditions = group_info["Condition"].unique()
unique_lines = group_info["line"].unique()
condition_colors = {
    cond: color for cond, color in zip(unique_conditions, sns.color_palette("tab10", n_colors=len(unique_conditions)))
}
line_colors = {
    line: color for line, color in zip(unique_lines, sns.color_palette("pastel", n_colors=len(unique_lines)))
}

# Use a transform that uses data coordinates for x and axes coordinates for y
trans = ax.get_xaxis_transform()

# Add colored bars above the main plot for each sample
for sample, row in group_info.iterrows():
    x_center = sample_to_x[sample]
    rect_width = 1  # each category is assumed to be width 1
    
    ax.add_patch(mpatches.Rectangle((x_center - 0.5, 1.02), rect_width, 0.02,
                                    color=condition_colors[row["Condition"]],
                                    transform=trans,
                                    clip_on=False))
    ax.add_patch(mpatches.Rectangle((x_center - 0.5, 1.05), rect_width, 0.02,
                                    color=line_colors[row["line"]],
                                    transform=trans,
                                    clip_on=False))

# Create custom legends for Condition and line groupings
condition_legend = [mpatches.Patch(color=color, label=cond) for cond, color in condition_colors.items()]
line_legend = [mpatches.Patch(color=color, label=line) for line, color in line_colors.items()]

# Adjust the right margin to make room for legends
fig.subplots_adjust(right=1.5)

# Place legends outside the main axes
legend_macro = ax.legend(handles=macro_handles, labels=macro_labels, title="Phase", loc="upper left", bbox_to_anchor=(0, 1))
ax.add_artist(legend_macro)
legend_condition = ax.legend(handles=condition_legend, title="Condition", loc="upper left", bbox_to_anchor=(.955, .9))
ax.add_artist(legend_condition)
legend_line = ax.legend(handles=line_legend, title="Line", loc="upper left", bbox_to_anchor=(.955, 1.1))
ax.add_artist(legend_line)
ax.grid(False)


plt.xticks(rotation=90)
plt.savefig(f"./figures/{DS}.Prefilt.scaled.CyclingRate.svg", bbox_inches="tight")
plt.show()
In [16]:
adata.write("./adatas/5.1.{}.PostQC.h5ad".format(DS))