import scanpy as sc
import pandas as pd
from matplotlib import pylab
import random
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import os
import itertools
import numpy as np
import random
import anndata as ad
import hashlib
from scipy.sparse import csr_matrix, issparse
from scipy import sparse
from matplotlib.colors import TwoSlopeNorm
import scanpy.external as sce
import sys
import anndata
anndata.__version__
'0.10.8'
sc.settings.set_figure_params(dpi=100, facecolor='white', dpi_save=500)
pylab.rcParams['figure.figsize'] = (9, 9)
homeDir = os.getenv("HOME")
sys.path.insert(1, homeDir+"/utils/")
from PlotPCA_components import *
from PurgeAdata import *
DS="Endpoints"
import anndata2ri
import rpy2.rinterface_lib.callbacks
import logging
anndata2ri.activate()
%load_ext rpy2.ipython
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
/tmp/ipykernel_959918/1930539381.py:4: DeprecationWarning: The global conversion available with activate() is deprecated and will be removed in the next major release. Use a local converter. anndata2ri.activate()
adata = sc.read_h5ad('./adatas/combined_scRNA_seq_data_with_genetic_demux_filtered_consistent.h5ad')
adata
AnnData object with n_obs × n_vars = 480500 × 36601 obs: 'tags', 'cmo', 'condition', 'replicate', 'condition_clean', 'line', 'run' var: 'gene_ids', 'feature_types', 'genome'
pd.crosstab(adata.obs.condition, adata.obs.run)[[393, 394, 505]]
run | 393 | 394 | 505 |
---|---|---|---|
condition | |||
3-PBA | 0 | 0 | 0 |
ANDROGEN_AGONIST | 0 | 0 | 0 |
ANDROGEN_ANTAGONIST | 0 | 0 | 0 |
ARYL_HYD_AGONIST | 0 | 0 | 0 |
ARYL_HYD_ANTAGONIST | 0 | 0 | 0 |
BPA | 0 | 0 | 0 |
BPF | 0 | 0 | 0 |
DMSO | 0 | 0 | 17302 |
DPHP | 0 | 0 | 0 |
ESTROGEN_AGONIST | 0 | 0 | 0 |
ESTROGEN_ANTAGONIST | 0 | 0 | 0 |
GLUCOCORT_AGONIST | 0 | 0 | 0 |
GLUCOCORT_ANTAGONIST | 0 | 0 | 0 |
LIVER-X_AGONIST | 0 | 0 | 0 |
LIVER-X_ANTAGONIST | 0 | 0 | 0 |
MBzP | 0 | 0 | 0 |
MEP | 0 | 0 | 0 |
MIX | 0 | 0 | 0 |
MTR_393_3GEX | 25326 | 0 | 0 |
MTR_394_3GEX | 0 | 32030 | 0 |
RET_AGONIST | 0 | 0 | 0 |
RET_ANTAGONIST | 0 | 0 | 0 |
TCP | 0 | 0 | 0 |
THYROID_AGONIST | 0 | 0 | 0 |
THYROID_ANTAGONIST | 0 | 0 | 0 |
pd.crosstab(adata.obs.line, adata.obs.run)[[393, 394, 505]]
run | 393 | 394 | 505 |
---|---|---|---|
line | |||
CTL01A | 6163 | 7543 | 0 |
CTL02A | 4599 | 5235 | 0 |
CTL04E | 8407 | 10443 | 8292 |
CTL08A | 6157 | 8809 | 9010 |
pd.crosstab(adata.obs.replicate, adata.obs.line)
line | CTL04E | CTL08A |
---|---|---|
replicate | ||
1 | 63080 | 69595 |
2 | 58454 | 66143 |
3 | 64737 | 73561 |
pd.crosstab(adata.obs.replicate, adata.obs.condition).T
replicate | 1 | 2 | 3 |
---|---|---|---|
condition | |||
3-PBA | 5516 | 4976 | 5548 |
ANDROGEN_AGONIST | 5775 | 4387 | 5627 |
ANDROGEN_ANTAGONIST | 8005 | 7002 | 6906 |
ARYL_HYD_AGONIST | 5856 | 5258 | 5615 |
ARYL_HYD_ANTAGONIST | 8193 | 8478 | 8845 |
BPA | 5333 | 4088 | 4798 |
BPF | 5074 | 4936 | 5286 |
DMSO | 5276 | 5977 | 6049 |
DPHP | 7290 | 7285 | 7773 |
ESTROGEN_AGONIST | 7631 | 7018 | 7223 |
ESTROGEN_ANTAGONIST | 6069 | 6224 | 7296 |
GLUCOCORT_AGONIST | 4866 | 4674 | 5506 |
GLUCOCORT_ANTAGONIST | 6598 | 4527 | 5502 |
LIVER-X_AGONIST | 5735 | 6047 | 6110 |
LIVER-X_ANTAGONIST | 5869 | 5941 | 6942 |
MBzP | 6543 | 6236 | 7592 |
MEP | 5279 | 5038 | 5096 |
MIX | 4899 | 6783 | 6816 |
RET_ANTAGONIST | 6809 | 2748 | 6100 |
TCP | 5943 | 5745 | 6464 |
THYROID_AGONIST | 5236 | 4702 | 4381 |
THYROID_ANTAGONIST | 4880 | 6527 | 6823 |
# Fill nas replicates
adata.obs["replicate"] = adata.obs["replicate"].astype(str).replace("nan",1)
adata.obs["sample_id"] = adata.obs.replicate.astype(str) +"_" +adata.obs.line.astype(str)+"_" +adata.obs.run.astype(str)
adata.obs["groupCov"] = adata.obs["sample_id"]
adata.obs["harmonizedRegion"] = "CBO"
adata.obs["tissue"] = "CBO"
adata.obs["PCW"] = 50
adata
AnnData object with n_obs × n_vars = 480500 × 36601 obs: 'tags', 'cmo', 'condition', 'replicate', 'condition_clean', 'line', 'run', 'sample_id', 'groupCov', 'harmonizedRegion', 'tissue', 'PCW' var: 'gene_ids', 'feature_types', 'genome'
The Kernel crashed while executing code in the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details.
#min_cells = 3
min_genes = 100
print(adata.shape)
#adata = adata[:,(adata.X > 0 ).sum(axis = 0) > min_cells]
adata = adata[(adata.X > 0 ).sum(axis = 1) > min_genes]
print(adata.shape)
(480500, 36601)
import anndata2ri
import rpy2.rinterface_lib.callbacks
import logging
anndata2ri.activate()
%load_ext rpy2.ipython
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
The rpy2.ipython extension is already loaded. To reload it, use: %reload_ext rpy2.ipython
/tmp/ipykernel_518427/3313611837.py:4: DeprecationWarning: The global conversion available with activate() is deprecated and will be removed in the next major release. Use a local converter. anndata2ri.activate()
%%R
library(scDblFinder)
set.seed(1)
library(BiocParallel)
[1] "value"
scDblFinder = rpy2.robjects.packages.importr('scDblFinder')
as_data_frame = rpy2.robjects.r['as.data.frame']
Xtab = pd.crosstab(adata.obs["sample_id"],adata.obs["condition"])
SingletsHash = hashlib.sha256(Xtab.sort_index(axis=1).to_csv(index=True).encode('utf-8')).hexdigest()
hashFile = "./adatas/Singlets.{}.xlsx".format(SingletsHash)
if os.path.isfile(hashFile):
print(f"Precomputed singlets found, retrieving {SingletsHash}")
Singlets = pd.read_excel(hashFile)[0].tolist()
else:
Singlets = []
for i in adata.obs["groupCov"].unique().tolist():
adataLocal = adata[adata.obs["groupCov"] == i].copy()
adataLocal.X = adataLocal.X.astype(np.float32)
adataLocal.layers["counts"] = adataLocal.X.copy()
adataLocal = PurgeAdata(adataLocal)
del adataLocal.obs
del adataLocal.var
sce = anndata2ri.py2rpy(adataLocal)
sce = scDblFinder.scDblFinder(sce)
Singlets.extend(sce.obs_names[sce.obs["scDblFinder.class"] == "singlet"].tolist())
print(f"Done with sample {i}")
pd.Series(Singlets).to_excel(hashFile)
Done with sample 3_CTL08A_501 Done with sample 2_CTL08A_501 Done with sample 3_CTL04E_501 Done with sample 1_CTL04E_501 Done with sample 2_CTL04E_501 Done with sample 1_CTL08A_501 Done with sample 1_CTL08A_506 Done with sample 3_CTL04E_506 Done with sample 2_CTL08A_506 Done with sample 2_CTL04E_506 Done with sample 1_CTL04E_506 Done with sample 3_CTL08A_506 Done with sample 3_CTL08A_505 Done with sample 1_CTL08A_505 Done with sample 3_CTL04E_505 Done with sample 2_CTL04E_505 Done with sample 1_CTL04E_505 Done with sample 2_CTL08A_505 Done with sample 3_CTL04E_510 Done with sample 3_CTL08A_510 Done with sample 1_CTL04E_510 Done with sample 2_CTL08A_510 Done with sample 1_CTL08A_510 Done with sample 2_CTL04E_510 Done with sample 2_CTL04E_512 Done with sample 3_CTL08A_512 Done with sample 3_CTL04E_512 Done with sample 1_CTL08A_512 Done with sample 2_CTL08A_512 Done with sample 1_CTL04E_512 Done with sample 2_CTL08A_498 Done with sample 3_CTL08A_498 Done with sample 2_CTL04E_498 Done with sample 3_CTL04E_498 Done with sample 1_CTL04E_498 Done with sample 1_CTL08A_498 Done with sample 1_CTL08A_499 Done with sample 2_CTL08A_499 Done with sample 3_CTL04E_499 Done with sample 3_CTL08A_499 Done with sample 2_CTL04E_499 Done with sample 1_CTL04E_499 Done with sample 3_CTL08A_497 Done with sample 3_CTL04E_497 Done with sample 1_CTL08A_497 Done with sample 1_CTL04E_497 Done with sample 2_CTL08A_497 Done with sample 2_CTL04E_497 Done with sample 3_CTL04E_509 Done with sample 2_CTL04E_509 Done with sample 1_CTL08A_509 Done with sample 1_CTL04E_509 Done with sample 3_CTL08A_509 Done with sample 2_CTL08A_503 Done with sample 1_CTL08A_503 Done with sample 3_CTL04E_503 Done with sample 3_CTL08A_503 Done with sample 1_CTL04E_503 Done with sample 2_CTL04E_503 Done with sample 2_CTL08A_495 Done with sample 1_CTL08A_495 Done with sample 3_CTL04E_495 Done with sample 3_CTL08A_495 Done with sample 2_CTL04E_495 Done with sample 1_CTL04E_495 Done with sample 3_CTL08A_513 Done with sample 3_CTL04E_513 Done with sample 1_CTL04E_513 Done with sample 2_CTL08A_513 Done with sample 2_CTL04E_513 Done with sample 1_CTL08A_513 Done with sample 3_CTL08A_511 Done with sample 3_CTL04E_511 Done with sample 1_CTL08A_511 Done with sample 2_CTL04E_511 Done with sample 1_CTL04E_511 Done with sample 2_CTL08A_511 Done with sample 3_CTL08A_491 Done with sample 1_CTL04E_491 Done with sample 2_CTL08A_491 Done with sample 1_CTL08A_491 Done with sample 2_CTL04E_491 Done with sample 3_CTL04E_491 Done with sample 1_CTL04E_493 Done with sample 1_CTL08A_493 Done with sample 3_CTL04E_493 Done with sample 2_CTL08A_493 Done with sample 2_CTL04E_493 Done with sample 3_CTL08A_493 Done with sample 1_CTL08A_502 Done with sample 2_CTL04E_502 Done with sample 3_CTL08A_502 Done with sample 3_CTL04E_502 Done with sample 2_CTL08A_502 Done with sample 1_CTL04E_502 Done with sample 3_CTL04E_500 Done with sample 2_CTL04E_500 Done with sample 3_CTL08A_500 Done with sample 1_CTL08A_500 Done with sample 2_CTL08A_500 Done with sample 1_CTL04E_500 Done with sample 3_CTL04E_507 Done with sample 1_CTL08A_507 Done with sample 3_CTL08A_507 Done with sample 2_CTL08A_507 Done with sample 1_CTL04E_507 Done with sample 2_CTL04E_507 Done with sample 1_CTL08A_492 Done with sample 1_CTL04E_492 Done with sample 3_CTL08A_492 Done with sample 2_CTL04E_492 Done with sample 3_CTL04E_492 Done with sample 2_CTL08A_492 Done with sample 2_CTL04E_494 Done with sample 3_CTL08A_494 Done with sample 2_CTL08A_494 Done with sample 3_CTL04E_494 Done with sample 1_CTL04E_494 Done with sample 1_CTL08A_494 Done with sample 1_CTL08A_504 Done with sample 3_CTL04E_504 Done with sample 2_CTL04E_504 Done with sample 3_CTL08A_504 Done with sample 1_CTL04E_504 Done with sample 2_CTL08A_504 Done with sample 3_CTL08A_496 Done with sample 1_CTL08A_496 Done with sample 2_CTL04E_496 Done with sample 1_CTL04E_496 Done with sample 3_CTL04E_496 Done with sample 2_CTL08A_496 Done with sample 1_CTL08A_508 Done with sample 1_CTL04E_508 Done with sample 1_CTL08A_393 Done with sample 1_CTL01A_393 Done with sample 1_CTL04E_393 Done with sample 1_CTL02A_393 Done with sample 1_CTL01A_394 Done with sample 1_CTL04E_394 Done with sample 1_CTL08A_394 Done with sample 1_CTL02A_394
adata = adata[Singlets]
adata
View of AnnData object with n_obs × n_vars = 450811 × 36601 obs: 'tags', 'cmo', 'condition', 'replicate', 'condition_clean', 'line', 'run', 'sample_id', 'groupCov', 'harmonizedRegion', 'tissue', 'PCW' var: 'gene_ids', 'feature_types', 'genome'
# mitochondrial genes, "MT-" for human, "Mt-" for mouse
adata.var["mt"] = adata.var_names.str.startswith("MT-")
# ribosomal genes
adata.var["ribo"] = adata.var_names.str.startswith(("RPS", "RPL"))
# hemoglobin genes
adata.var["hb"] = adata.var_names.str.contains("^HB[^(P)]")
sc.pp.calculate_qc_metrics(
adata, qc_vars=["mt", "ribo", "hb"], inplace=True, log1p=True
)
/tmp/ipykernel_518427/3813780412.py:2: ImplicitModificationWarning: Trying to modify attribute `.var` of view, initializing view as actual. adata.var["mt"] = adata.var_names.str.startswith("MT-")
FilterGroup = "groupCov"
import math
goodBClist = {}
########################################################### RIBO genes
fig, axes = plt.subplots(math.ceil(len(adata.obs[FilterGroup].unique())/4),4, figsize=(25, 1*len(adata.obs[FilterGroup].unique())), sharey=False)
plt.subplots_adjust(hspace=1)
excess_axes_count = len(axes.flatten()) - len(adata.obs[FilterGroup].unique())
var = "pct_counts_ribo"
n = 0
goodBClist[var] = []
for i in adata.obs[FilterGroup].unique():
AdataTMP = adata[adata.obs[FilterGroup] == i].copy()
mean = AdataTMP.obs[var].mean()
mad = abs((AdataTMP.obs[var] - mean)).sum() / AdataTMP.shape[0]
max = mean+3*mad
axes.flatten()[n] = sc.pl.violin(AdataTMP, keys=var, show=False, ax = axes.flatten()[n])
axes.flatten()[n].axhline(y=max, xmin=0, xmax=1, color="red")
axes.flatten()[n].title.set_text("Sample {} {}".format(i, var))
goodBClist[var].extend(AdataTMP.obs_names[AdataTMP.obs[var] <= max].tolist())
n = n+1
goodBClist[var] = set(goodBClist[var])
for i in range(excess_axes_count):
fig.delaxes(axes.flatten()[-(i+1)])
########################################################### MITO genes
fig, axes = plt.subplots(math.ceil(len(adata.obs[FilterGroup].unique())/4),4, figsize=(25, 1*len(adata.obs[FilterGroup].unique())), sharey=False)
plt.subplots_adjust(hspace=1)
var = "pct_counts_mt"
goodBClist[var] = []
n = 0
#goodBClist = []
for i in adata.obs[FilterGroup].unique():
AdataTMP = adata[adata.obs[FilterGroup] == i].copy()
mean = AdataTMP.obs[var].mean()
mad = abs((AdataTMP.obs[var] - mean)).sum() / AdataTMP.shape[0]
max = mean+3*mad
axes.flatten()[n] = sc.pl.violin(AdataTMP, keys=var, show=False, ax = axes.flatten()[n])
axes.flatten()[n].axhline(y=max, xmin=0, xmax=1, color="red")
axes.flatten()[n].title.set_text("Sample {} {}".format(i, var))
goodBClist[var].extend(AdataTMP.obs_names[AdataTMP.obs[var] <= max].tolist())
n = n+1
goodBClist[var] = set(goodBClist[var])
for i in range(excess_axes_count):
fig.delaxes(axes.flatten()[-(i+1)])
########################################################### HBH genes
fig, axes = plt.subplots(math.ceil(len(adata.obs[FilterGroup].unique())/4),4, figsize=(25, 1*len(adata.obs[FilterGroup].unique())), sharey=False)
plt.subplots_adjust(hspace=1)
adata.obs["log_pct_counts_hb"] = np.log1p(adata.obs["pct_counts_hb"])
var = "log_pct_counts_hb"
n = 0
goodBClist[var] = []
#goodBClist = []
for i in adata.obs[FilterGroup].unique():
AdataTMP = adata[adata.obs[FilterGroup] == i].copy()
mean = AdataTMP.obs[var].mean()
mad = abs((AdataTMP.obs[var] - mean)).sum() / AdataTMP.shape[0]
max = mean+3*mad
axes.flatten()[n] = sc.pl.violin(AdataTMP, keys=var, show=False, ax = axes.flatten()[n])
axes.flatten()[n].axhline(y=max, xmin=0, xmax=1, color="red")
axes.flatten()[n].title.set_text("Sample {} {}".format(i, var))
goodBClist[var].extend(AdataTMP.obs_names[AdataTMP.obs[var] <= max].tolist())
n = n+1
goodBClist[var] = set(goodBClist[var])
for i in range(excess_axes_count):
fig.delaxes(axes.flatten()[-(i+1)])