import scanpy as sc
import pandas as pd
from matplotlib import pylab
import random
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import os
import itertools
import numpy as np
import random
import anndata as ad
import hashlib
from scipy.sparse import  csr_matrix, issparse
from scipy import sparse

from matplotlib.colors import TwoSlopeNorm

import scanpy.external as sce
import sys


import anndata


anndata.__version__

'0.10.8'


sc.settings.set_figure_params(dpi=100, facecolor='white', dpi_save=500)
pylab.rcParams['figure.figsize'] = (9, 9)
homeDir = os.getenv("HOME")
sys.path.insert(1, homeDir+"/utils/")
from PlotPCA_components import *
from PurgeAdata import *


DS="Endpoints"


import anndata2ri
import rpy2.rinterface_lib.callbacks
import logging
anndata2ri.activate()
%load_ext rpy2.ipython
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

/tmp/ipykernel_959918/1930539381.py:4: DeprecationWarning: The global conversion available with activate() is deprecated and will be removed in the next major release. Use a local converter.
  anndata2ri.activate()


adata = sc.read_h5ad('./adatas/combined_scRNA_seq_data_with_genetic_demux_filtered_consistent.h5ad')
adata

AnnData object with n_obs × n_vars = 480500 × 36601
    obs: 'tags', 'cmo', 'condition', 'replicate', 'condition_clean', 'line', 'run'
    var: 'gene_ids', 'feature_types', 'genome'


pd.crosstab(adata.obs.condition, adata.obs.run)[[393, 394, 505]]


pd.crosstab(adata.obs.line, adata.obs.run)[[393, 394, 505]]


pd.crosstab(adata.obs.replicate, adata.obs.line)


pd.crosstab(adata.obs.replicate, adata.obs.condition).T


# Fill nas replicates

adata.obs["replicate"]  = adata.obs["replicate"].astype(str).replace("nan",1)

adata.obs["sample_id"] = adata.obs.replicate.astype(str) +"_" +adata.obs.line.astype(str)+"_" +adata.obs.run.astype(str)
adata.obs["groupCov"] = adata.obs["sample_id"]
adata.obs["harmonizedRegion"] =  "CBO"
adata.obs["tissue"] =  "CBO"
adata.obs["PCW"] = 50


adata

AnnData object with n_obs × n_vars = 480500 × 36601
    obs: 'tags', 'cmo', 'condition', 'replicate', 'condition_clean', 'line', 'run', 'sample_id', 'groupCov', 'harmonizedRegion', 'tissue', 'PCW'
    var: 'gene_ids', 'feature_types', 'genome'

The Kernel crashed while executing code in the current cell or a previous cell. 

Please review the code in the cell(s) to identify a possible cause of the failure. 

Click <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. 

View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details.


#min_cells = 3
min_genes = 100

print(adata.shape)
#adata = adata[:,(adata.X > 0 ).sum(axis = 0) > min_cells]
adata = adata[(adata.X > 0 ).sum(axis = 1) > min_genes]
print(adata.shape)

(480500, 36601)


import anndata2ri
import rpy2.rinterface_lib.callbacks
import logging
anndata2ri.activate()
%load_ext rpy2.ipython
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython

/tmp/ipykernel_518427/3313611837.py:4: DeprecationWarning: The global conversion available with activate() is deprecated and will be removed in the next major release. Use a local converter.
  anndata2ri.activate()


%%R
library(scDblFinder)
set.seed(1)
library(BiocParallel)

[1] "value"


scDblFinder = rpy2.robjects.packages.importr('scDblFinder')
as_data_frame = rpy2.robjects.r['as.data.frame']



Xtab = pd.crosstab(adata.obs["sample_id"],adata.obs["condition"])
SingletsHash = hashlib.sha256(Xtab.sort_index(axis=1).to_csv(index=True).encode('utf-8')).hexdigest()
hashFile = "./adatas/Singlets.{}.xlsx".format(SingletsHash)
if os.path.isfile(hashFile):
    print(f"Precomputed singlets found, retrieving {SingletsHash}")
    Singlets = pd.read_excel(hashFile)[0].tolist()
else:
    Singlets = []
    for i in adata.obs["groupCov"].unique().tolist():
        adataLocal = adata[adata.obs["groupCov"] == i].copy()
        adataLocal.X = adataLocal.X.astype(np.float32)
        adataLocal.layers["counts"] = adataLocal.X.copy()
        adataLocal = PurgeAdata(adataLocal)
        del adataLocal.obs
        del adataLocal.var
        sce = anndata2ri.py2rpy(adataLocal)
        sce = scDblFinder.scDblFinder(sce)
        Singlets.extend(sce.obs_names[sce.obs["scDblFinder.class"] == "singlet"].tolist())
        print(f"Done with sample {i}")
    pd.Series(Singlets).to_excel(hashFile)

Done with sample 3_CTL08A_501
Done with sample 2_CTL08A_501
Done with sample 3_CTL04E_501
Done with sample 1_CTL04E_501
Done with sample 2_CTL04E_501
Done with sample 1_CTL08A_501
Done with sample 1_CTL08A_506
Done with sample 3_CTL04E_506
Done with sample 2_CTL08A_506
Done with sample 2_CTL04E_506
Done with sample 1_CTL04E_506
Done with sample 3_CTL08A_506
Done with sample 3_CTL08A_505
Done with sample 1_CTL08A_505
Done with sample 3_CTL04E_505
Done with sample 2_CTL04E_505
Done with sample 1_CTL04E_505
Done with sample 2_CTL08A_505
Done with sample 3_CTL04E_510
Done with sample 3_CTL08A_510
Done with sample 1_CTL04E_510
Done with sample 2_CTL08A_510
Done with sample 1_CTL08A_510
Done with sample 2_CTL04E_510
Done with sample 2_CTL04E_512
Done with sample 3_CTL08A_512
Done with sample 3_CTL04E_512
Done with sample 1_CTL08A_512
Done with sample 2_CTL08A_512
Done with sample 1_CTL04E_512
Done with sample 2_CTL08A_498
Done with sample 3_CTL08A_498
Done with sample 2_CTL04E_498
Done with sample 3_CTL04E_498
Done with sample 1_CTL04E_498
Done with sample 1_CTL08A_498
Done with sample 1_CTL08A_499
Done with sample 2_CTL08A_499
Done with sample 3_CTL04E_499
Done with sample 3_CTL08A_499
Done with sample 2_CTL04E_499
Done with sample 1_CTL04E_499
Done with sample 3_CTL08A_497
Done with sample 3_CTL04E_497
Done with sample 1_CTL08A_497
Done with sample 1_CTL04E_497
Done with sample 2_CTL08A_497
Done with sample 2_CTL04E_497
Done with sample 3_CTL04E_509
Done with sample 2_CTL04E_509
Done with sample 1_CTL08A_509
Done with sample 1_CTL04E_509
Done with sample 3_CTL08A_509
Done with sample 2_CTL08A_503
Done with sample 1_CTL08A_503
Done with sample 3_CTL04E_503
Done with sample 3_CTL08A_503
Done with sample 1_CTL04E_503
Done with sample 2_CTL04E_503
Done with sample 2_CTL08A_495
Done with sample 1_CTL08A_495
Done with sample 3_CTL04E_495
Done with sample 3_CTL08A_495
Done with sample 2_CTL04E_495
Done with sample 1_CTL04E_495
Done with sample 3_CTL08A_513
Done with sample 3_CTL04E_513
Done with sample 1_CTL04E_513
Done with sample 2_CTL08A_513
Done with sample 2_CTL04E_513
Done with sample 1_CTL08A_513
Done with sample 3_CTL08A_511
Done with sample 3_CTL04E_511
Done with sample 1_CTL08A_511
Done with sample 2_CTL04E_511
Done with sample 1_CTL04E_511
Done with sample 2_CTL08A_511
Done with sample 3_CTL08A_491
Done with sample 1_CTL04E_491
Done with sample 2_CTL08A_491
Done with sample 1_CTL08A_491
Done with sample 2_CTL04E_491
Done with sample 3_CTL04E_491
Done with sample 1_CTL04E_493
Done with sample 1_CTL08A_493
Done with sample 3_CTL04E_493
Done with sample 2_CTL08A_493
Done with sample 2_CTL04E_493
Done with sample 3_CTL08A_493
Done with sample 1_CTL08A_502
Done with sample 2_CTL04E_502
Done with sample 3_CTL08A_502
Done with sample 3_CTL04E_502
Done with sample 2_CTL08A_502
Done with sample 1_CTL04E_502
Done with sample 3_CTL04E_500
Done with sample 2_CTL04E_500
Done with sample 3_CTL08A_500
Done with sample 1_CTL08A_500
Done with sample 2_CTL08A_500
Done with sample 1_CTL04E_500
Done with sample 3_CTL04E_507
Done with sample 1_CTL08A_507
Done with sample 3_CTL08A_507
Done with sample 2_CTL08A_507
Done with sample 1_CTL04E_507
Done with sample 2_CTL04E_507
Done with sample 1_CTL08A_492
Done with sample 1_CTL04E_492
Done with sample 3_CTL08A_492
Done with sample 2_CTL04E_492
Done with sample 3_CTL04E_492
Done with sample 2_CTL08A_492
Done with sample 2_CTL04E_494
Done with sample 3_CTL08A_494
Done with sample 2_CTL08A_494
Done with sample 3_CTL04E_494
Done with sample 1_CTL04E_494
Done with sample 1_CTL08A_494
Done with sample 1_CTL08A_504
Done with sample 3_CTL04E_504
Done with sample 2_CTL04E_504
Done with sample 3_CTL08A_504
Done with sample 1_CTL04E_504
Done with sample 2_CTL08A_504
Done with sample 3_CTL08A_496
Done with sample 1_CTL08A_496
Done with sample 2_CTL04E_496
Done with sample 1_CTL04E_496
Done with sample 3_CTL04E_496
Done with sample 2_CTL08A_496
Done with sample 1_CTL08A_508
Done with sample 1_CTL04E_508
Done with sample 1_CTL08A_393
Done with sample 1_CTL01A_393
Done with sample 1_CTL04E_393
Done with sample 1_CTL02A_393
Done with sample 1_CTL01A_394
Done with sample 1_CTL04E_394
Done with sample 1_CTL08A_394
Done with sample 1_CTL02A_394


adata = adata[Singlets]
adata

View of AnnData object with n_obs × n_vars = 450811 × 36601
    obs: 'tags', 'cmo', 'condition', 'replicate', 'condition_clean', 'line', 'run', 'sample_id', 'groupCov', 'harmonizedRegion', 'tissue', 'PCW'
    var: 'gene_ids', 'feature_types', 'genome'


# mitochondrial genes, "MT-" for human, "Mt-" for mouse
adata.var["mt"] = adata.var_names.str.startswith("MT-")
# ribosomal genes
adata.var["ribo"] = adata.var_names.str.startswith(("RPS", "RPL"))
# hemoglobin genes
adata.var["hb"] = adata.var_names.str.contains("^HB[^(P)]")

sc.pp.calculate_qc_metrics(
    adata, qc_vars=["mt", "ribo", "hb"], inplace=True, log1p=True
)

/tmp/ipykernel_518427/3813780412.py:2: ImplicitModificationWarning: Trying to modify attribute `.var` of view, initializing view as actual.
  adata.var["mt"] = adata.var_names.str.startswith("MT-")


FilterGroup = "groupCov"
import math
goodBClist = {}

########################################################### RIBO genes

fig, axes = plt.subplots(math.ceil(len(adata.obs[FilterGroup].unique())/4),4, figsize=(25, 1*len(adata.obs[FilterGroup].unique())), sharey=False)
plt.subplots_adjust(hspace=1)
excess_axes_count = len(axes.flatten()) - len(adata.obs[FilterGroup].unique())



var = "pct_counts_ribo"
n = 0
goodBClist[var] = []
for i in adata.obs[FilterGroup].unique():
    AdataTMP = adata[adata.obs[FilterGroup] == i].copy()
    mean =  AdataTMP.obs[var].mean()
    mad = abs((AdataTMP.obs[var] - mean)).sum() / AdataTMP.shape[0]
    max = mean+3*mad
    axes.flatten()[n] = sc.pl.violin(AdataTMP, keys=var, show=False, ax = axes.flatten()[n])
    axes.flatten()[n].axhline(y=max, xmin=0, xmax=1, color="red")
    axes.flatten()[n].title.set_text("Sample {} {}".format(i, var))
    goodBClist[var].extend(AdataTMP.obs_names[AdataTMP.obs[var] <= max].tolist())
    n = n+1
goodBClist[var] = set(goodBClist[var])

for i in range(excess_axes_count):
    fig.delaxes(axes.flatten()[-(i+1)])


########################################################### MITO genes
fig, axes = plt.subplots(math.ceil(len(adata.obs[FilterGroup].unique())/4),4, figsize=(25, 1*len(adata.obs[FilterGroup].unique())), sharey=False)
plt.subplots_adjust(hspace=1)
var = "pct_counts_mt"
goodBClist[var] = []
n = 0
#goodBClist = []
for i in adata.obs[FilterGroup].unique():
    AdataTMP = adata[adata.obs[FilterGroup] == i].copy()
    mean =  AdataTMP.obs[var].mean()
    mad = abs((AdataTMP.obs[var] - mean)).sum() / AdataTMP.shape[0]
    max = mean+3*mad
    axes.flatten()[n] = sc.pl.violin(AdataTMP, keys=var, show=False, ax = axes.flatten()[n])
    axes.flatten()[n].axhline(y=max, xmin=0, xmax=1, color="red")
    axes.flatten()[n].title.set_text("Sample {} {}".format(i, var))
    goodBClist[var].extend(AdataTMP.obs_names[AdataTMP.obs[var] <= max].tolist())
    n = n+1
goodBClist[var] = set(goodBClist[var])




for i in range(excess_axes_count):
    fig.delaxes(axes.flatten()[-(i+1)])


########################################################### HBH genes
fig, axes = plt.subplots(math.ceil(len(adata.obs[FilterGroup].unique())/4),4, figsize=(25, 1*len(adata.obs[FilterGroup].unique())), sharey=False)
plt.subplots_adjust(hspace=1)
adata.obs["log_pct_counts_hb"] = np.log1p(adata.obs["pct_counts_hb"])
var = "log_pct_counts_hb"
n = 0
goodBClist[var] = []
#goodBClist = []
for i in adata.obs[FilterGroup].unique():
    AdataTMP = adata[adata.obs[FilterGroup] == i].copy()
    mean =  AdataTMP.obs[var].mean()
    mad = abs((AdataTMP.obs[var] - mean)).sum() / AdataTMP.shape[0]
    max = mean+3*mad
    axes.flatten()[n] = sc.pl.violin(AdataTMP, keys=var, show=False, ax = axes.flatten()[n])
    axes.flatten()[n].axhline(y=max, xmin=0, xmax=1, color="red")
    axes.flatten()[n].title.set_text("Sample {} {}".format(i, var))
    goodBClist[var].extend(AdataTMP.obs_names[AdataTMP.obs[var] <= max].tolist())
    n = n+1
goodBClist[var] = set(goodBClist[var])


for i in range(excess_axes_count):
    fig.delaxes(axes.flatten()[-(i+1)])


adataPrefilt = adata[list(set.intersection(*list(goodBClist.values())))]
NumberGoodCells = adataPrefilt.shape[0]
GoodCellsRate = adataPrefilt.shape[0]/adata.shape[0]
print("Thresholding like this will results in {} cells kept".format(NumberGoodCells))
print("Which corresponde to {} cells rate".format(round(GoodCellsRate,2)))

Thresholding like this will results in 404517 cells kept
Which corresponde to 0.9 cells rate


# Axctual filtering
adata = adata[adataPrefilt.obs_names]
del adataPrefilt


import math
fig, axes = plt.subplots(math.ceil(len(adata.obs[FilterGroup].unique())/4),4, figsize=(25, 1*len(adata.obs[FilterGroup].unique())), sharey=False)
adata.obs["log_total_counts"] = np.log(adata.obs["total_counts"])
var = "log_total_counts"
excess_axes_count = len(axes.flatten()) - len(adata.obs[FilterGroup].unique())
plt.subplots_adjust(hspace=1)

n = 0
goodBClist = []
for i in adata.obs.sort_values("run")["sample_id"].unique():
    AdataTMP = adata[adata.obs[FilterGroup] == i].copy()
    mean =  AdataTMP.obs[var].mean()
    mad = abs((AdataTMP.obs[var] - mean)).sum() / AdataTMP.shape[0]
    min = mean-2*mad
    axes.flatten()[n] = sc.pl.violin(AdataTMP, keys=var, show=False, ax = axes.flatten()[n])
    axes.flatten()[n].axhline(y=mean, xmin=0, xmax=1, color="blue")
    axes.flatten()[n].axhline(y=min, xmin=0, xmax=1, color="green", linewidth =3)
    axes.flatten()[n].title.set_text("Sample {}".format(i))
    goodBClist.extend(adata.obs_names[(adata.obs[var] >= min) &  (adata.obs[FilterGroup] == i) ].tolist())
    n = n+1
    
for i in range(excess_axes_count):
    fig.delaxes(axes.flatten()[-(i+1)])

/tmp/ipykernel_518427/234311030.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.
  adata.obs["log_total_counts"] = np.log(adata.obs["total_counts"])


NumberGoodCells = len(goodBClist)
GoodCellsRate = NumberGoodCells/adata.shape[0]
print("Thresholding like this will results in {} cells kept".format(NumberGoodCells))
print("Which corresponde to {} cells rate".format(round(GoodCellsRate,2)))

Thresholding like this will results in 378809 cells kept
Which corresponde to 0.94 cells rate


# Axctual filtering
adata = adata[goodBClist]


import math
fig, axes = plt.subplots(math.ceil(len(adata.obs[FilterGroup].unique())/4),4, figsize=(25, 1*len(adata.obs[FilterGroup].unique())), sharey=False)
var = "n_genes_by_counts"
excess_axes_count = len(axes.flatten()) - len(adata.obs[FilterGroup].unique())
plt.subplots_adjust(hspace=1)

n = 0
goodBClist = []
for i in adata.obs.sort_values("run")["sample_id"].unique():
    AdataTMP = adata[adata.obs[FilterGroup] == i].copy()
    mean =  AdataTMP.obs[var].mean()
    mad = abs((AdataTMP.obs[var] - mean)).sum() / AdataTMP.shape[0]
    min = mean-2*mad
    axes.flatten()[n] = sc.pl.violin(AdataTMP, keys=var, show=False, ax = axes.flatten()[n])
    axes.flatten()[n].axhline(y=mean, xmin=0, xmax=1, color="blue")
    axes.flatten()[n].axhline(y=min, xmin=0, xmax=1, color="green", linewidth =3)
    axes.flatten()[n].title.set_text("Sample {}".format(i))
    goodBClist.extend(adata.obs_names[(adata.obs[var] >= min) &  (adata.obs[FilterGroup] == i) ].tolist())
    n = n+1
    
for i in range(excess_axes_count):
    fig.delaxes(axes.flatten()[-(i+1)])


NumberGoodCells = len(goodBClist)
GoodCellsRate = NumberGoodCells/adata.shape[0]
print("Thresholding like this will results in {} cells kept".format(NumberGoodCells))
print("Which corresponde to {} cells rate".format(round(GoodCellsRate,2)))

Thresholding like this will results in 369313 cells kept
Which corresponde to 0.97 cells rate


# Axctual filtering
adata = adata[goodBClist]


adata.obs["study"] = DS

/tmp/ipykernel_518427/670662693.py:1: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.
  adata.obs["study"] = DS


if issparse(adata.X)  == False:
    adata.X = csr_matrix(adata.X)
    print('Converted adata.X to', type(adata.X))


adata.obs["replicate"] = adata.obs["replicate"].astype(str)


adata.obs["condition"] = adata.obs.condition.astype(str).replace({"MTR_393_3GEX":"DMSO","MTR_394_3GEX":"ANDROGEN_AGONIST"})


adata.obs["condition"].unique().tolist()

['DMSO',
 'ANDROGEN_AGONIST',
 'ESTROGEN_AGONIST',
 'ESTROGEN_ANTAGONIST',
 'ANDROGEN_ANTAGONIST',
 'ARYL_HYD_AGONIST',
 'ARYL_HYD_ANTAGONIST',
 'LIVER-X_AGONIST',
 'LIVER-X_ANTAGONIST',
 'GLUCOCORT_AGONIST',
 'GLUCOCORT_ANTAGONIST',
 'THYROID_AGONIST',
 'THYROID_ANTAGONIST',
 '3-PBA',
 'DPHP',
 'BPA',
 'MEP',
 'RET_AGONIST',
 'RET_ANTAGONIST',
 'BPF',
 'MBzP',
 'TCP',
 'MIX']


adata.obs["hormones_substudy"] = np.where(adata.obs["condition"].isin([
    'DMSO', 'ANDROGEN_AGONIST', 'ESTROGEN_AGONIST', 'ESTROGEN_ANTAGONIST',
    'ANDROGEN_ANTAGONIST', 'ARYL_HYD_AGONIST', 'ARYL_HYD_ANTAGONIST',
    'LIVER-X_AGONIST', 'LIVER-X_ANTAGONIST', 'GLUCOCORT_AGONIST',
    'GLUCOCORT_ANTAGONIST', 'THYROID_AGONIST', 'THYROID_ANTAGONIST',
    'RET_AGONIST', 'RET_ANTAGONIST'
]) & (adata.obs["run"] != 393) & (adata.obs["run"] != 394), True,False)

adata.obs["EDCs_substudy"] = np.where(adata.obs["condition"].isin(['DMSO',
 '3-PBA',
 'DPHP',
 'BPA',
 'MEP',
 'BPF',
 'MBzP',
 'TCP',
 'MIX']) & (adata.obs["run"] != 393) & (adata.obs["run"] != 394), True,False)

adata.obs["androgen_substudy"] = np.where(adata.obs["run"].isin([393,394]), True,False)


adata.obs["controls_substudy"] = np.where(adata.obs["condition"].isin(['DMSO']) & (adata.obs["run"] != 393) & (adata.obs["run"] != 394), True,False)


print(adata[adata.obs["hormones_substudy"]])
print(adata[adata.obs["EDCs_substudy"]])
print(adata[adata.obs["androgen_substudy"]])
print(adata[adata.obs["controls_substudy"]])

View of AnnData object with n_obs × n_vars = 216055 × 36601
    obs: 'tags', 'cmo', 'condition', 'replicate', 'condition_clean', 'line', 'run', 'sample_id', 'groupCov', 'harmonizedRegion', 'tissue', 'PCW', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'log_pct_counts_hb', 'log_total_counts', 'study', 'hormones_substudy', 'EDCs_substudy', 'androgen_substudy', 'controls_substudy'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
View of AnnData object with n_obs × n_vars = 118207 × 36601
    obs: 'tags', 'cmo', 'condition', 'replicate', 'condition_clean', 'line', 'run', 'sample_id', 'groupCov', 'harmonizedRegion', 'tissue', 'PCW', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'log_pct_counts_hb', 'log_total_counts', 'study', 'hormones_substudy', 'EDCs_substudy', 'androgen_substudy', 'controls_substudy'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
View of AnnData object with n_obs × n_vars = 47648 × 36601
    obs: 'tags', 'cmo', 'condition', 'replicate', 'condition_clean', 'line', 'run', 'sample_id', 'groupCov', 'harmonizedRegion', 'tissue', 'PCW', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'log_pct_counts_hb', 'log_total_counts', 'study', 'hormones_substudy', 'EDCs_substudy', 'androgen_substudy', 'controls_substudy'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
View of AnnData object with n_obs × n_vars = 12597 × 36601
    obs: 'tags', 'cmo', 'condition', 'replicate', 'condition_clean', 'line', 'run', 'sample_id', 'groupCov', 'harmonizedRegion', 'tissue', 'PCW', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'log_pct_counts_hb', 'log_total_counts', 'study', 'hormones_substudy', 'EDCs_substudy', 'androgen_substudy', 'controls_substudy'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'


# Save GeneList and prebulk
adata.write_h5ad("./adatas/{}.full.h5ad".format(DS))
adata[adata.obs["hormones_substudy"]].write_h5ad("./adatas/3.{}.hormones_substudy.h5ad".format(DS))
adata[adata.obs["EDCs_substudy"]].write_h5ad("./adatas/3.{}.EDCs_substudy.h5ad".format(DS))
adata[adata.obs["androgen_substudy"]].write_h5ad("./adatas/3.{}.androgen_substudy.h5ad".format(DS))
adata[adata.obs["controls_substudy"]].write_h5ad("./adatas/3.{}.controls_substudy.h5ad".format(DS))

line	CTL04E	CTL08A
replicate
1	63080	69595
2	58454	66143
3	64737	73561

replicate	1	2	3
condition
3-PBA	5516	4976	5548
ANDROGEN_AGONIST	5775	4387	5627
ANDROGEN_ANTAGONIST	8005	7002	6906
ARYL_HYD_AGONIST	5856	5258	5615
ARYL_HYD_ANTAGONIST	8193	8478	8845
BPA	5333	4088	4798
BPF	5074	4936	5286
DMSO	5276	5977	6049
DPHP	7290	7285	7773
ESTROGEN_AGONIST	7631	7018	7223
ESTROGEN_ANTAGONIST	6069	6224	7296
GLUCOCORT_AGONIST	4866	4674	5506
GLUCOCORT_ANTAGONIST	6598	4527	5502
LIVER-X_AGONIST	5735	6047	6110
LIVER-X_ANTAGONIST	5869	5941	6942
MBzP	6543	6236	7592
MEP	5279	5038	5096
MIX	4899	6783	6816
RET_ANTAGONIST	6809	2748	6100
TCP	5943	5745	6464
THYROID_AGONIST	5236	4702	4381
THYROID_ANTAGONIST	4880	6527	6823

Import¶

basic prefitler¶

1) Remove doublets¶

2) Other QCs¶

MT and RP¶

Counts¶

n Genes¶

run	393	394	505
line
CTL01A	6163	7543	0
CTL02A	4599	5235	0
CTL04E	8407	10443	8292
CTL08A	6157	8809	9010