In [1]:
import scanpy as sc
import pandas as pd
from matplotlib import pylab
import random
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import os
import itertools
import numpy as np
import random
import anndata as ad
import hashlib
from scipy.sparse import  csr_matrix, issparse
from scipy import sparse

from matplotlib.colors import TwoSlopeNorm

import scanpy.external as sce
import sys
In [2]:
import anndata
In [3]:
anndata.__version__
Out[3]:
'0.10.8'
In [4]:
sc.settings.set_figure_params(dpi=100, facecolor='white', dpi_save=500)
pylab.rcParams['figure.figsize'] = (9, 9)
homeDir = os.getenv("HOME")
sys.path.insert(1, homeDir+"/utils/")
from PlotPCA_components import *
from PurgeAdata import *


DS="Endpoints"
In [5]:
import anndata2ri
import rpy2.rinterface_lib.callbacks
import logging
anndata2ri.activate()
%load_ext rpy2.ipython
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
/tmp/ipykernel_959918/1930539381.py:4: DeprecationWarning: The global conversion available with activate() is deprecated and will be removed in the next major release. Use a local converter.
  anndata2ri.activate()

Import¶

In [6]:
adata = sc.read_h5ad('./adatas/combined_scRNA_seq_data_with_genetic_demux_filtered_consistent.h5ad')
adata
Out[6]:
AnnData object with n_obs × n_vars = 480500 × 36601
    obs: 'tags', 'cmo', 'condition', 'replicate', 'condition_clean', 'line', 'run'
    var: 'gene_ids', 'feature_types', 'genome'
In [7]:
pd.crosstab(adata.obs.condition, adata.obs.run)[[393, 394, 505]]
Out[7]:
run 393 394 505
condition
3-PBA 0 0 0
ANDROGEN_AGONIST 0 0 0
ANDROGEN_ANTAGONIST 0 0 0
ARYL_HYD_AGONIST 0 0 0
ARYL_HYD_ANTAGONIST 0 0 0
BPA 0 0 0
BPF 0 0 0
DMSO 0 0 17302
DPHP 0 0 0
ESTROGEN_AGONIST 0 0 0
ESTROGEN_ANTAGONIST 0 0 0
GLUCOCORT_AGONIST 0 0 0
GLUCOCORT_ANTAGONIST 0 0 0
LIVER-X_AGONIST 0 0 0
LIVER-X_ANTAGONIST 0 0 0
MBzP 0 0 0
MEP 0 0 0
MIX 0 0 0
MTR_393_3GEX 25326 0 0
MTR_394_3GEX 0 32030 0
RET_AGONIST 0 0 0
RET_ANTAGONIST 0 0 0
TCP 0 0 0
THYROID_AGONIST 0 0 0
THYROID_ANTAGONIST 0 0 0
In [8]:
pd.crosstab(adata.obs.line, adata.obs.run)[[393, 394, 505]]
Out[8]:
run 393 394 505
line
CTL01A 6163 7543 0
CTL02A 4599 5235 0
CTL04E 8407 10443 8292
CTL08A 6157 8809 9010
In [9]:
pd.crosstab(adata.obs.replicate, adata.obs.line)
Out[9]:
line CTL04E CTL08A
replicate
1 63080 69595
2 58454 66143
3 64737 73561
In [10]:
pd.crosstab(adata.obs.replicate, adata.obs.condition).T
Out[10]:
replicate 1 2 3
condition
3-PBA 5516 4976 5548
ANDROGEN_AGONIST 5775 4387 5627
ANDROGEN_ANTAGONIST 8005 7002 6906
ARYL_HYD_AGONIST 5856 5258 5615
ARYL_HYD_ANTAGONIST 8193 8478 8845
BPA 5333 4088 4798
BPF 5074 4936 5286
DMSO 5276 5977 6049
DPHP 7290 7285 7773
ESTROGEN_AGONIST 7631 7018 7223
ESTROGEN_ANTAGONIST 6069 6224 7296
GLUCOCORT_AGONIST 4866 4674 5506
GLUCOCORT_ANTAGONIST 6598 4527 5502
LIVER-X_AGONIST 5735 6047 6110
LIVER-X_ANTAGONIST 5869 5941 6942
MBzP 6543 6236 7592
MEP 5279 5038 5096
MIX 4899 6783 6816
RET_ANTAGONIST 6809 2748 6100
TCP 5943 5745 6464
THYROID_AGONIST 5236 4702 4381
THYROID_ANTAGONIST 4880 6527 6823
In [ ]:
# Fill nas replicates

adata.obs["replicate"]  = adata.obs["replicate"].astype(str).replace("nan",1)

adata.obs["sample_id"] = adata.obs.replicate.astype(str) +"_" +adata.obs.line.astype(str)+"_" +adata.obs.run.astype(str)
adata.obs["groupCov"] = adata.obs["sample_id"]
adata.obs["harmonizedRegion"] =  "CBO"
adata.obs["tissue"] =  "CBO"
adata.obs["PCW"] = 50
In [ ]:
adata
Out[ ]:
AnnData object with n_obs × n_vars = 480500 × 36601
    obs: 'tags', 'cmo', 'condition', 'replicate', 'condition_clean', 'line', 'run', 'sample_id', 'groupCov', 'harmonizedRegion', 'tissue', 'PCW'
    var: 'gene_ids', 'feature_types', 'genome'
The Kernel crashed while executing code in the current cell or a previous cell. 

Please review the code in the cell(s) to identify a possible cause of the failure. 

Click <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. 

View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details.

basic prefitler¶

In [ ]:
#min_cells = 3
min_genes = 100

print(adata.shape)
#adata = adata[:,(adata.X > 0 ).sum(axis = 0) > min_cells]
adata = adata[(adata.X > 0 ).sum(axis = 1) > min_genes]
print(adata.shape)
(480500, 36601)

1) Remove doublets¶

In [ ]:
import anndata2ri
import rpy2.rinterface_lib.callbacks
import logging
anndata2ri.activate()
%load_ext rpy2.ipython
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython
/tmp/ipykernel_518427/3313611837.py:4: DeprecationWarning: The global conversion available with activate() is deprecated and will be removed in the next major release. Use a local converter.
  anndata2ri.activate()
In [ ]:
%%R
library(scDblFinder)
set.seed(1)
library(BiocParallel)
[1] "value"
In [ ]:
scDblFinder = rpy2.robjects.packages.importr('scDblFinder')
as_data_frame = rpy2.robjects.r['as.data.frame']



Xtab = pd.crosstab(adata.obs["sample_id"],adata.obs["condition"])
SingletsHash = hashlib.sha256(Xtab.sort_index(axis=1).to_csv(index=True).encode('utf-8')).hexdigest()
hashFile = "./adatas/Singlets.{}.xlsx".format(SingletsHash)
if os.path.isfile(hashFile):
    print(f"Precomputed singlets found, retrieving {SingletsHash}")
    Singlets = pd.read_excel(hashFile)[0].tolist()
else:
    Singlets = []
    for i in adata.obs["groupCov"].unique().tolist():
        adataLocal = adata[adata.obs["groupCov"] == i].copy()
        adataLocal.X = adataLocal.X.astype(np.float32)
        adataLocal.layers["counts"] = adataLocal.X.copy()
        adataLocal = PurgeAdata(adataLocal)
        del adataLocal.obs
        del adataLocal.var
        sce = anndata2ri.py2rpy(adataLocal)
        sce = scDblFinder.scDblFinder(sce)
        Singlets.extend(sce.obs_names[sce.obs["scDblFinder.class"] == "singlet"].tolist())
        print(f"Done with sample {i}")
    pd.Series(Singlets).to_excel(hashFile)
Done with sample 3_CTL08A_501
Done with sample 2_CTL08A_501
Done with sample 3_CTL04E_501
Done with sample 1_CTL04E_501
Done with sample 2_CTL04E_501
Done with sample 1_CTL08A_501
Done with sample 1_CTL08A_506
Done with sample 3_CTL04E_506
Done with sample 2_CTL08A_506
Done with sample 2_CTL04E_506
Done with sample 1_CTL04E_506
Done with sample 3_CTL08A_506
Done with sample 3_CTL08A_505
Done with sample 1_CTL08A_505
Done with sample 3_CTL04E_505
Done with sample 2_CTL04E_505
Done with sample 1_CTL04E_505
Done with sample 2_CTL08A_505
Done with sample 3_CTL04E_510
Done with sample 3_CTL08A_510
Done with sample 1_CTL04E_510
Done with sample 2_CTL08A_510
Done with sample 1_CTL08A_510
Done with sample 2_CTL04E_510
Done with sample 2_CTL04E_512
Done with sample 3_CTL08A_512
Done with sample 3_CTL04E_512
Done with sample 1_CTL08A_512
Done with sample 2_CTL08A_512
Done with sample 1_CTL04E_512
Done with sample 2_CTL08A_498
Done with sample 3_CTL08A_498
Done with sample 2_CTL04E_498
Done with sample 3_CTL04E_498
Done with sample 1_CTL04E_498
Done with sample 1_CTL08A_498
Done with sample 1_CTL08A_499
Done with sample 2_CTL08A_499
Done with sample 3_CTL04E_499
Done with sample 3_CTL08A_499
Done with sample 2_CTL04E_499
Done with sample 1_CTL04E_499
Done with sample 3_CTL08A_497
Done with sample 3_CTL04E_497
Done with sample 1_CTL08A_497
Done with sample 1_CTL04E_497
Done with sample 2_CTL08A_497
Done with sample 2_CTL04E_497
Done with sample 3_CTL04E_509
Done with sample 2_CTL04E_509
Done with sample 1_CTL08A_509
Done with sample 1_CTL04E_509
Done with sample 3_CTL08A_509
Done with sample 2_CTL08A_503
Done with sample 1_CTL08A_503
Done with sample 3_CTL04E_503
Done with sample 3_CTL08A_503
Done with sample 1_CTL04E_503
Done with sample 2_CTL04E_503
Done with sample 2_CTL08A_495
Done with sample 1_CTL08A_495
Done with sample 3_CTL04E_495
Done with sample 3_CTL08A_495
Done with sample 2_CTL04E_495
Done with sample 1_CTL04E_495
Done with sample 3_CTL08A_513
Done with sample 3_CTL04E_513
Done with sample 1_CTL04E_513
Done with sample 2_CTL08A_513
Done with sample 2_CTL04E_513
Done with sample 1_CTL08A_513
Done with sample 3_CTL08A_511
Done with sample 3_CTL04E_511
Done with sample 1_CTL08A_511
Done with sample 2_CTL04E_511
Done with sample 1_CTL04E_511
Done with sample 2_CTL08A_511
Done with sample 3_CTL08A_491
Done with sample 1_CTL04E_491
Done with sample 2_CTL08A_491
Done with sample 1_CTL08A_491
Done with sample 2_CTL04E_491
Done with sample 3_CTL04E_491
Done with sample 1_CTL04E_493
Done with sample 1_CTL08A_493
Done with sample 3_CTL04E_493
Done with sample 2_CTL08A_493
Done with sample 2_CTL04E_493
Done with sample 3_CTL08A_493
Done with sample 1_CTL08A_502
Done with sample 2_CTL04E_502
Done with sample 3_CTL08A_502
Done with sample 3_CTL04E_502
Done with sample 2_CTL08A_502
Done with sample 1_CTL04E_502
Done with sample 3_CTL04E_500
Done with sample 2_CTL04E_500
Done with sample 3_CTL08A_500
Done with sample 1_CTL08A_500
Done with sample 2_CTL08A_500
Done with sample 1_CTL04E_500
Done with sample 3_CTL04E_507
Done with sample 1_CTL08A_507
Done with sample 3_CTL08A_507
Done with sample 2_CTL08A_507
Done with sample 1_CTL04E_507
Done with sample 2_CTL04E_507
Done with sample 1_CTL08A_492
Done with sample 1_CTL04E_492
Done with sample 3_CTL08A_492
Done with sample 2_CTL04E_492
Done with sample 3_CTL04E_492
Done with sample 2_CTL08A_492
Done with sample 2_CTL04E_494
Done with sample 3_CTL08A_494
Done with sample 2_CTL08A_494
Done with sample 3_CTL04E_494
Done with sample 1_CTL04E_494
Done with sample 1_CTL08A_494
Done with sample 1_CTL08A_504
Done with sample 3_CTL04E_504
Done with sample 2_CTL04E_504
Done with sample 3_CTL08A_504
Done with sample 1_CTL04E_504
Done with sample 2_CTL08A_504
Done with sample 3_CTL08A_496
Done with sample 1_CTL08A_496
Done with sample 2_CTL04E_496
Done with sample 1_CTL04E_496
Done with sample 3_CTL04E_496
Done with sample 2_CTL08A_496
Done with sample 1_CTL08A_508
Done with sample 1_CTL04E_508
Done with sample 1_CTL08A_393
Done with sample 1_CTL01A_393
Done with sample 1_CTL04E_393
Done with sample 1_CTL02A_393
Done with sample 1_CTL01A_394
Done with sample 1_CTL04E_394
Done with sample 1_CTL08A_394
Done with sample 1_CTL02A_394
In [21]:
adata = adata[Singlets]
adata
Out[21]:
View of AnnData object with n_obs × n_vars = 450811 × 36601
    obs: 'tags', 'cmo', 'condition', 'replicate', 'condition_clean', 'line', 'run', 'sample_id', 'groupCov', 'harmonizedRegion', 'tissue', 'PCW'
    var: 'gene_ids', 'feature_types', 'genome'

2) Other QCs¶

In [22]:
# mitochondrial genes, "MT-" for human, "Mt-" for mouse
adata.var["mt"] = adata.var_names.str.startswith("MT-")
# ribosomal genes
adata.var["ribo"] = adata.var_names.str.startswith(("RPS", "RPL"))
# hemoglobin genes
adata.var["hb"] = adata.var_names.str.contains("^HB[^(P)]")

sc.pp.calculate_qc_metrics(
    adata, qc_vars=["mt", "ribo", "hb"], inplace=True, log1p=True
)
/tmp/ipykernel_518427/3813780412.py:2: ImplicitModificationWarning: Trying to modify attribute `.var` of view, initializing view as actual.
  adata.var["mt"] = adata.var_names.str.startswith("MT-")

MT and RP¶

In [23]:
FilterGroup = "groupCov"
import math
goodBClist = {}

########################################################### RIBO genes

fig, axes = plt.subplots(math.ceil(len(adata.obs[FilterGroup].unique())/4),4, figsize=(25, 1*len(adata.obs[FilterGroup].unique())), sharey=False)
plt.subplots_adjust(hspace=1)
excess_axes_count = len(axes.flatten()) - len(adata.obs[FilterGroup].unique())



var = "pct_counts_ribo"
n = 0
goodBClist[var] = []
for i in adata.obs[FilterGroup].unique():
    AdataTMP = adata[adata.obs[FilterGroup] == i].copy()
    mean =  AdataTMP.obs[var].mean()
    mad = abs((AdataTMP.obs[var] - mean)).sum() / AdataTMP.shape[0]
    max = mean+3*mad
    axes.flatten()[n] = sc.pl.violin(AdataTMP, keys=var, show=False, ax = axes.flatten()[n])
    axes.flatten()[n].axhline(y=max, xmin=0, xmax=1, color="red")
    axes.flatten()[n].title.set_text("Sample {} {}".format(i, var))
    goodBClist[var].extend(AdataTMP.obs_names[AdataTMP.obs[var] <= max].tolist())
    n = n+1
goodBClist[var] = set(goodBClist[var])

for i in range(excess_axes_count):
    fig.delaxes(axes.flatten()[-(i+1)])


########################################################### MITO genes
fig, axes = plt.subplots(math.ceil(len(adata.obs[FilterGroup].unique())/4),4, figsize=(25, 1*len(adata.obs[FilterGroup].unique())), sharey=False)
plt.subplots_adjust(hspace=1)
var = "pct_counts_mt"
goodBClist[var] = []
n = 0
#goodBClist = []
for i in adata.obs[FilterGroup].unique():
    AdataTMP = adata[adata.obs[FilterGroup] == i].copy()
    mean =  AdataTMP.obs[var].mean()
    mad = abs((AdataTMP.obs[var] - mean)).sum() / AdataTMP.shape[0]
    max = mean+3*mad
    axes.flatten()[n] = sc.pl.violin(AdataTMP, keys=var, show=False, ax = axes.flatten()[n])
    axes.flatten()[n].axhline(y=max, xmin=0, xmax=1, color="red")
    axes.flatten()[n].title.set_text("Sample {} {}".format(i, var))
    goodBClist[var].extend(AdataTMP.obs_names[AdataTMP.obs[var] <= max].tolist())
    n = n+1
goodBClist[var] = set(goodBClist[var])




for i in range(excess_axes_count):
    fig.delaxes(axes.flatten()[-(i+1)])


########################################################### HBH genes
fig, axes = plt.subplots(math.ceil(len(adata.obs[FilterGroup].unique())/4),4, figsize=(25, 1*len(adata.obs[FilterGroup].unique())), sharey=False)
plt.subplots_adjust(hspace=1)
adata.obs["log_pct_counts_hb"] = np.log1p(adata.obs["pct_counts_hb"])
var = "log_pct_counts_hb"
n = 0
goodBClist[var] = []
#goodBClist = []
for i in adata.obs[FilterGroup].unique():
    AdataTMP = adata[adata.obs[FilterGroup] == i].copy()
    mean =  AdataTMP.obs[var].mean()
    mad = abs((AdataTMP.obs[var] - mean)).sum() / AdataTMP.shape[0]
    max = mean+3*mad
    axes.flatten()[n] = sc.pl.violin(AdataTMP, keys=var, show=False, ax = axes.flatten()[n])
    axes.flatten()[n].axhline(y=max, xmin=0, xmax=1, color="red")
    axes.flatten()[n].title.set_text("Sample {} {}".format(i, var))
    goodBClist[var].extend(AdataTMP.obs_names[AdataTMP.obs[var] <= max].tolist())
    n = n+1
goodBClist[var] = set(goodBClist[var])


for i in range(excess_axes_count):
    fig.delaxes(axes.flatten()[-(i+1)])
In [24]:
adataPrefilt = adata[list(set.intersection(*list(goodBClist.values())))]
NumberGoodCells = adataPrefilt.shape[0]
GoodCellsRate = adataPrefilt.shape[0]/adata.shape[0]
print("Thresholding like this will results in {} cells kept".format(NumberGoodCells))
print("Which corresponde to {} cells rate".format(round(GoodCellsRate,2)))
Thresholding like this will results in 404517 cells kept
Which corresponde to 0.9 cells rate
In [25]:
# Axctual filtering
adata = adata[adataPrefilt.obs_names]
del adataPrefilt

Counts¶

In [26]:
import math
fig, axes = plt.subplots(math.ceil(len(adata.obs[FilterGroup].unique())/4),4, figsize=(25, 1*len(adata.obs[FilterGroup].unique())), sharey=False)
adata.obs["log_total_counts"] = np.log(adata.obs["total_counts"])
var = "log_total_counts"
excess_axes_count = len(axes.flatten()) - len(adata.obs[FilterGroup].unique())
plt.subplots_adjust(hspace=1)

n = 0
goodBClist = []
for i in adata.obs.sort_values("run")["sample_id"].unique():
    AdataTMP = adata[adata.obs[FilterGroup] == i].copy()
    mean =  AdataTMP.obs[var].mean()
    mad = abs((AdataTMP.obs[var] - mean)).sum() / AdataTMP.shape[0]
    min = mean-2*mad
    axes.flatten()[n] = sc.pl.violin(AdataTMP, keys=var, show=False, ax = axes.flatten()[n])
    axes.flatten()[n].axhline(y=mean, xmin=0, xmax=1, color="blue")
    axes.flatten()[n].axhline(y=min, xmin=0, xmax=1, color="green", linewidth =3)
    axes.flatten()[n].title.set_text("Sample {}".format(i))
    goodBClist.extend(adata.obs_names[(adata.obs[var] >= min) &  (adata.obs[FilterGroup] == i) ].tolist())
    n = n+1
    
for i in range(excess_axes_count):
    fig.delaxes(axes.flatten()[-(i+1)])
/tmp/ipykernel_518427/234311030.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.
  adata.obs["log_total_counts"] = np.log(adata.obs["total_counts"])
In [27]:
NumberGoodCells = len(goodBClist)
GoodCellsRate = NumberGoodCells/adata.shape[0]
print("Thresholding like this will results in {} cells kept".format(NumberGoodCells))
print("Which corresponde to {} cells rate".format(round(GoodCellsRate,2)))
Thresholding like this will results in 378809 cells kept
Which corresponde to 0.94 cells rate
In [28]:
# Axctual filtering
adata = adata[goodBClist]

n Genes¶

In [29]:
import math
fig, axes = plt.subplots(math.ceil(len(adata.obs[FilterGroup].unique())/4),4, figsize=(25, 1*len(adata.obs[FilterGroup].unique())), sharey=False)
var = "n_genes_by_counts"
excess_axes_count = len(axes.flatten()) - len(adata.obs[FilterGroup].unique())
plt.subplots_adjust(hspace=1)

n = 0
goodBClist = []
for i in adata.obs.sort_values("run")["sample_id"].unique():
    AdataTMP = adata[adata.obs[FilterGroup] == i].copy()
    mean =  AdataTMP.obs[var].mean()
    mad = abs((AdataTMP.obs[var] - mean)).sum() / AdataTMP.shape[0]
    min = mean-2*mad
    axes.flatten()[n] = sc.pl.violin(AdataTMP, keys=var, show=False, ax = axes.flatten()[n])
    axes.flatten()[n].axhline(y=mean, xmin=0, xmax=1, color="blue")
    axes.flatten()[n].axhline(y=min, xmin=0, xmax=1, color="green", linewidth =3)
    axes.flatten()[n].title.set_text("Sample {}".format(i))
    goodBClist.extend(adata.obs_names[(adata.obs[var] >= min) &  (adata.obs[FilterGroup] == i) ].tolist())
    n = n+1
    
for i in range(excess_axes_count):
    fig.delaxes(axes.flatten()[-(i+1)])
In [30]:
NumberGoodCells = len(goodBClist)
GoodCellsRate = NumberGoodCells/adata.shape[0]
print("Thresholding like this will results in {} cells kept".format(NumberGoodCells))
print("Which corresponde to {} cells rate".format(round(GoodCellsRate,2)))
Thresholding like this will results in 369313 cells kept
Which corresponde to 0.97 cells rate
In [31]:
# Axctual filtering
adata = adata[goodBClist]
In [32]:
adata.obs["study"] = DS
/tmp/ipykernel_518427/670662693.py:1: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.
  adata.obs["study"] = DS
In [33]:
if issparse(adata.X)  == False:
    adata.X = csr_matrix(adata.X)
    print('Converted adata.X to', type(adata.X))
In [34]:
adata.obs["replicate"] = adata.obs["replicate"].astype(str)
In [35]:
adata.obs["condition"] = adata.obs.condition.astype(str).replace({"MTR_393_3GEX":"DMSO","MTR_394_3GEX":"ANDROGEN_AGONIST"})
In [36]:
adata.obs["condition"].unique().tolist()
Out[36]:
['DMSO',
 'ANDROGEN_AGONIST',
 'ESTROGEN_AGONIST',
 'ESTROGEN_ANTAGONIST',
 'ANDROGEN_ANTAGONIST',
 'ARYL_HYD_AGONIST',
 'ARYL_HYD_ANTAGONIST',
 'LIVER-X_AGONIST',
 'LIVER-X_ANTAGONIST',
 'GLUCOCORT_AGONIST',
 'GLUCOCORT_ANTAGONIST',
 'THYROID_AGONIST',
 'THYROID_ANTAGONIST',
 '3-PBA',
 'DPHP',
 'BPA',
 'MEP',
 'RET_AGONIST',
 'RET_ANTAGONIST',
 'BPF',
 'MBzP',
 'TCP',
 'MIX']
In [41]:
adata.obs["hormones_substudy"] = np.where(adata.obs["condition"].isin([
    'DMSO', 'ANDROGEN_AGONIST', 'ESTROGEN_AGONIST', 'ESTROGEN_ANTAGONIST',
    'ANDROGEN_ANTAGONIST', 'ARYL_HYD_AGONIST', 'ARYL_HYD_ANTAGONIST',
    'LIVER-X_AGONIST', 'LIVER-X_ANTAGONIST', 'GLUCOCORT_AGONIST',
    'GLUCOCORT_ANTAGONIST', 'THYROID_AGONIST', 'THYROID_ANTAGONIST',
    'RET_AGONIST', 'RET_ANTAGONIST'
]) & (adata.obs["run"] != 393) & (adata.obs["run"] != 394), True,False)

adata.obs["EDCs_substudy"] = np.where(adata.obs["condition"].isin(['DMSO',
 '3-PBA',
 'DPHP',
 'BPA',
 'MEP',
 'BPF',
 'MBzP',
 'TCP',
 'MIX']) & (adata.obs["run"] != 393) & (adata.obs["run"] != 394), True,False)

adata.obs["androgen_substudy"] = np.where(adata.obs["run"].isin([393,394]), True,False)


adata.obs["controls_substudy"] = np.where(adata.obs["condition"].isin(['DMSO']) & (adata.obs["run"] != 393) & (adata.obs["run"] != 394), True,False)
In [42]:
print(adata[adata.obs["hormones_substudy"]])
print(adata[adata.obs["EDCs_substudy"]])
print(adata[adata.obs["androgen_substudy"]])
print(adata[adata.obs["controls_substudy"]])
View of AnnData object with n_obs × n_vars = 216055 × 36601
    obs: 'tags', 'cmo', 'condition', 'replicate', 'condition_clean', 'line', 'run', 'sample_id', 'groupCov', 'harmonizedRegion', 'tissue', 'PCW', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'log_pct_counts_hb', 'log_total_counts', 'study', 'hormones_substudy', 'EDCs_substudy', 'androgen_substudy', 'controls_substudy'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
View of AnnData object with n_obs × n_vars = 118207 × 36601
    obs: 'tags', 'cmo', 'condition', 'replicate', 'condition_clean', 'line', 'run', 'sample_id', 'groupCov', 'harmonizedRegion', 'tissue', 'PCW', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'log_pct_counts_hb', 'log_total_counts', 'study', 'hormones_substudy', 'EDCs_substudy', 'androgen_substudy', 'controls_substudy'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
View of AnnData object with n_obs × n_vars = 47648 × 36601
    obs: 'tags', 'cmo', 'condition', 'replicate', 'condition_clean', 'line', 'run', 'sample_id', 'groupCov', 'harmonizedRegion', 'tissue', 'PCW', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'log_pct_counts_hb', 'log_total_counts', 'study', 'hormones_substudy', 'EDCs_substudy', 'androgen_substudy', 'controls_substudy'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
View of AnnData object with n_obs × n_vars = 12597 × 36601
    obs: 'tags', 'cmo', 'condition', 'replicate', 'condition_clean', 'line', 'run', 'sample_id', 'groupCov', 'harmonizedRegion', 'tissue', 'PCW', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'log_pct_counts_hb', 'log_total_counts', 'study', 'hormones_substudy', 'EDCs_substudy', 'androgen_substudy', 'controls_substudy'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
In [43]:
# Save GeneList and prebulk
adata.write_h5ad("./adatas/{}.full.h5ad".format(DS))
adata[adata.obs["hormones_substudy"]].write_h5ad("./adatas/3.{}.hormones_substudy.h5ad".format(DS))
adata[adata.obs["EDCs_substudy"]].write_h5ad("./adatas/3.{}.EDCs_substudy.h5ad".format(DS))
adata[adata.obs["androgen_substudy"]].write_h5ad("./adatas/3.{}.androgen_substudy.h5ad".format(DS))
adata[adata.obs["controls_substudy"]].write_h5ad("./adatas/3.{}.controls_substudy.h5ad".format(DS))