import scanpy as sc
import pandas as pd
from matplotlib import pylab
import random
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import os
import itertools
import numpy as np
import random
import anndata as ad
from scipy.sparse import csr_matrix, issparse
from scipy import sparse
from matplotlib.colors import TwoSlopeNorm
import scanpy.external as sce
import sys
import anndata
anndata.__version__
'0.10.8'
sc.settings.set_figure_params(dpi=100, facecolor='white', dpi_save=500)
pylab.rcParams['figure.figsize'] = (9, 9)
homeDir = os.getenv("HOME")
sys.path.insert(1, homeDir+"/utils/")
from PlotPCA_components import *
from PurgeAdata import *
DS="androgen_substudy"
import anndata2ri
import rpy2.rinterface_lib.callbacks
import logging
anndata2ri.activate()
%load_ext rpy2.ipython
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
The rpy2.ipython extension is already loaded. To reload it, use: %reload_ext rpy2.ipython
/tmp/ipykernel_546255/1930539381.py:4: DeprecationWarning: The global conversion available with activate() is deprecated and will be removed in the next major release. Use a local converter. anndata2ri.activate()
random.seed(1)
adata = sc.read_h5ad("./adatas/3.Endpoints.{}.h5ad".format(DS))
adata
AnnData object with n_obs × n_vars = 47648 × 36601 obs: 'tags', 'cmo', 'condition', 'replicate', 'condition_clean', 'line', 'run', 'sample_id', 'groupCov', 'harmonizedRegion', 'tissue', 'PCW', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'log_pct_counts_hb', 'log_total_counts', 'study', 'hormones_substudy', 'EDCs_substudy', 'androgen_substudy', 'controls_substudy' var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
adata = adata[adata.obs[DS]]
adata
View of AnnData object with n_obs × n_vars = 47648 × 36601 obs: 'tags', 'cmo', 'condition', 'replicate', 'condition_clean', 'line', 'run', 'sample_id', 'groupCov', 'harmonizedRegion', 'tissue', 'PCW', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'log_pct_counts_hb', 'log_total_counts', 'study', 'hormones_substudy', 'EDCs_substudy', 'androgen_substudy', 'controls_substudy' var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
adata.layers["counts"] = adata.X.copy()
/tmp/ipykernel_546255/1517723426.py:1: ImplicitModificationWarning: Setting element `.layers['counts']` of view, initializing view as actual. adata.layers["counts"] = adata.X.copy()
from goatools.go_search import GoSearch
from goatools.anno.genetogo_reader import Gene2GoReader
from goatools.cli.ncbi_gene_results_to_python import ncbi_tsv_to_py
from goatools.base import download_ncbi_associations
gene2go = download_ncbi_associations()
objanno = Gene2GoReader("gene2go", taxids=[9606])
go2geneids_human = objanno.get_id2gos(namespace='BP', go2geneids=True)
srchhelp = GoSearch("./resources/go-basic.obo", go2items=go2geneids_human)
ncbi_tsv = '/group/testa/Users/davide.castaldi/ENDPOINTS_sc/resources/gene_result.txt'
output_py = 'genes_ncbi_9606_proteincoding.py'
ncbi_tsv_to_py(ncbi_tsv, output_py)
from goatools.associations import read_ncbi_gene2go
gene2go = read_ncbi_gene2go("./resources/gene2go", taxids=[9606]) # Taxonomy ID for Homo sapiens
from collections import defaultdict
# Specify the GO term of interest
RelevantOntologies = {"GO:0034976":"ER_stress",
"GO:0001666":"Hypoxia",
"GO:0097300":"programmed_necrotic_cell_death",
"GO:0062098":"regulation_of_programmed_necrotic_cell_death",
"GO:0097385":"programmed_necrotic_cell_death_in_response_to_starvation",
"GO:0062100":"positive_regulation_of_programmed_necrotic_cell_death",
"GO:0062099":"negative_regulation_of_programmed_necrotic_cell_death",
"GO:1902445":"regulation_of_mitochondrial_membrane_permeability_involved_in_programmed_necrotic_cell_death",
"GO:0070266":"necroptotic_process"}
go_to_genes = defaultdict(set)
RelevantGO_dict = defaultdict(set)
for gene_id, go_terms in gene2go.items():
for go_term in go_terms:
if go_term in list(RelevantOntologies.keys()):
RelevantGO_dict[go_term].add(gene_id)
EXISTS: gene2go HMS:0:01:26.628789 362,876 annotations, 20,818 genes, 18,767 GOs, 1 taxids READ: gene2go 12230 IDs in loaded association branch, BP ./resources/go-basic.obo: fmt(1.2) rel(2024-10-27) 44,017 Terms; optional_attrs(comment def relationship synonym xref) 20,614 lines READ: /group/testa/Users/davide.castaldi/ENDPOINTS_sc/resources/gene_result.txt 20,595 geneids WROTE: genes_ncbi_9606_proteincoding.py DEPRECATED read_ncbi_gene2go: USE Gene2GoReader FROM goatools.anno.genetogo_reader DEPRECATED read_ncbi_gene2go CALLED FROM: /tmp/ipykernel_546255/1653780176.py BY <module> HMS:0:01:22.133481 362,876 annotations, 20,818 genes, 18,767 GOs, 1 taxids READ: ./resources/gene2go 18744 IDs in loaded association branch, BP
from genes_ncbi_9606_proteincoding import GENEID2NT
RelevantGO_dict_Symbols = {}
for ontology in list(RelevantGO_dict.keys()):
RelevantGO_dict_Symbols[RelevantOntologies[ontology]] = []
genes = RelevantGO_dict[ontology]
print("Genes associated with GO term {} ({}) in humans:".format(ontology,RelevantOntologies[ontology]))
for gene_id in genes:
if GENEID2NT.get(gene_id, None):
Symbol = GENEID2NT.get(gene_id, None).Symbol
RelevantGO_dict_Symbols[RelevantOntologies[ontology]].append(Symbol)
if len(RelevantGO_dict_Symbols[RelevantOntologies[ontology]]) < 5:
RelevantGO_dict_Symbols.pop(RelevantOntologies[ontology], None)
print("{} having too few annotated genes, it will be removed".format(RelevantOntologies[ontology]))
else:
print(RelevantGO_dict_Symbols[RelevantOntologies[ontology]])
print("\n")
Genes associated with GO term GO:0001666 (Hypoxia) in humans: ['SOX4', 'CYP1A1', 'EPO', 'HSP90B1', 'ABAT', 'HIF1A', 'ERCC2', 'ERCC3', 'HK2', 'PSEN2', 'TM9SF4', 'CD24', 'MB', 'PENK', 'SRF', 'CYGB', 'FUNDC1', 'ITGA2', 'HMOX2', 'ACVRL1', 'ADA', 'ACE', 'PLEKHN1', 'ALKBH5', 'F7', 'MECP2', 'PGF', 'CHRNA4', 'CHRNA7', 'CHRNB2', 'RYR1', 'RYR2', 'ITPR1', 'ITPR2', 'FZD4', 'CITED2', 'ADM', 'ADORA1', 'PTK2B', 'BMP2', 'LONP1', 'BNIP3', 'ADIPOQ', 'NF1', 'KCNA5', 'ADSL', 'SOD2', 'CXCR4', 'SOD3', 'AGER', 'KCNJ8', 'PIN1', 'KCNJ11', 'UCP2', 'UCP3', 'ARNT2', 'PKLR', 'KCNMA1', 'DIO3', 'PLAT', 'PLAU', 'ALAD', 'ALAS2', 'ADAM17', 'SCFD1', 'MMP2', 'HSD11B2', 'USF1', 'MMP14', 'TXN2', 'VEGFD', 'PLOD1', 'PLOD2', 'NOS1', 'NOS2', 'CXCL12', 'VCAM1', 'PML', 'RAD21', 'VEGFA', 'VEGFB', 'CA9', 'VEGFC', 'AGTRAP', 'DPP4', 'HSPG2', 'DDIT4', 'EGLN2', 'EGLN3', 'NPPC', 'DRD2', 'ANG', 'SLC11A2', 'ANGPT2', 'COMT', 'LIMD1', 'NOL3', 'SLC2A8', 'NGB', 'FOSL2', 'EGLN1', 'CAPN2', 'APAF1', 'NR4A2', 'CASP3', 'WDR83', 'BIRC2', 'CASP9', 'ABCC9', 'CAT', 'CLDN3', 'CAV1', 'PPARA', 'CAV3', 'REST', 'XRCC1', 'CPT1A', 'CBFA2T3', 'TEK', 'CREBBP', 'TFAM', 'LEP', 'SLC2A1', 'EDNRA', 'TFRC', 'CRYAB', 'TGFB2', 'SLC6A4', 'TGFB3', 'POSTN', 'LIF', 'TGFBR3', 'TH', 'THBS1', 'ARNT', 'MT3', 'SCAP', 'MT-CO1', 'MT-CO2', 'P2RX3', 'PDLIM1', 'EGR1', 'MT-CYB', 'WTIP', 'P2RX2', 'MTHFR', 'CIAO3', 'ASCL2', 'LOXL2', 'MT-ND1', 'CD38', 'MT-ND2', 'MT-ND4', 'PRKAA1', 'MT-ND5', 'TLR2', 'ANGPTL4', 'TRPV4', 'LTA', 'MYOCD', 'TNF', 'ATM', 'IL1A', 'AJUBA', 'ENG', 'BECN1', 'EP300', 'EPAS1', 'SMAD3', 'SMAD4', 'MYB'] Genes associated with GO term GO:0034976 (ER_stress) in humans: ['UBA5', 'NHLRC1', 'HSP90B1', 'MBTPS1', 'TRAF2', 'ABL1', 'CEBPB', 'ERP44', 'ERN1', 'CFTR', 'TTC23L', 'BBS1', 'NCK1', 'QRICH1', 'TNFRSF10B', 'DNAJB9', 'PPP1R15A', 'ANKS4B', 'DDIT3', 'USP19', 'MAP3K5', 'NOD2', 'TMX1', 'CREBRF', 'JUN', 'NOD1', 'DNAJC10', 'MBTPS2', 'ERP27', 'TMTC3', 'TMEM259', 'SCAMP5', 'PIK3R1', 'PIK3R2', 'KCNJ8', 'EIF2B5', 'SEC16A', 'PDIA2', 'OS9', 'SRPX', 'CREB3L3', 'SGF29', 'TMEM258', 'EIF2AK3', 'HSPA5', 'ALOX15', 'CREB3L2', 'NIBAN1', 'UBQLN1', 'HYOU1', 'NRBF2', 'BCL2L11', 'RASGRF1', 'RASGRF2', 'WFS1', 'ERO1A', 'UFC1', 'XBP1', 'UFL1', 'RNF183', 'ERN2', 'CERT1', 'PDIA3', 'ATF6B', 'UFM1', 'CREB3L1', 'TMEM33', 'BBS10', 'PDIA4', 'TARDBP', 'PPP2CB', 'ATF6', 'THBS1', 'PDIA6', 'FICD', 'THBS4', 'GORASP2', 'BCAP31', 'CDK5RAP3', 'TRIB3', 'P4HB', 'EIF2S1', 'PPP1R15B', 'DDRGK1', 'ELAVL4', 'PRKN', 'ATF3', 'ATF4', 'FLOT1', 'ATP2A1', 'ATP2A2', 'BBC3', 'HERPUD1', 'MARCKS', 'AFF4', 'CXCL8'] Genes associated with GO term GO:0070266 (necroptotic_process) in humans: ['RIPK1', 'SPATA2', 'FASLG', 'CYLD', 'TRPM7', 'PPIF', 'BIRC2', 'MLKL', 'PYGL', 'PGAM5', 'TP53', 'IPMK', 'ITPK1', 'RIPK3', 'NLRP6'] Genes associated with GO term GO:0062098 (regulation_of_programmed_necrotic_cell_death) in humans: regulation_of_programmed_necrotic_cell_death having too few annotated genes, it will be removed Genes associated with GO term GO:1902445 (regulation_of_mitochondrial_membrane_permeability_involved_in_programmed_necrotic_cell_death) in humans: regulation_of_mitochondrial_membrane_permeability_involved_in_programmed_necrotic_cell_death having too few annotated genes, it will be removed Genes associated with GO term GO:0062100 (positive_regulation_of_programmed_necrotic_cell_death) in humans: positive_regulation_of_programmed_necrotic_cell_death having too few annotated genes, it will be removed Genes associated with GO term GO:0097300 (programmed_necrotic_cell_death) in humans: ['RIPK1', 'ATG9A', 'IRF3', 'NINJ1', 'TRAF2', 'ATG9B', 'MAP3K5', 'RIPK3'] Genes associated with GO term GO:0062099 (negative_regulation_of_programmed_necrotic_cell_death) in humans: negative_regulation_of_programmed_necrotic_cell_death having too few annotated genes, it will be removed
# Quick prep of data
sc.pp.filter_genes(adata,min_cells=1)
sc.pp.normalize_total(adata, 20000)
sc.pp.log1p(adata)
sc.pp.scale(adata)
/tmp/ipykernel_546255/3224423605.py:3: FutureWarning: The specified parameters ('target_sum',) are no longer positional. Please specify them like `target_sum=20000` sc.pp.normalize_total(adata, 20000)
# Since score genes is not consistent even setting the same seed we hash-store the result after running it for consistency
import hashlib
for k in list(RelevantGO_dict_Symbols.keys()):
CommonGenes = [i for i in RelevantGO_dict_Symbols[k] if i in adata.var_names]
GOscoreHash = hashlib.sha256(pd.DataFrame(np.dot(adata[:,CommonGenes].X.T, pd.get_dummies(adata.obs["condition"])).sum(axis = 0)).to_csv(index=False).encode('utf-8')).hexdigest()
hashFile = "./adatas/Score_{}.{}.{}.xlsx".format(k,DS,GOscoreHash)
if os.path.isfile(hashFile):
print(f"Precomputed score found, retrieving {hashFile}")
Score = pd.read_excel(hashFile, index_col=0)
else:
print(f"Computing score for {k}")
Score = sc.tl.score_genes(adata, gene_list=CommonGenes, score_name="ScoreSignature_{}".format(k),random_state=1,copy=True).obs["ScoreSignature_{}".format(k)]
Score.to_excel(hashFile)
adata.obs["ScoreSignature_{}".format(k)] = Score
Computing score for Hypoxia Computing score for ER_stress Computing score for necroptotic_process Computing score for programmed_necrotic_cell_death
adata.X = adata.layers["counts"].copy()
import seaborn as sns
import matplotlib.pyplot as plt
# Set the theme
sns.set_theme(style="whitegrid")
# Identify the signatures
signatures = [s for s in adata.obs.columns if "ScoreSignature_" in s]
# Set up subplots
n_cols = 2 # Number of columns
n_rows = len(signatures) // n_cols + (len(signatures) % n_cols > 0) # Number of rows needed
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5 * n_rows), constrained_layout=True)
# Flatten the axes for easier indexing
axes = axes.flatten()
# Plot each signature
for n, signature in enumerate(signatures):
sns.violinplot(
data=adata.obs,inner=None,
x="condition", y=signature, hue="replicate", palette="dark", alpha=.6, ax=axes[n]
)
axes[n].set_title(signature)
axes[n].set_ylabel("Score") # Set y-axis label
axes[n].tick_params(axis="x", rotation=90) # Rotate x-axis tick labels
# Remove unused axes
for i in range(len(signatures), len(axes)):
fig.delaxes(axes[i])
# Show the plot
plt.show()
adata
AnnData object with n_obs × n_vars = 47648 × 33025 obs: 'tags', 'cmo', 'condition', 'replicate', 'condition_clean', 'line', 'run', 'sample_id', 'groupCov', 'harmonizedRegion', 'tissue', 'PCW', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'log_pct_counts_hb', 'log_total_counts', 'study', 'hormones_substudy', 'EDCs_substudy', 'androgen_substudy', 'controls_substudy', 'ScoreSignature_Hypoxia', 'ScoreSignature_ER_stress', 'ScoreSignature_necroptotic_process', 'ScoreSignature_programmed_necrotic_cell_death' var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells', 'mean', 'std' uns: 'log1p' layers: 'counts'
adata = adata[(adata.obs["ScoreSignature_Hypoxia"] < 0.4) &
(adata.obs["ScoreSignature_ER_stress"] < 0.5) &
(adata.obs["ScoreSignature_necroptotic_process"] < 5) &
(adata.obs["ScoreSignature_programmed_necrotic_cell_death"] < 5)].copy()
adata
AnnData object with n_obs × n_vars = 47245 × 33025 obs: 'tags', 'cmo', 'condition', 'replicate', 'condition_clean', 'line', 'run', 'sample_id', 'groupCov', 'harmonizedRegion', 'tissue', 'PCW', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'log_pct_counts_hb', 'log_total_counts', 'study', 'hormones_substudy', 'EDCs_substudy', 'androgen_substudy', 'controls_substudy', 'ScoreSignature_Hypoxia', 'ScoreSignature_ER_stress', 'ScoreSignature_necroptotic_process', 'ScoreSignature_programmed_necrotic_cell_death' var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells', 'mean', 'std' uns: 'log1p' layers: 'counts'
adata = adata[(adata.obs["ScoreSignature_Hypoxia"] < 0.4) &
(adata.obs["ScoreSignature_ER_stress"] < 0.5) &
(adata.obs["ScoreSignature_necroptotic_process"] < 5) &
(adata.obs["ScoreSignature_programmed_necrotic_cell_death"] < 5)].copy()
adata
AnnData object with n_obs × n_vars = 47245 × 33025 obs: 'tags', 'cmo', 'condition', 'replicate', 'condition_clean', 'line', 'run', 'sample_id', 'groupCov', 'harmonizedRegion', 'tissue', 'PCW', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'log_pct_counts_hb', 'log_total_counts', 'study', 'hormones_substudy', 'EDCs_substudy', 'androgen_substudy', 'controls_substudy', 'ScoreSignature_Hypoxia', 'ScoreSignature_ER_stress', 'ScoreSignature_necroptotic_process', 'ScoreSignature_programmed_necrotic_cell_death' var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells', 'mean', 'std' uns: 'log1p' layers: 'counts'
import seaborn as sns
import matplotlib.pyplot as plt
# Set the theme
sns.set_theme(style="whitegrid")
# Identify the signatures
signatures = [s for s in adata.obs.columns if "ScoreSignature_" in s]
# Set up subplots
n_cols = 2 # Number of columns
n_rows = len(signatures) // n_cols + (len(signatures) % n_cols > 0) # Number of rows needed
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5 * n_rows), constrained_layout=True)
# Flatten the axes for easier indexing
axes = axes.flatten()
# Plot each signature
for n, signature in enumerate(signatures):
sns.violinplot(
data=adata.obs,inner=None,
x="condition", y=signature, hue="replicate", palette="dark", alpha=.6, ax=axes[n]
)
axes[n].set_title(signature)
axes[n].set_ylabel("Score") # Set y-axis label
axes[n].tick_params(axis="x", rotation=90) # Rotate x-axis tick labels
# Remove unused axes
for i in range(len(signatures), len(axes)):
fig.delaxes(axes[i])
# Show the plot
plt.show()
adata
AnnData object with n_obs × n_vars = 47245 × 33025 obs: 'tags', 'cmo', 'condition', 'replicate', 'condition_clean', 'line', 'run', 'sample_id', 'groupCov', 'harmonizedRegion', 'tissue', 'PCW', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'log_pct_counts_hb', 'log_total_counts', 'study', 'hormones_substudy', 'EDCs_substudy', 'androgen_substudy', 'controls_substudy', 'ScoreSignature_Hypoxia', 'ScoreSignature_ER_stress', 'ScoreSignature_necroptotic_process', 'ScoreSignature_programmed_necrotic_cell_death' var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells', 'mean', 'std' uns: 'log1p' layers: 'counts'
## It seems that GO:0097300 (necrosis) got sharp bimodal distribution, let's forecast a filtering result on > 0
(adata.obs["ScoreSignature_programmed_necrotic_cell_death"] < 0).sum()
36521
FilterGroup = "condition"
import math
goodBClist = {}
########################################################### RIBO genes
#adata = adata.obs["condition"].isin(["ESTROGEN_ANTAGONIST","RET_AGONIST"])
fig, axes = plt.subplots(math.ceil(len(adata.obs[FilterGroup].unique())/4),4, figsize=(25, 1*len(adata.obs[FilterGroup].unique())), sharey=False)
plt.subplots_adjust(hspace=1)
excess_axes_count = len(axes.flatten()) - len(adata.obs[FilterGroup].unique())
var = "ScoreSignature_Hypoxia"
n = 0
goodBClist[var] = []
for i in adata.obs[FilterGroup].unique():
AdataTMP = adata[adata.obs[FilterGroup] == i].copy()
mean = AdataTMP.obs[var].mean()
mad = abs((AdataTMP.obs[var] - mean)).sum() / AdataTMP.shape[0]
max = mean+3*mad
axes.flatten()[n] = sc.pl.violin(AdataTMP, keys=var, show=False, ax = axes.flatten()[n])
axes.flatten()[n].axhline(y=max, xmin=0, xmax=1, color="red")
axes.flatten()[n].title.set_text("Sample {}".format(i))
goodBClist[var].extend(AdataTMP.obs_names[AdataTMP.obs[var] <= max].tolist())
n = n+1
goodBClist[var] = set(goodBClist[var])
for i in range(excess_axes_count):
fig.delaxes(axes.flatten()[-(i+1)])
########################################################### MITO genes
fig, axes = plt.subplots(math.ceil(len(adata.obs[FilterGroup].unique())/4),4, figsize=(25, 2*len(adata.obs[FilterGroup].unique())), sharey=False)
plt.subplots_adjust(hspace=1)
var = "ScoreSignature_ER_stress"
print("--------------------------------------------------------{}--------------------------------------------------------".format(var))
goodBClist[var] = []
n = 0
#goodBClist = []
for i in adata.obs[FilterGroup].unique():
AdataTMP = adata[adata.obs[FilterGroup] == i].copy()
mean = AdataTMP.obs[var].mean()
mad = abs((AdataTMP.obs[var] - mean)).sum() / AdataTMP.shape[0]
max = mean+3*mad
axes.flatten()[n] = sc.pl.violin(AdataTMP, keys=var, show=False, ax = axes.flatten()[n])
axes.flatten()[n].axhline(y=max, xmin=0, xmax=1, color="red")
axes.flatten()[n].title.set_text("Sample {}".format(i))
goodBClist[var].extend(AdataTMP.obs_names[AdataTMP.obs[var] <= max].tolist())
n = n+1
goodBClist[var] = set(goodBClist[var])
for i in range(excess_axes_count):
fig.delaxes(axes.flatten()[-(i+1)])
plt.show()
print("\n\n")
########################################################### HBH genes
fig, axes = plt.subplots(math.ceil(len(adata.obs[FilterGroup].unique())/4),4, figsize=(25, 2*len(adata.obs[FilterGroup].unique())), sharey=False)
plt.subplots_adjust(hspace=1)
var = "ScoreSignature_necroptotic_process"
print("--------------------------------------------------------{}--------------------------------------------------------".format(var))
n = 0
goodBClist[var] = []
#goodBClist = []
for i in adata.obs[FilterGroup].unique():
AdataTMP = adata[adata.obs[FilterGroup] == i].copy()
mean = AdataTMP.obs[var].mean()
mad = abs((AdataTMP.obs[var] - mean)).sum() / AdataTMP.shape[0]
max = mean+3*mad
axes.flatten()[n] = sc.pl.violin(AdataTMP, keys=var, show=False, ax = axes.flatten()[n])
axes.flatten()[n].axhline(y=max, xmin=0, xmax=1, color="red")
axes.flatten()[n].title.set_text("Sample {}".format(i))
goodBClist[var].extend(AdataTMP.obs_names[AdataTMP.obs[var] <= max].tolist())
n = n+1
goodBClist[var] = set(goodBClist[var])
for i in range(excess_axes_count):
fig.delaxes(axes.flatten()[-(i+1)])
plt.show()
print("\n\n")
########################################################### HBH genes
fig, axes = plt.subplots(math.ceil(len(adata.obs[FilterGroup].unique())/4),4, figsize=(25, 2*len(adata.obs[FilterGroup].unique())), sharey=False)
plt.subplots_adjust(hspace=1)
var = "ScoreSignature_programmed_necrotic_cell_death"
print("--------------------------------------------------------{}--------------------------------------------------------".format(var))
n = 0
goodBClist[var] = []
#goodBClist = []
for i in adata.obs[FilterGroup].unique():
AdataTMP = adata[adata.obs[FilterGroup] == i].copy()
mean = AdataTMP.obs[var].mean()
mad = abs((AdataTMP.obs[var] - mean)).sum() / AdataTMP.shape[0]
max = mean+.1*mad
axes.flatten()[n] = sc.pl.violin(AdataTMP, keys=var, show=False, ax = axes.flatten()[n])
axes.flatten()[n].axhline(y=max, xmin=0, xmax=1, color="red")
axes.flatten()[n].title.set_text("Sample {}".format(i))
goodBClist[var].extend(AdataTMP.obs_names[AdataTMP.obs[var] <= max].tolist())
n = n+1
goodBClist[var] = set(goodBClist[var])
for i in range(excess_axes_count):
fig.delaxes(axes.flatten()[-(i+1)])
plt.show()
print("\n\n")
--------------------------------------------------------ScoreSignature_ER_stress--------------------------------------------------------
--------------------------------------------------------ScoreSignature_necroptotic_process--------------------------------------------------------
--------------------------------------------------------ScoreSignature_programmed_necrotic_cell_death--------------------------------------------------------
adataPrefilt = adata[list(set.intersection(*list(goodBClist.values())))]
NumberGoodCells = adataPrefilt.shape[0]
GoodCellsRate = adataPrefilt.shape[0]/adata.shape[0]
print("Thresholding like this will results in {} cells kept".format(NumberGoodCells))
print("Which corresponds to {} cells rate".format(round(GoodCellsRate,2)))
Thresholding like this will results in 34621 cells kept Which corresponds to 0.73 cells rate
for i in list(goodBClist.keys()):
adata.obs["PassedQCfilt_{}".format(i)] = adata.obs_names.isin(goodBClist[i])
adata.obs[[i for i in adata.obs.columns if "PassedQCfilt_" in i]].to_csv("./adatas/{}_AdditionalFilters.tsv".format(DS), sep="\t")