##############
## input: combined_scRNA_seq_data.h5ad
## output: combined_scRNA_seq_data_with_genetic_demux_filtered.h5ad
##############
import scanpy as sc
import pandas as pd
import os
import glob
import matplotlib.pyplot as plt
import seaborn as sns
import anndata as ad
output_dir = "/group/testa/Project/EndPoints/scRNASeq/preAnalysis/output"
# Directory containing aggregatedCall files
demux_dir = '/group/testa/Common/scRNAseq/demultiplexing/endpoints_sc_2024-cellranger_human_GRCh38-2020-A/alessia.valenti/'
cmo | run | condition | line | replicate | |
---|---|---|---|---|---|
AAACCCAAGATGGTAT-1 | 309.0 | 513 | MIX | CTL04E | 3 |
AAACCCAAGCCTGAAG-1 | 311.0 | 513 | MIX | CTL08A | 2 |
AAACCCAAGCTTTCCC-1 | 312.0 | 513 | MIX | CTL08A | 3 |
AAACCCAAGGGATCTG-1 | 307.0 | 513 | MIX | CTL04E | 1 |
AAACCCAAGGGTGAGG-1 | 309.0 | 513 | MIX | CTL04E | 3 |
adata = sc.read_h5ad("/group/testa/Project/EndPoints/scRNASeq/preAnalysis/Archive/combined_scRNA_seq_data_39X.h5ad")
print(f"Loaded dataset shape: {adata.shape}")
Loaded dataset shape: (621573, 36601)
adata_original = adata.copy()
adata.obs['run'] = adata.obs['run'].astype(int)
adata.obs.head()
tags | run | cmo | condition | line | replicate | condition_clean | |
---|---|---|---|---|---|---|---|
AAACCCAAGAAACCCG-1-501 | AAACCCAAGAAACCCG-1 | 501 | 303.0 | THYROID_AGONIST | CTL04E | 3 | THYROID_AGONIST |
AAACCCAAGGACGCAT-1-501 | AAACCCAAGGACGCAT-1 | 501 | 306.0 | THYROID_AGONIST | CTL08A | 3 | THYROID_AGONIST |
AAACCCAAGTTGAAAC-1-501 | AAACCCAAGTTGAAAC-1 | 501 | 305.0 | THYROID_AGONIST | CTL08A | 2 | THYROID_AGONIST |
AAACCCACAACGGGTA-1-501 | AAACCCACAACGGGTA-1 | 501 | 306.0 | THYROID_AGONIST | CTL08A | 3 | THYROID_AGONIST |
AAACCCACAAGATCCT-1-501 | AAACCCACAAGATCCT-1 | 501 | 303.0 | THYROID_AGONIST | CTL04E | 3 | THYROID_AGONIST |
print( set([393, 394, *[i for i in range(491, 513)] ]) - set(sorted(list(adata.obs.run.unique()))))
set()
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
axes = axes.flatten()
columns = ['run', 'line', 'condition', 'replicate']
for ax, column in zip(axes, columns):
adata.obs[column].value_counts().plot.bar(ax=ax, color='gray')
ax.set_title(column)
ax.set_ylabel('Count')
ax.tick_params(axis='x', rotation=90)
plt.tight_layout()
plt.show()
| | Consensus | run | |-------------------|-----------|-----| AAACCCAAGACTTCGT-1 | CTL08A | 503 | AAACCCAAGAGTCAGC-1 | CTL04E | 503 | AAACCCAAGCCTCAGC-1 | CTL08A | 503 | AAACCCAAGCTACTGT-1 | CTL08A | 503 | AAACCCAAGCTTGTGT-1 | CTL04E | 503 |
# Function to read aggregatedCall files
def read_aggregated_call(file_path):
return pd.read_csv(file_path, sep='\t', index_col='barcode')
# Read all aggregatedCall files
aggregatedCalls = {}
for run_dir in glob.glob(os.path.join(demux_dir, 'MTR_*')):
run_name = os.path.basename(run_dir)
aggregated_call_file = os.path.join(run_dir, 'aggregatedCall', 'aggregatedCall.tsv')
if os.path.exists(aggregated_call_file):
aggregatedCalls[run_name] = read_aggregated_call(aggregated_call_file)
aggregatedCalls[run_name]["run"] = int(run_name.split("_")[-1])
print(sorted(list(aggregatedCalls.keys())))
runs = [int(i.split("_")[-1]) for i in list(aggregatedCalls.keys())]
['MTR_393', 'MTR_394', 'MTR_491', 'MTR_492', 'MTR_493', 'MTR_494', 'MTR_495', 'MTR_496', 'MTR_497', 'MTR_498', 'MTR_499', 'MTR_500', 'MTR_501', 'MTR_502', 'MTR_503', 'MTR_504', 'MTR_505', 'MTR_506', 'MTR_507', 'MTR_508', 'MTR_509', 'MTR_510', 'MTR_511', 'MTR_512', 'MTR_513']
aggregatedCalls['MTR_507'].head()
Consensus | run | |
---|---|---|
barcode | ||
AAACCCAAGACCAACG-1 | CTL04E | 507 |
AAACCCAAGAGGTCGT-1 | CTL08A | 507 |
AAACCCAAGCCATTCA-1 | CTL08A | 507 |
AAACCCAAGCGCAATG-1 | CTL08A | 507 |
AAACCCAAGGAGGCAG-1 | CTL08A | 507 |
print( set( [393, 394, *[i for i in range(491, 513)] ]) - set(sorted(runs)))
set()
aggregatedCalls_combined = pd.concat(aggregatedCalls.values(), ignore_index=False)
aggregatedCalls_combined = aggregatedCalls_combined.rename_axis(None)
aggregatedCalls_combined['run'] = aggregatedCalls_combined['run'].astype(int)
aggregatedCalls_combined = aggregatedCalls_combined.rename(columns={
'Consensus': 'Consensus_gene_demux',
'run': 'run_gene_demux'
})
aggregatedCalls_combined.head()
Consensus_gene_demux | run_gene_demux | |
---|---|---|
AAACCCAAGACTTCGT-1 | CTL08A | 503 |
AAACCCAAGAGTCAGC-1 | CTL04E | 503 |
AAACCCAAGCCTCAGC-1 | CTL08A | 503 |
AAACCCAAGCTACTGT-1 | CTL08A | 503 |
AAACCCAAGCTTGTGT-1 | CTL04E | 503 |
if aggregatedCalls_combined.index.duplicated().any():
duplicate_count = aggregatedCalls_combined.index.duplicated().sum()
print(f"Warning: {duplicate_count} duplicate indices found in aggregatedCalls_combined.")
# processed_data_combined = processed_data_combined[~processed_data_combined.index.duplicated(keep='first')]
# processed_data_combined = aggregatedCalls_combined[~aggregatedCalls_combined.index.duplicated(keep=False)]
# print(f"Removed {duplicate_count} rows. New shape of processed_data_combined: {processed_data_combined.shape}")
Warning: 106982 duplicate indices found in aggregatedCalls_combined.
aggregatedCalls_combined = aggregatedCalls_combined.reset_index()
aggregatedCalls_combined['unique_index'] = aggregatedCalls_combined["index"].astype(str) + '-' + aggregatedCalls_combined['run_gene_demux'].astype(str)
aggregatedCalls_combined = aggregatedCalls_combined.set_index('unique_index')
aggregatedCalls_combined.head()
index | Consensus_gene_demux | run_gene_demux | |
---|---|---|---|
unique_index | |||
AAACCCAAGACTTCGT-1-503 | AAACCCAAGACTTCGT-1 | CTL08A | 503 |
AAACCCAAGAGTCAGC-1-503 | AAACCCAAGAGTCAGC-1 | CTL04E | 503 |
AAACCCAAGCCTCAGC-1-503 | AAACCCAAGCCTCAGC-1 | CTL08A | 503 |
AAACCCAAGCTACTGT-1-503 | AAACCCAAGCTACTGT-1 | CTL08A | 503 |
AAACCCAAGCTTGTGT-1-503 | AAACCCAAGCTTGTGT-1 | CTL04E | 503 |
if aggregatedCalls_combined.index.duplicated().any():
duplicate_count = aggregatedCalls_combined.index.duplicated().sum()
print(f"Warning: {duplicate_count} duplicate indices found in aggregatedCalls_combined.")
runs_aggregatedCalls = set(aggregatedCalls_combined['run_gene_demux'].unique())
print(f"runs_aggregatedCalls_combined: {runs_aggregatedCalls}")
runs_adata = set(adata.obs['run'].unique())
print(f"runs_adata: {runs_adata}")
print(f"Runs not shared: {runs_aggregatedCalls.symmetric_difference(runs_adata)}")
runs_aggregatedCalls_combined: {512, 513, 393, 394, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511} runs_adata: {512, 513, 393, 394, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511} Runs not shared: set()
# runs_to_keep = runs_aggregatedCalls.intersection(runs_adata)
# Remove non-shared runs from aggregatedCalls_combined
# aggregatedCalls_combined = aggregatedCalls_combined[aggregatedCalls_combined['run_gene_demux'].isin(runs_to_keep)]
# Remove non-shared runs from adata
# adata = adata[adata.obs['run'].isin(runs_to_keep)]
fig, axes = plt.subplots(1, 2, figsize=(12, 8))
axes = axes.flatten()
columns = ['Consensus_gene_demux', 'run_gene_demux']
for ax, column in zip(axes, columns):
aggregatedCalls_combined[column].value_counts().plot.bar(ax=ax, color='gray')
ax.set_title(column)
ax.set_ylabel('Count')
ax.tick_params(axis='x', rotation=90)
plt.tight_layout()
plt.show()
adata.obs = adata.obs.join(aggregatedCalls_combined, how='left')
print(sorted(list(adata.obs.run.unique())))
[393, 394, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513]
total_rows_aggregatedCalls_combined = aggregatedCalls_combined.shape[0]
total_rows_adata = adata.obs.shape[0]
# Count rows where new columns are not NaN
combined_rows = adata.obs.dropna(subset=['Consensus_gene_demux', 'run_gene_demux']).shape[0]
print(f"Total rows in adata.obs: {total_rows_adata}")
print(f"Total rows in aggregatedCalls_combined: {total_rows_aggregatedCalls_combined}")
print(f"Rows in adata.obs with data from aggregatedCalls_combined: {combined_rows}")
print('\n')
percentage_combined = (combined_rows / total_rows_adata) * 100
print(f"Percentage of rows from adata.obs: {percentage_combined:.2f}%")
percentage_combined = (combined_rows / total_rows_aggregatedCalls_combined) * 100
print(f"Percentage of rows from aggregatedCalls_combined: {percentage_combined:.2f}%")
Total rows in adata.obs: 621573 Total rows in aggregatedCalls_combined: 937779 Rows in adata.obs with data from aggregatedCalls_combined: 621573 Percentage of rows from adata.obs: 100.00% Percentage of rows from aggregatedCalls_combined: 66.28%
# Check for any rows in adata.obs that didn't match
unmatched_rows = adata.obs.index.difference(aggregatedCalls_combined.index)
print(f"Number of rows in adata.obs that didn't match to aggregatedCalls_combined: {len(unmatched_rows)}")
if len(unmatched_rows) > 0:
print("First few unmatched row indexes:")
print(unmatched_rows[:5].tolist())
Number of rows in adata.obs that didn't match to aggregatedCalls_combined: 0
# Check for any rows in aggregatedCalls_combined that didn't match
unmatched_rows = aggregatedCalls_combined.index.difference(adata.obs.index)
print(f"Number of rows in aggregatedCalls_combined that didn't match to adata.obs: {len(unmatched_rows)}")
if len(unmatched_rows) > 0:
print("First few unmatched row indexes:")
print(unmatched_rows[:5].tolist())
Number of rows in aggregatedCalls_combined that didn't match to adata.obs: 316206 First few unmatched row indexes: ['AAACCCAAGAACTGAT-1-501', 'AAACCCAAGAAGCGCT-1-491', 'AAACCCAAGAAGCTCG-1-505', 'AAACCCAAGAAGTGTT-1-493', 'AAACCCAAGACAAGCC-1-510']
unmatched = aggregatedCalls_combined.loc[unmatched_rows]
unmatched.head()
index | Consensus_gene_demux | run_gene_demux | |
---|---|---|---|
AAACCCAAGAACTGAT-1-501 | AAACCCAAGAACTGAT-1 | doublet | 501 |
AAACCCAAGAAGCGCT-1-491 | AAACCCAAGAAGCGCT-1 | doublet | 491 |
AAACCCAAGAAGCTCG-1-505 | AAACCCAAGAAGCTCG-1 | CTL04E | 505 |
AAACCCAAGAAGTGTT-1-493 | AAACCCAAGAAGTGTT-1 | CTL08A | 493 |
AAACCCAAGACAAGCC-1-510 | AAACCCAAGACAAGCC-1 | CTL04E | 510 |
sorted_runs = sorted(unmatched['run_gene_demux'].unique())
unmatched['run_gene_demux'].value_counts().reindex(sorted_runs).plot.bar(color='orange')
<Axes: xlabel='run_gene_demux'>
unmatched['Consensus_gene_demux'].unique()
array(['doublet', 'CTL04E', 'CTL08A', 'LowQuality'], dtype=object)
unmatched['Consensus_gene_demux'].value_counts().plot.bar(color=['orange'])
<Axes: xlabel='Consensus_gene_demux'>
# Count the occurrences of each run_gene_demux
run_counts = unmatched['run_gene_demux'].value_counts()
# Calculate the counts of doublets for each run
doublet_counts = unmatched[unmatched['Consensus_gene_demux'] == 'doublet']['run_gene_demux'].value_counts()
# Ensure all runs are represented in doublet_counts
doublet_counts = doublet_counts.reindex(run_counts.index, fill_value=0)
# Calculate the non-doublet counts
non_doublet_counts = run_counts - doublet_counts
df = pd.DataFrame({'Non-doublet': non_doublet_counts, 'Doublet': doublet_counts})
df['Total'] = df['Non-doublet'] + df['Doublet']
df = df.sort_values('Total', ascending=False)
fig, ax = plt.subplots(figsize=(15, 8))
# Plot the stacked bar chart
bar_width = 0.8
bars = ax.bar(range(len(df)), df['Non-doublet'], color='orange', width=bar_width, label='Non-doublet')
ax.bar(range(len(df)), df['Doublet'], bottom=df['Non-doublet'], color='red', width=bar_width, label='Doublet')
ax.set_xlabel('run_gene_demux')
ax.set_ylabel('Count')
ax.set_title('run_gene_demux Counts with Doublet Proportion')
ax.set_xticks(range(len(df)))
ax.set_xticklabels(df.index, rotation=90, ha='right')
ax.legend()
plt.tight_layout()
plt.show()
adata
AnnData object with n_obs × n_vars = 621573 × 36601 obs: 'tags', 'run', 'cmo', 'condition', 'line', 'replicate', 'condition_clean', 'index', 'Consensus_gene_demux', 'run_gene_demux' var: 'gene_ids', 'feature_types', 'genome'
adata.obs.head()
tags | run | cmo | condition | line | replicate | condition_clean | index | Consensus_gene_demux | run_gene_demux | |
---|---|---|---|---|---|---|---|---|---|---|
AAACCCAAGAAACCCG-1-501 | AAACCCAAGAAACCCG-1 | 501 | 303.0 | THYROID_AGONIST | CTL04E | 3 | THYROID_AGONIST | AAACCCAAGAAACCCG-1 | doublet | 501 |
AAACCCAAGGACGCAT-1-501 | AAACCCAAGGACGCAT-1 | 501 | 306.0 | THYROID_AGONIST | CTL08A | 3 | THYROID_AGONIST | AAACCCAAGGACGCAT-1 | CTL08A | 501 |
AAACCCAAGTTGAAAC-1-501 | AAACCCAAGTTGAAAC-1 | 501 | 305.0 | THYROID_AGONIST | CTL08A | 2 | THYROID_AGONIST | AAACCCAAGTTGAAAC-1 | CTL08A | 501 |
AAACCCACAACGGGTA-1-501 | AAACCCACAACGGGTA-1 | 501 | 306.0 | THYROID_AGONIST | CTL08A | 3 | THYROID_AGONIST | AAACCCACAACGGGTA-1 | CTL08A | 501 |
AAACCCACAAGATCCT-1-501 | AAACCCACAAGATCCT-1 | 501 | 303.0 | THYROID_AGONIST | CTL04E | 3 | THYROID_AGONIST | AAACCCACAAGATCCT-1 | CTL04E | 501 |
print(sorted(list(adata.obs.run.unique())))
[393, 394, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513]
# adata.write_h5ad(os.path.join(output_dir, 'combined_scRNA_seq_data_with_genetic_demux.h5ad'))
print(f"Number of cells before filtering: {adata.n_obs}")
Number of cells before filtering: 621573
adata.obs[adata.obs.run == 508].head()
tags | run | cmo | condition | line | replicate | condition_clean | index | Consensus_gene_demux | run_gene_demux | |
---|---|---|---|---|---|---|---|---|---|---|
AAACCCAAGAGGTCGT-1-508 | AAACCCAAGAGGTCGT-1 | 508 | NaN | RET_AGONIST | NaN | NaN | RET_AGONIST | AAACCCAAGAGGTCGT-1 | CTL08A | 508 |
AAACCCAAGATTGGGC-1-508 | AAACCCAAGATTGGGC-1 | 508 | NaN | RET_AGONIST | NaN | NaN | RET_AGONIST | AAACCCAAGATTGGGC-1 | doublet | 508 |
AAACCCAAGCAATTAG-1-508 | AAACCCAAGCAATTAG-1 | 508 | NaN | RET_AGONIST | NaN | NaN | RET_AGONIST | AAACCCAAGCAATTAG-1 | doublet | 508 |
AAACCCAAGCTGCCAC-1-508 | AAACCCAAGCTGCCAC-1 | 508 | NaN | RET_AGONIST | NaN | NaN | RET_AGONIST | AAACCCAAGCTGCCAC-1 | doublet | 508 |
AAACCCAAGGATTCCT-1-508 | AAACCCAAGGATTCCT-1 | 508 | NaN | RET_AGONIST | NaN | NaN | RET_AGONIST | AAACCCAAGGATTCCT-1 | CTL08A | 508 |
mask = ~adata.obs['Consensus_gene_demux'].isna() & (adata.obs['Consensus_gene_demux'] != '')
adata = adata[mask, :]
print(f"Number of cells after filtering NaN: {adata.n_obs}")
Number of cells after filtering NaN: 621573
mask = adata.obs['Consensus_gene_demux'] != 'LowQuality'
adata = adata[mask, :]
print(f"Number of cells after filtering LowQuality: {adata.n_obs}")
Number of cells after filtering LowQuality: 605046
mask = adata.obs['Consensus_gene_demux'] != 'doublet'
adata = adata[mask, :]
print(f"Number of cells after filtering doublet: {adata.n_obs}")
Number of cells after filtering doublet: 494164
adata.obs.head()
tags | run | cmo | condition | line | replicate | condition_clean | index | Consensus_gene_demux | run_gene_demux | |
---|---|---|---|---|---|---|---|---|---|---|
AAACCCAAGGACGCAT-1-501 | AAACCCAAGGACGCAT-1 | 501 | 306.0 | THYROID_AGONIST | CTL08A | 3 | THYROID_AGONIST | AAACCCAAGGACGCAT-1 | CTL08A | 501 |
AAACCCAAGTTGAAAC-1-501 | AAACCCAAGTTGAAAC-1 | 501 | 305.0 | THYROID_AGONIST | CTL08A | 2 | THYROID_AGONIST | AAACCCAAGTTGAAAC-1 | CTL08A | 501 |
AAACCCACAACGGGTA-1-501 | AAACCCACAACGGGTA-1 | 501 | 306.0 | THYROID_AGONIST | CTL08A | 3 | THYROID_AGONIST | AAACCCACAACGGGTA-1 | CTL08A | 501 |
AAACCCACAAGATCCT-1-501 | AAACCCACAAGATCCT-1 | 501 | 303.0 | THYROID_AGONIST | CTL04E | 3 | THYROID_AGONIST | AAACCCACAAGATCCT-1 | CTL04E | 501 |
AAACCCACAATCAAGA-1-501 | AAACCCACAATCAAGA-1 | 501 | 301.0 | THYROID_AGONIST | CTL04E | 1 | THYROID_AGONIST | AAACCCACAATCAAGA-1 | CTL04E | 501 |
print(sorted(list(adata.obs.run.unique())))
[393, 394, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513]
adata.obs.Consensus_gene_demux.unique()
array(['CTL08A', 'CTL04E', 'CTL01A', 'CTL02A'], dtype=object)
# adata.write_h5ad(os.path.join(output_dir, 'combined_scRNA_seq_data_with_genetic_demux_filtered.h5ad'))
# adata = ad.read_h5ad(os.path.join(output_dir, 'combined_scRNA_seq_data_with_genetic_demux_filtered.h5ad'))
adata[adata.obs['Consensus_gene_demux'].isin(['CTL01A', 'CTL02A'])].obs.head()
tags | run | cmo | condition | line | replicate | condition_clean | index | Consensus_gene_demux | run_gene_demux | is_consistent | |
---|---|---|---|---|---|---|---|---|---|---|---|
AAACCCAAGCACTCAT-1-393 | AAACCCAAGCACTCAT-1 | 393 | NaN | MTR_393_3GEX | NaN | NaN | NaN | AAACCCAAGCACTCAT-1 | CTL01A | 393 | True |
AAACCCAAGCCTGCCA-1-393 | AAACCCAAGCCTGCCA-1 | 393 | NaN | MTR_393_3GEX | NaN | NaN | NaN | AAACCCAAGCCTGCCA-1 | CTL01A | 393 | True |
AAACCCAAGGGATGTC-1-393 | AAACCCAAGGGATGTC-1 | 393 | NaN | MTR_393_3GEX | NaN | NaN | NaN | AAACCCAAGGGATGTC-1 | CTL01A | 393 | True |
AAACCCACAATCGCAT-1-393 | AAACCCACAATCGCAT-1 | 393 | NaN | MTR_393_3GEX | NaN | NaN | NaN | AAACCCACAATCGCAT-1 | CTL02A | 393 | True |
AAACCCACAGCTGAAG-1-393 | AAACCCACAGCTGAAG-1 | 393 | NaN | MTR_393_3GEX | NaN | NaN | NaN | AAACCCACAGCTGAAG-1 | CTL01A | 393 | True |
adata[adata.obs.run == 508].obs.head()
tags | run | cmo | condition | line | replicate | condition_clean | index | Consensus_gene_demux | run_gene_demux | is_consistent | |
---|---|---|---|---|---|---|---|---|---|---|---|
AAACCCAAGAGGTCGT-1-508 | AAACCCAAGAGGTCGT-1 | 508 | NaN | RET_AGONIST | NaN | NaN | RET_AGONIST | AAACCCAAGAGGTCGT-1 | CTL08A | 508 | True |
AAACCCAAGGATTCCT-1-508 | AAACCCAAGGATTCCT-1 | 508 | NaN | RET_AGONIST | NaN | NaN | RET_AGONIST | AAACCCAAGGATTCCT-1 | CTL08A | 508 | True |
AAACCCAAGGGAACAA-1-508 | AAACCCAAGGGAACAA-1 | 508 | NaN | RET_AGONIST | NaN | NaN | RET_AGONIST | AAACCCAAGGGAACAA-1 | CTL04E | 508 | True |
AAACCCAAGGTTAAAC-1-508 | AAACCCAAGGTTAAAC-1 | 508 | NaN | RET_AGONIST | NaN | NaN | RET_AGONIST | AAACCCAAGGTTAAAC-1 | CTL08A | 508 | True |
AAACCCAAGTCGTTAC-1-508 | AAACCCAAGTCGTTAC-1 | 508 | NaN | RET_AGONIST | NaN | NaN | RET_AGONIST | AAACCCAAGTCGTTAC-1 | CTL04E | 508 | True |
def check_consistency(df):
# Create a mask for rows to include in the consistency check
include_mask = ~df.run.isin([508,393,394])
run_consistent = (df['run'] == df['run_gene_demux'])
tag_consistent = (df['tags'] == df['index'])
# Convert categorical columns to string for comparison
line_str = df['line'].astype(str)
consensus_str = df['Consensus_gene_demux'].astype(str)
line_consistent = ((line_str == consensus_str) | ~include_mask)
return run_consistent & line_consistent & tag_consistent
# consistency check
adata.obs['is_consistent'] = check_consistency(adata.obs)
# Count inconsistencies and calculate consistency percentage
total_count = len(adata.obs)
included_count = (~adata.obs.run.isin([508,393,394])).sum()
inconsistent_count = (~adata.obs['is_consistent']).sum()
consistency_percentage = (adata.obs['is_consistent'].sum()/ total_count) * 100
print(f"Total rows: {total_count}")
print(f"Rows included in consistency check: {included_count}")
print(f"Inconsistent rows: {inconsistent_count}")
print(f"Consistency percentage: {consistency_percentage:.2f}%")
Total rows: 494164 Rows included in consistency check: 409234 Inconsistent rows: 13664 Consistency percentage: 97.23%
none_count = adata.obs['is_consistent'].isna().sum()
print(f"Number of None values remaining: {none_count}")
Number of None values remaining: 0
adata.obs['is_consistent'] = adata.obs['is_consistent'].fillna(True)
adata.obs[~adata.obs['is_consistent']].head()
tags | run | cmo | condition | line | replicate | condition_clean | index | Consensus_gene_demux | run_gene_demux | is_consistent | |
---|---|---|---|---|---|---|---|---|---|---|---|
AAACGAAAGGGAGGGT-1-501 | AAACGAAAGGGAGGGT-1 | 501 | 302.0 | THYROID_AGONIST | CTL04E | 2 | THYROID_AGONIST | AAACGAAAGGGAGGGT-1 | CTL08A | 501 | False |
AAACGAACAGCGAGTA-1-501 | AAACGAACAGCGAGTA-1 | 501 | 303.0 | THYROID_AGONIST | CTL04E | 3 | THYROID_AGONIST | AAACGAACAGCGAGTA-1 | CTL08A | 501 | False |
AAACGCTTCCCAGTGG-1-501 | AAACGCTTCCCAGTGG-1 | 501 | 305.0 | THYROID_AGONIST | CTL08A | 2 | THYROID_AGONIST | AAACGCTTCCCAGTGG-1 | CTL04E | 501 | False |
AAAGGGCGTACGGGAT-1-501 | AAAGGGCGTACGGGAT-1 | 501 | 305.0 | THYROID_AGONIST | CTL08A | 2 | THYROID_AGONIST | AAAGGGCGTACGGGAT-1 | CTL04E | 501 | False |
AAAGTCCCAAAGACTA-1-501 | AAAGTCCCAAAGACTA-1 | 501 | 306.0 | THYROID_AGONIST | CTL08A | 3 | THYROID_AGONIST | AAAGTCCCAAAGACTA-1 | CTL04E | 501 | False |
print(adata.obs.Consensus_gene_demux.unique())
print(adata.obs.line.unique())
['CTL08A' 'CTL04E' 'CTL01A' 'CTL02A'] ['CTL08A', 'CTL04E', NaN] Categories (2, object): ['CTL04E', 'CTL08A']
if inconsistent_count > 0:
# Analyze types of inconsistencies
run_mismatch = adata.obs[adata.obs['run'] != adata.obs['run_gene_demux']]
tag_mismatch = adata.obs[adata.obs['tags'] != adata.obs['index']]
# Convert categorical columns to string for comparison
line_str = adata.obs['line'].astype(str)
consensus_str = adata.obs['Consensus_gene_demux'].astype(str)
# Exclude 'CTL01A' and 'CTL02A' from line mismatch check
line_mismatch = adata.obs[
(line_str != consensus_str) &
~(adata.obs['Consensus_gene_demux'].isin(['CTL01A', 'CTL02A']) | (adata.obs.run == 508))
]
print(f"Rows with mismatched run: {len(run_mismatch)}")
print(f"Rows with mismatched tags: {len(tag_mismatch)}")
print(f"Rows with mismatched line/Consensus: {len(line_mismatch)}")
Rows with mismatched run: 0 Rows with mismatched tags: 0 Rows with mismatched line/Consensus: 47480
n_cells_before = adata.n_obs
# Filter the AnnData object to keep only consistent records
adata_filtered = adata[adata.obs['is_consistent']]
n_cells_after = adata_filtered.n_obs
print(f"Number of cells before filtering: {n_cells_before}")
print(f"Number of cells after filtering: {n_cells_after}")
print(f"Number of cells removed: {n_cells_before - n_cells_after}")
print(f"Percentage of cells kept: {(n_cells_after / n_cells_before) * 100:.2f}%")
Number of cells before filtering: 494164 Number of cells after filtering: 480500 Number of cells removed: 13664 Percentage of cells kept: 97.23%
adata_filtered.obs.head()
tags | run | cmo | condition | line | replicate | condition_clean | index | Consensus_gene_demux | run_gene_demux | is_consistent | |
---|---|---|---|---|---|---|---|---|---|---|---|
AAACCCAAGGACGCAT-1-501 | AAACCCAAGGACGCAT-1 | 501 | 306.0 | THYROID_AGONIST | CTL08A | 3 | THYROID_AGONIST | AAACCCAAGGACGCAT-1 | CTL08A | 501 | True |
AAACCCAAGTTGAAAC-1-501 | AAACCCAAGTTGAAAC-1 | 501 | 305.0 | THYROID_AGONIST | CTL08A | 2 | THYROID_AGONIST | AAACCCAAGTTGAAAC-1 | CTL08A | 501 | True |
AAACCCACAACGGGTA-1-501 | AAACCCACAACGGGTA-1 | 501 | 306.0 | THYROID_AGONIST | CTL08A | 3 | THYROID_AGONIST | AAACCCACAACGGGTA-1 | CTL08A | 501 | True |
AAACCCACAAGATCCT-1-501 | AAACCCACAAGATCCT-1 | 501 | 303.0 | THYROID_AGONIST | CTL04E | 3 | THYROID_AGONIST | AAACCCACAAGATCCT-1 | CTL04E | 501 | True |
AAACCCACAATCAAGA-1-501 | AAACCCACAATCAAGA-1 | 501 | 301.0 | THYROID_AGONIST | CTL04E | 1 | THYROID_AGONIST | AAACCCACAATCAAGA-1 | CTL04E | 501 | True |
adata_filtered.obs = adata_filtered.obs.drop(columns=['is_consistent'])
adata_filtered.obs = adata_filtered.obs.drop(columns=['index'])
adata_filtered.obs = adata_filtered.obs.drop(columns=['run'])
adata_filtered.obs = adata_filtered.obs.drop(columns=['line'])
adata_filtered.obs.head()
tags | cmo | condition | replicate | condition_clean | Consensus_gene_demux | run_gene_demux | |
---|---|---|---|---|---|---|---|
AAACCCAAGGACGCAT-1-501 | AAACCCAAGGACGCAT-1 | 306.0 | THYROID_AGONIST | 3 | THYROID_AGONIST | CTL08A | 501 |
AAACCCAAGTTGAAAC-1-501 | AAACCCAAGTTGAAAC-1 | 305.0 | THYROID_AGONIST | 2 | THYROID_AGONIST | CTL08A | 501 |
AAACCCACAACGGGTA-1-501 | AAACCCACAACGGGTA-1 | 306.0 | THYROID_AGONIST | 3 | THYROID_AGONIST | CTL08A | 501 |
AAACCCACAAGATCCT-1-501 | AAACCCACAAGATCCT-1 | 303.0 | THYROID_AGONIST | 3 | THYROID_AGONIST | CTL04E | 501 |
AAACCCACAATCAAGA-1-501 | AAACCCACAATCAAGA-1 | 301.0 | THYROID_AGONIST | 1 | THYROID_AGONIST | CTL04E | 501 |
adata_filtered.obs = adata_filtered.obs.rename(columns={
'Consensus_gene_demux': 'line',
'run_gene_demux': 'run'
})
adata_filtered.obs['cmo'] = pd.to_numeric(adata_filtered.obs['cmo'], errors='coerce')
# Replace NaN with the specified value
adata_filtered.obs['cmo'] = adata_filtered.obs['cmo'].fillna(-1)
# Convert to integer
adata_filtered.obs['cmo'] = adata_filtered.obs['cmo'].astype(int)
# Print some information about the conversion
print(f"Unique values in 'cmo' after conversion: {adata_filtered.obs['cmo'].unique()}")
print(f"Number of -1 values (originally NaN): {(adata_filtered.obs['cmo'] == -1).sum()}")
Unique values in 'cmo' after conversion: [306 305 303 301 302 304 312 310 309 308 307 311 -1] Number of -1 values (originally NaN): 84930
adata_filtered[adata_filtered.obs.cmo==-1].obs.head()
tags | cmo | condition | replicate | condition_clean | line | run | |
---|---|---|---|---|---|---|---|
AAACCCAAGAGGTCGT-1-508 | AAACCCAAGAGGTCGT-1 | -1 | RET_AGONIST | NaN | RET_AGONIST | CTL08A | 508 |
AAACCCAAGGATTCCT-1-508 | AAACCCAAGGATTCCT-1 | -1 | RET_AGONIST | NaN | RET_AGONIST | CTL08A | 508 |
AAACCCAAGGGAACAA-1-508 | AAACCCAAGGGAACAA-1 | -1 | RET_AGONIST | NaN | RET_AGONIST | CTL04E | 508 |
AAACCCAAGGTTAAAC-1-508 | AAACCCAAGGTTAAAC-1 | -1 | RET_AGONIST | NaN | RET_AGONIST | CTL08A | 508 |
AAACCCAAGTCGTTAC-1-508 | AAACCCAAGTCGTTAC-1 | -1 | RET_AGONIST | NaN | RET_AGONIST | CTL04E | 508 |
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
axes = axes.flatten()
columns = ['run', 'line', 'condition', 'replicate']
for ax, column in zip(axes, columns):
adata_filtered.obs[column].value_counts().plot.bar(ax=ax, color='gray')
ax.set_title(column)
ax.set_ylabel('Count')
ax.tick_params(axis='x', rotation=90)
plt.tight_layout()
plt.show()
adata_filtered
AnnData object with n_obs × n_vars = 480500 × 36601 obs: 'tags', 'cmo', 'condition', 'replicate', 'condition_clean', 'line', 'run' var: 'gene_ids', 'feature_types', 'genome'
adata_filtered.write_h5ad('./adatas/combined_scRNA_seq_data_with_genetic_demux_filtered_consistent.h5ad')