In [1]:
##############
## input: combined_scRNA_seq_data.h5ad
## output: combined_scRNA_seq_data_with_genetic_demux_filtered.h5ad
##############

1. Environment¶

In [2]:
import scanpy as sc
import pandas as pd
import os
import glob
import matplotlib.pyplot as plt
import seaborn as sns
import anndata as ad
In [3]:
output_dir = "/group/testa/Project/EndPoints/scRNASeq/preAnalysis/output"

# Directory containing aggregatedCall files
demux_dir = '/group/testa/Common/scRNAseq/demultiplexing/endpoints_sc_2024-cellranger_human_GRCh38-2020-A/alessia.valenti/'

2. Load adata¶

cmo run condition line replicate
AAACCCAAGATGGTAT-1 309.0 513 MIX CTL04E 3
AAACCCAAGCCTGAAG-1 311.0 513 MIX CTL08A 2
AAACCCAAGCTTTCCC-1 312.0 513 MIX CTL08A 3
AAACCCAAGGGATCTG-1 307.0 513 MIX CTL04E 1
AAACCCAAGGGTGAGG-1 309.0 513 MIX CTL04E 3
In [4]:
adata = sc.read_h5ad("/group/testa/Project/EndPoints/scRNASeq/preAnalysis/Archive/combined_scRNA_seq_data_39X.h5ad")
print(f"Loaded dataset shape: {adata.shape}")
Loaded dataset shape: (621573, 36601)
In [5]:
adata_original = adata.copy()
In [6]:
adata.obs['run'] = adata.obs['run'].astype(int)
In [7]:
adata.obs.head()
Out[7]:
tags run cmo condition line replicate condition_clean
AAACCCAAGAAACCCG-1-501 AAACCCAAGAAACCCG-1 501 303.0 THYROID_AGONIST CTL04E 3 THYROID_AGONIST
AAACCCAAGGACGCAT-1-501 AAACCCAAGGACGCAT-1 501 306.0 THYROID_AGONIST CTL08A 3 THYROID_AGONIST
AAACCCAAGTTGAAAC-1-501 AAACCCAAGTTGAAAC-1 501 305.0 THYROID_AGONIST CTL08A 2 THYROID_AGONIST
AAACCCACAACGGGTA-1-501 AAACCCACAACGGGTA-1 501 306.0 THYROID_AGONIST CTL08A 3 THYROID_AGONIST
AAACCCACAAGATCCT-1-501 AAACCCACAAGATCCT-1 501 303.0 THYROID_AGONIST CTL04E 3 THYROID_AGONIST
In [8]:
print( set([393, 394, *[i for i in range(491, 513)] ]) - set(sorted(list(adata.obs.run.unique()))))
set()
In [9]:
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

axes = axes.flatten()

columns = ['run', 'line', 'condition', 'replicate']

for ax, column in zip(axes, columns):
    adata.obs[column].value_counts().plot.bar(ax=ax, color='gray')
    ax.set_title(column)
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=90)

plt.tight_layout()
plt.show()

3. Load aggregatedCall data¶

| | Consensus | run | |-------------------|-----------|-----| AAACCCAAGACTTCGT-1 | CTL08A | 503 | AAACCCAAGAGTCAGC-1 | CTL04E | 503 | AAACCCAAGCCTCAGC-1 | CTL08A | 503 | AAACCCAAGCTACTGT-1 | CTL08A | 503 | AAACCCAAGCTTGTGT-1 | CTL04E | 503 |

In [10]:
# Function to read aggregatedCall files
def read_aggregated_call(file_path):
    return pd.read_csv(file_path, sep='\t', index_col='barcode')
In [11]:
# Read all aggregatedCall files
aggregatedCalls = {}
for run_dir in glob.glob(os.path.join(demux_dir, 'MTR_*')):
    run_name = os.path.basename(run_dir)
    aggregated_call_file = os.path.join(run_dir, 'aggregatedCall', 'aggregatedCall.tsv')
    if os.path.exists(aggregated_call_file):
        aggregatedCalls[run_name] = read_aggregated_call(aggregated_call_file)
        aggregatedCalls[run_name]["run"] = int(run_name.split("_")[-1])
In [12]:
print(sorted(list(aggregatedCalls.keys())))

runs = [int(i.split("_")[-1]) for i in list(aggregatedCalls.keys())]
['MTR_393', 'MTR_394', 'MTR_491', 'MTR_492', 'MTR_493', 'MTR_494', 'MTR_495', 'MTR_496', 'MTR_497', 'MTR_498', 'MTR_499', 'MTR_500', 'MTR_501', 'MTR_502', 'MTR_503', 'MTR_504', 'MTR_505', 'MTR_506', 'MTR_507', 'MTR_508', 'MTR_509', 'MTR_510', 'MTR_511', 'MTR_512', 'MTR_513']
In [13]:
aggregatedCalls['MTR_507'].head()
Out[13]:
Consensus run
barcode
AAACCCAAGACCAACG-1 CTL04E 507
AAACCCAAGAGGTCGT-1 CTL08A 507
AAACCCAAGCCATTCA-1 CTL08A 507
AAACCCAAGCGCAATG-1 CTL08A 507
AAACCCAAGGAGGCAG-1 CTL08A 507
In [14]:
print( set( [393, 394, *[i for i in range(491, 513)] ]) - set(sorted(runs)))
set()
In [15]:
aggregatedCalls_combined = pd.concat(aggregatedCalls.values(), ignore_index=False)
In [16]:
aggregatedCalls_combined = aggregatedCalls_combined.rename_axis(None)
In [17]:
aggregatedCalls_combined['run'] = aggregatedCalls_combined['run'].astype(int)
In [18]:
aggregatedCalls_combined = aggregatedCalls_combined.rename(columns={
    'Consensus': 'Consensus_gene_demux',
    'run': 'run_gene_demux'
})
In [19]:
aggregatedCalls_combined.head()
Out[19]:
Consensus_gene_demux run_gene_demux
AAACCCAAGACTTCGT-1 CTL08A 503
AAACCCAAGAGTCAGC-1 CTL04E 503
AAACCCAAGCCTCAGC-1 CTL08A 503
AAACCCAAGCTACTGT-1 CTL08A 503
AAACCCAAGCTTGTGT-1 CTL04E 503
In [20]:
if aggregatedCalls_combined.index.duplicated().any():
    duplicate_count = aggregatedCalls_combined.index.duplicated().sum()
    print(f"Warning: {duplicate_count} duplicate indices found in aggregatedCalls_combined.")
    # processed_data_combined = processed_data_combined[~processed_data_combined.index.duplicated(keep='first')]
    # processed_data_combined = aggregatedCalls_combined[~aggregatedCalls_combined.index.duplicated(keep=False)]
    # print(f"Removed {duplicate_count} rows. New shape of processed_data_combined: {processed_data_combined.shape}")
Warning: 106982 duplicate indices found in aggregatedCalls_combined.
In [21]:
aggregatedCalls_combined = aggregatedCalls_combined.reset_index()
aggregatedCalls_combined['unique_index'] = aggregatedCalls_combined["index"].astype(str) + '-' + aggregatedCalls_combined['run_gene_demux'].astype(str)
aggregatedCalls_combined = aggregatedCalls_combined.set_index('unique_index')
In [22]:
aggregatedCalls_combined.head()
Out[22]:
index Consensus_gene_demux run_gene_demux
unique_index
AAACCCAAGACTTCGT-1-503 AAACCCAAGACTTCGT-1 CTL08A 503
AAACCCAAGAGTCAGC-1-503 AAACCCAAGAGTCAGC-1 CTL04E 503
AAACCCAAGCCTCAGC-1-503 AAACCCAAGCCTCAGC-1 CTL08A 503
AAACCCAAGCTACTGT-1-503 AAACCCAAGCTACTGT-1 CTL08A 503
AAACCCAAGCTTGTGT-1-503 AAACCCAAGCTTGTGT-1 CTL04E 503
In [23]:
if aggregatedCalls_combined.index.duplicated().any():
    duplicate_count = aggregatedCalls_combined.index.duplicated().sum()
    print(f"Warning: {duplicate_count} duplicate indices found in aggregatedCalls_combined.")
In [24]:
runs_aggregatedCalls = set(aggregatedCalls_combined['run_gene_demux'].unique())
print(f"runs_aggregatedCalls_combined: {runs_aggregatedCalls}")
runs_adata = set(adata.obs['run'].unique())
print(f"runs_adata: {runs_adata}")

print(f"Runs not shared: {runs_aggregatedCalls.symmetric_difference(runs_adata)}")
runs_aggregatedCalls_combined: {512, 513, 393, 394, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511}
runs_adata: {512, 513, 393, 394, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511}
Runs not shared: set()
In [25]:
# runs_to_keep = runs_aggregatedCalls.intersection(runs_adata)

# Remove non-shared runs from aggregatedCalls_combined
# aggregatedCalls_combined = aggregatedCalls_combined[aggregatedCalls_combined['run_gene_demux'].isin(runs_to_keep)]

# Remove non-shared runs from adata
# adata = adata[adata.obs['run'].isin(runs_to_keep)]
In [26]:
fig, axes = plt.subplots(1, 2, figsize=(12, 8))

axes = axes.flatten()

columns = ['Consensus_gene_demux', 'run_gene_demux']

for ax, column in zip(axes, columns):
    aggregatedCalls_combined[column].value_counts().plot.bar(ax=ax, color='gray')
    ax.set_title(column)
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=90)

plt.tight_layout()
plt.show()

4. Combine data¶

In [27]:
adata.obs = adata.obs.join(aggregatedCalls_combined, how='left')
In [28]:
print(sorted(list(adata.obs.run.unique())))
[393, 394, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513]
In [29]:
total_rows_aggregatedCalls_combined = aggregatedCalls_combined.shape[0]
total_rows_adata = adata.obs.shape[0]

# Count rows where new columns are not NaN
combined_rows = adata.obs.dropna(subset=['Consensus_gene_demux', 'run_gene_demux']).shape[0]

print(f"Total rows in adata.obs: {total_rows_adata}")
print(f"Total rows in aggregatedCalls_combined: {total_rows_aggregatedCalls_combined}")
print(f"Rows in adata.obs with data from aggregatedCalls_combined: {combined_rows}")
print('\n')
percentage_combined = (combined_rows / total_rows_adata) * 100
print(f"Percentage of rows from adata.obs: {percentage_combined:.2f}%")
percentage_combined = (combined_rows / total_rows_aggregatedCalls_combined) * 100
print(f"Percentage of rows from aggregatedCalls_combined: {percentage_combined:.2f}%")
Total rows in adata.obs: 621573
Total rows in aggregatedCalls_combined: 937779
Rows in adata.obs with data from aggregatedCalls_combined: 621573


Percentage of rows from adata.obs: 100.00%
Percentage of rows from aggregatedCalls_combined: 66.28%
In [30]:
# Check for any rows in adata.obs that didn't match
unmatched_rows = adata.obs.index.difference(aggregatedCalls_combined.index)
print(f"Number of rows in adata.obs that didn't match to aggregatedCalls_combined: {len(unmatched_rows)}")

if len(unmatched_rows) > 0:
    print("First few unmatched row indexes:")
    print(unmatched_rows[:5].tolist())
Number of rows in adata.obs that didn't match to aggregatedCalls_combined: 0
In [31]:
# Check for any rows in aggregatedCalls_combined that didn't match
unmatched_rows = aggregatedCalls_combined.index.difference(adata.obs.index)
print(f"Number of rows in aggregatedCalls_combined that didn't match to adata.obs: {len(unmatched_rows)}")

if len(unmatched_rows) > 0:
    print("First few unmatched row indexes:")
    print(unmatched_rows[:5].tolist())
Number of rows in aggregatedCalls_combined that didn't match to adata.obs: 316206
First few unmatched row indexes:
['AAACCCAAGAACTGAT-1-501', 'AAACCCAAGAAGCGCT-1-491', 'AAACCCAAGAAGCTCG-1-505', 'AAACCCAAGAAGTGTT-1-493', 'AAACCCAAGACAAGCC-1-510']
In [32]:
unmatched = aggregatedCalls_combined.loc[unmatched_rows]
unmatched.head()
Out[32]:
index Consensus_gene_demux run_gene_demux
AAACCCAAGAACTGAT-1-501 AAACCCAAGAACTGAT-1 doublet 501
AAACCCAAGAAGCGCT-1-491 AAACCCAAGAAGCGCT-1 doublet 491
AAACCCAAGAAGCTCG-1-505 AAACCCAAGAAGCTCG-1 CTL04E 505
AAACCCAAGAAGTGTT-1-493 AAACCCAAGAAGTGTT-1 CTL08A 493
AAACCCAAGACAAGCC-1-510 AAACCCAAGACAAGCC-1 CTL04E 510
In [33]:
sorted_runs = sorted(unmatched['run_gene_demux'].unique())
unmatched['run_gene_demux'].value_counts().reindex(sorted_runs).plot.bar(color='orange')
Out[33]:
<Axes: xlabel='run_gene_demux'>
In [34]:
unmatched['Consensus_gene_demux'].unique()
Out[34]:
array(['doublet', 'CTL04E', 'CTL08A', 'LowQuality'], dtype=object)
In [35]:
unmatched['Consensus_gene_demux'].value_counts().plot.bar(color=['orange'])
Out[35]:
<Axes: xlabel='Consensus_gene_demux'>
In [36]:
# Count the occurrences of each run_gene_demux
run_counts = unmatched['run_gene_demux'].value_counts()

# Calculate the counts of doublets for each run
doublet_counts = unmatched[unmatched['Consensus_gene_demux'] == 'doublet']['run_gene_demux'].value_counts()

# Ensure all runs are represented in doublet_counts
doublet_counts = doublet_counts.reindex(run_counts.index, fill_value=0)

# Calculate the non-doublet counts
non_doublet_counts = run_counts - doublet_counts

df = pd.DataFrame({'Non-doublet': non_doublet_counts, 'Doublet': doublet_counts})
df['Total'] = df['Non-doublet'] + df['Doublet']
df = df.sort_values('Total', ascending=False)

fig, ax = plt.subplots(figsize=(15, 8))

# Plot the stacked bar chart
bar_width = 0.8
bars = ax.bar(range(len(df)), df['Non-doublet'], color='orange', width=bar_width, label='Non-doublet')
ax.bar(range(len(df)), df['Doublet'], bottom=df['Non-doublet'], color='red', width=bar_width, label='Doublet')

ax.set_xlabel('run_gene_demux')
ax.set_ylabel('Count')
ax.set_title('run_gene_demux Counts with Doublet Proportion')

ax.set_xticks(range(len(df)))
ax.set_xticklabels(df.index, rotation=90, ha='right')

ax.legend()

plt.tight_layout()
plt.show()
In [37]:
adata
Out[37]:
AnnData object with n_obs × n_vars = 621573 × 36601
    obs: 'tags', 'run', 'cmo', 'condition', 'line', 'replicate', 'condition_clean', 'index', 'Consensus_gene_demux', 'run_gene_demux'
    var: 'gene_ids', 'feature_types', 'genome'
In [38]:
adata.obs.head()
Out[38]:
tags run cmo condition line replicate condition_clean index Consensus_gene_demux run_gene_demux
AAACCCAAGAAACCCG-1-501 AAACCCAAGAAACCCG-1 501 303.0 THYROID_AGONIST CTL04E 3 THYROID_AGONIST AAACCCAAGAAACCCG-1 doublet 501
AAACCCAAGGACGCAT-1-501 AAACCCAAGGACGCAT-1 501 306.0 THYROID_AGONIST CTL08A 3 THYROID_AGONIST AAACCCAAGGACGCAT-1 CTL08A 501
AAACCCAAGTTGAAAC-1-501 AAACCCAAGTTGAAAC-1 501 305.0 THYROID_AGONIST CTL08A 2 THYROID_AGONIST AAACCCAAGTTGAAAC-1 CTL08A 501
AAACCCACAACGGGTA-1-501 AAACCCACAACGGGTA-1 501 306.0 THYROID_AGONIST CTL08A 3 THYROID_AGONIST AAACCCACAACGGGTA-1 CTL08A 501
AAACCCACAAGATCCT-1-501 AAACCCACAAGATCCT-1 501 303.0 THYROID_AGONIST CTL04E 3 THYROID_AGONIST AAACCCACAAGATCCT-1 CTL04E 501
In [39]:
print(sorted(list(adata.obs.run.unique())))
[393, 394, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513]
In [40]:
# adata.write_h5ad(os.path.join(output_dir, 'combined_scRNA_seq_data_with_genetic_demux.h5ad'))
In [41]:
print(f"Number of cells before filtering: {adata.n_obs}")
Number of cells before filtering: 621573
In [42]:
adata.obs[adata.obs.run == 508].head()
Out[42]:
tags run cmo condition line replicate condition_clean index Consensus_gene_demux run_gene_demux
AAACCCAAGAGGTCGT-1-508 AAACCCAAGAGGTCGT-1 508 NaN RET_AGONIST NaN NaN RET_AGONIST AAACCCAAGAGGTCGT-1 CTL08A 508
AAACCCAAGATTGGGC-1-508 AAACCCAAGATTGGGC-1 508 NaN RET_AGONIST NaN NaN RET_AGONIST AAACCCAAGATTGGGC-1 doublet 508
AAACCCAAGCAATTAG-1-508 AAACCCAAGCAATTAG-1 508 NaN RET_AGONIST NaN NaN RET_AGONIST AAACCCAAGCAATTAG-1 doublet 508
AAACCCAAGCTGCCAC-1-508 AAACCCAAGCTGCCAC-1 508 NaN RET_AGONIST NaN NaN RET_AGONIST AAACCCAAGCTGCCAC-1 doublet 508
AAACCCAAGGATTCCT-1-508 AAACCCAAGGATTCCT-1 508 NaN RET_AGONIST NaN NaN RET_AGONIST AAACCCAAGGATTCCT-1 CTL08A 508
In [43]:
mask = ~adata.obs['Consensus_gene_demux'].isna() & (adata.obs['Consensus_gene_demux'] != '')

adata = adata[mask, :]

print(f"Number of cells after filtering NaN: {adata.n_obs}")
Number of cells after filtering NaN: 621573
In [44]:
mask = adata.obs['Consensus_gene_demux'] != 'LowQuality'

adata = adata[mask, :]

print(f"Number of cells after filtering LowQuality: {adata.n_obs}")
Number of cells after filtering LowQuality: 605046
In [45]:
mask = adata.obs['Consensus_gene_demux'] != 'doublet'

adata = adata[mask, :]

print(f"Number of cells after filtering doublet: {adata.n_obs}")
Number of cells after filtering doublet: 494164
In [46]:
adata.obs.head()
Out[46]:
tags run cmo condition line replicate condition_clean index Consensus_gene_demux run_gene_demux
AAACCCAAGGACGCAT-1-501 AAACCCAAGGACGCAT-1 501 306.0 THYROID_AGONIST CTL08A 3 THYROID_AGONIST AAACCCAAGGACGCAT-1 CTL08A 501
AAACCCAAGTTGAAAC-1-501 AAACCCAAGTTGAAAC-1 501 305.0 THYROID_AGONIST CTL08A 2 THYROID_AGONIST AAACCCAAGTTGAAAC-1 CTL08A 501
AAACCCACAACGGGTA-1-501 AAACCCACAACGGGTA-1 501 306.0 THYROID_AGONIST CTL08A 3 THYROID_AGONIST AAACCCACAACGGGTA-1 CTL08A 501
AAACCCACAAGATCCT-1-501 AAACCCACAAGATCCT-1 501 303.0 THYROID_AGONIST CTL04E 3 THYROID_AGONIST AAACCCACAAGATCCT-1 CTL04E 501
AAACCCACAATCAAGA-1-501 AAACCCACAATCAAGA-1 501 301.0 THYROID_AGONIST CTL04E 1 THYROID_AGONIST AAACCCACAATCAAGA-1 CTL04E 501
In [47]:
print(sorted(list(adata.obs.run.unique())))
[393, 394, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513]
In [48]:
adata.obs.Consensus_gene_demux.unique()
Out[48]:
array(['CTL08A', 'CTL04E', 'CTL01A', 'CTL02A'], dtype=object)
In [49]:
# adata.write_h5ad(os.path.join(output_dir, 'combined_scRNA_seq_data_with_genetic_demux_filtered.h5ad'))

5. Check data consistency¶

In [128]:
# adata = ad.read_h5ad(os.path.join(output_dir, 'combined_scRNA_seq_data_with_genetic_demux_filtered.h5ad'))
In [143]:
adata[adata.obs['Consensus_gene_demux'].isin(['CTL01A', 'CTL02A'])].obs.head()
Out[143]:
tags run cmo condition line replicate condition_clean index Consensus_gene_demux run_gene_demux is_consistent
AAACCCAAGCACTCAT-1-393 AAACCCAAGCACTCAT-1 393 NaN MTR_393_3GEX NaN NaN NaN AAACCCAAGCACTCAT-1 CTL01A 393 True
AAACCCAAGCCTGCCA-1-393 AAACCCAAGCCTGCCA-1 393 NaN MTR_393_3GEX NaN NaN NaN AAACCCAAGCCTGCCA-1 CTL01A 393 True
AAACCCAAGGGATGTC-1-393 AAACCCAAGGGATGTC-1 393 NaN MTR_393_3GEX NaN NaN NaN AAACCCAAGGGATGTC-1 CTL01A 393 True
AAACCCACAATCGCAT-1-393 AAACCCACAATCGCAT-1 393 NaN MTR_393_3GEX NaN NaN NaN AAACCCACAATCGCAT-1 CTL02A 393 True
AAACCCACAGCTGAAG-1-393 AAACCCACAGCTGAAG-1 393 NaN MTR_393_3GEX NaN NaN NaN AAACCCACAGCTGAAG-1 CTL01A 393 True
In [144]:
adata[adata.obs.run == 508].obs.head()
Out[144]:
tags run cmo condition line replicate condition_clean index Consensus_gene_demux run_gene_demux is_consistent
AAACCCAAGAGGTCGT-1-508 AAACCCAAGAGGTCGT-1 508 NaN RET_AGONIST NaN NaN RET_AGONIST AAACCCAAGAGGTCGT-1 CTL08A 508 True
AAACCCAAGGATTCCT-1-508 AAACCCAAGGATTCCT-1 508 NaN RET_AGONIST NaN NaN RET_AGONIST AAACCCAAGGATTCCT-1 CTL08A 508 True
AAACCCAAGGGAACAA-1-508 AAACCCAAGGGAACAA-1 508 NaN RET_AGONIST NaN NaN RET_AGONIST AAACCCAAGGGAACAA-1 CTL04E 508 True
AAACCCAAGGTTAAAC-1-508 AAACCCAAGGTTAAAC-1 508 NaN RET_AGONIST NaN NaN RET_AGONIST AAACCCAAGGTTAAAC-1 CTL08A 508 True
AAACCCAAGTCGTTAC-1-508 AAACCCAAGTCGTTAC-1 508 NaN RET_AGONIST NaN NaN RET_AGONIST AAACCCAAGTCGTTAC-1 CTL04E 508 True
In [59]:
def check_consistency(df):
    # Create a mask for rows to include in the consistency check
    include_mask = ~df.run.isin([508,393,394])
    
    run_consistent = (df['run'] == df['run_gene_demux'])

    tag_consistent = (df['tags'] == df['index'])
    
    # Convert categorical columns to string for comparison
    line_str = df['line'].astype(str)
    consensus_str = df['Consensus_gene_demux'].astype(str)
    
    line_consistent = ((line_str == consensus_str) | ~include_mask)
    
    return run_consistent & line_consistent & tag_consistent
In [60]:
# consistency check
adata.obs['is_consistent'] = check_consistency(adata.obs)

# Count inconsistencies and calculate consistency percentage
total_count = len(adata.obs)
included_count = (~adata.obs.run.isin([508,393,394])).sum()
inconsistent_count = (~adata.obs['is_consistent']).sum()
consistency_percentage = (adata.obs['is_consistent'].sum()/ total_count) * 100

print(f"Total rows: {total_count}")
print(f"Rows included in consistency check: {included_count}")
print(f"Inconsistent rows: {inconsistent_count}")
print(f"Consistency percentage: {consistency_percentage:.2f}%")
Total rows: 494164
Rows included in consistency check: 409234
Inconsistent rows: 13664
Consistency percentage: 97.23%
In [61]:
none_count = adata.obs['is_consistent'].isna().sum()
print(f"Number of None values remaining: {none_count}")
Number of None values remaining: 0
In [62]:
adata.obs['is_consistent'] = adata.obs['is_consistent'].fillna(True)
In [63]:
adata.obs[~adata.obs['is_consistent']].head()
Out[63]:
tags run cmo condition line replicate condition_clean index Consensus_gene_demux run_gene_demux is_consistent
AAACGAAAGGGAGGGT-1-501 AAACGAAAGGGAGGGT-1 501 302.0 THYROID_AGONIST CTL04E 2 THYROID_AGONIST AAACGAAAGGGAGGGT-1 CTL08A 501 False
AAACGAACAGCGAGTA-1-501 AAACGAACAGCGAGTA-1 501 303.0 THYROID_AGONIST CTL04E 3 THYROID_AGONIST AAACGAACAGCGAGTA-1 CTL08A 501 False
AAACGCTTCCCAGTGG-1-501 AAACGCTTCCCAGTGG-1 501 305.0 THYROID_AGONIST CTL08A 2 THYROID_AGONIST AAACGCTTCCCAGTGG-1 CTL04E 501 False
AAAGGGCGTACGGGAT-1-501 AAAGGGCGTACGGGAT-1 501 305.0 THYROID_AGONIST CTL08A 2 THYROID_AGONIST AAAGGGCGTACGGGAT-1 CTL04E 501 False
AAAGTCCCAAAGACTA-1-501 AAAGTCCCAAAGACTA-1 501 306.0 THYROID_AGONIST CTL08A 3 THYROID_AGONIST AAAGTCCCAAAGACTA-1 CTL04E 501 False
In [64]:
print(adata.obs.Consensus_gene_demux.unique())
print(adata.obs.line.unique())
['CTL08A' 'CTL04E' 'CTL01A' 'CTL02A']
['CTL08A', 'CTL04E', NaN]
Categories (2, object): ['CTL04E', 'CTL08A']
In [80]:
if inconsistent_count > 0:
    # Analyze types of inconsistencies
    run_mismatch = adata.obs[adata.obs['run'] != adata.obs['run_gene_demux']]

    tag_mismatch = adata.obs[adata.obs['tags'] != adata.obs['index']]
    
    # Convert categorical columns to string for comparison
    line_str = adata.obs['line'].astype(str)
    consensus_str = adata.obs['Consensus_gene_demux'].astype(str)
    
    # Exclude 'CTL01A' and 'CTL02A' from line mismatch check
    line_mismatch = adata.obs[
        (line_str != consensus_str) & 
        ~(adata.obs['Consensus_gene_demux'].isin(['CTL01A', 'CTL02A']) | (adata.obs.run == 508))
    ]
    
    print(f"Rows with mismatched run: {len(run_mismatch)}")
    print(f"Rows with mismatched tags: {len(tag_mismatch)}")
    print(f"Rows with mismatched line/Consensus: {len(line_mismatch)}")
Rows with mismatched run: 0
Rows with mismatched tags: 0
Rows with mismatched line/Consensus: 47480
In [81]:
n_cells_before = adata.n_obs

# Filter the AnnData object to keep only consistent records
adata_filtered = adata[adata.obs['is_consistent']]

n_cells_after = adata_filtered.n_obs

print(f"Number of cells before filtering: {n_cells_before}")
print(f"Number of cells after filtering: {n_cells_after}")
print(f"Number of cells removed: {n_cells_before - n_cells_after}")
print(f"Percentage of cells kept: {(n_cells_after / n_cells_before) * 100:.2f}%")
Number of cells before filtering: 494164
Number of cells after filtering: 480500
Number of cells removed: 13664
Percentage of cells kept: 97.23%
In [82]:
adata_filtered.obs.head()
Out[82]:
tags run cmo condition line replicate condition_clean index Consensus_gene_demux run_gene_demux is_consistent
AAACCCAAGGACGCAT-1-501 AAACCCAAGGACGCAT-1 501 306.0 THYROID_AGONIST CTL08A 3 THYROID_AGONIST AAACCCAAGGACGCAT-1 CTL08A 501 True
AAACCCAAGTTGAAAC-1-501 AAACCCAAGTTGAAAC-1 501 305.0 THYROID_AGONIST CTL08A 2 THYROID_AGONIST AAACCCAAGTTGAAAC-1 CTL08A 501 True
AAACCCACAACGGGTA-1-501 AAACCCACAACGGGTA-1 501 306.0 THYROID_AGONIST CTL08A 3 THYROID_AGONIST AAACCCACAACGGGTA-1 CTL08A 501 True
AAACCCACAAGATCCT-1-501 AAACCCACAAGATCCT-1 501 303.0 THYROID_AGONIST CTL04E 3 THYROID_AGONIST AAACCCACAAGATCCT-1 CTL04E 501 True
AAACCCACAATCAAGA-1-501 AAACCCACAATCAAGA-1 501 301.0 THYROID_AGONIST CTL04E 1 THYROID_AGONIST AAACCCACAATCAAGA-1 CTL04E 501 True
In [83]:
adata_filtered.obs = adata_filtered.obs.drop(columns=['is_consistent'])
adata_filtered.obs = adata_filtered.obs.drop(columns=['index'])
adata_filtered.obs = adata_filtered.obs.drop(columns=['run'])
adata_filtered.obs = adata_filtered.obs.drop(columns=['line'])
In [84]:
adata_filtered.obs.head()
Out[84]:
tags cmo condition replicate condition_clean Consensus_gene_demux run_gene_demux
AAACCCAAGGACGCAT-1-501 AAACCCAAGGACGCAT-1 306.0 THYROID_AGONIST 3 THYROID_AGONIST CTL08A 501
AAACCCAAGTTGAAAC-1-501 AAACCCAAGTTGAAAC-1 305.0 THYROID_AGONIST 2 THYROID_AGONIST CTL08A 501
AAACCCACAACGGGTA-1-501 AAACCCACAACGGGTA-1 306.0 THYROID_AGONIST 3 THYROID_AGONIST CTL08A 501
AAACCCACAAGATCCT-1-501 AAACCCACAAGATCCT-1 303.0 THYROID_AGONIST 3 THYROID_AGONIST CTL04E 501
AAACCCACAATCAAGA-1-501 AAACCCACAATCAAGA-1 301.0 THYROID_AGONIST 1 THYROID_AGONIST CTL04E 501
In [85]:
adata_filtered.obs = adata_filtered.obs.rename(columns={
    'Consensus_gene_demux': 'line', 
    'run_gene_demux': 'run'
    })
In [86]:
adata_filtered.obs['cmo'] = pd.to_numeric(adata_filtered.obs['cmo'], errors='coerce')
    
# Replace NaN with the specified value
adata_filtered.obs['cmo'] = adata_filtered.obs['cmo'].fillna(-1)

# Convert to integer
adata_filtered.obs['cmo'] = adata_filtered.obs['cmo'].astype(int)

# Print some information about the conversion
print(f"Unique values in 'cmo' after conversion: {adata_filtered.obs['cmo'].unique()}")
print(f"Number of -1 values (originally NaN): {(adata_filtered.obs['cmo'] == -1).sum()}")
Unique values in 'cmo' after conversion: [306 305 303 301 302 304 312 310 309 308 307 311  -1]
Number of -1 values (originally NaN): 84930
In [87]:
adata_filtered[adata_filtered.obs.cmo==-1].obs.head()
Out[87]:
tags cmo condition replicate condition_clean line run
AAACCCAAGAGGTCGT-1-508 AAACCCAAGAGGTCGT-1 -1 RET_AGONIST NaN RET_AGONIST CTL08A 508
AAACCCAAGGATTCCT-1-508 AAACCCAAGGATTCCT-1 -1 RET_AGONIST NaN RET_AGONIST CTL08A 508
AAACCCAAGGGAACAA-1-508 AAACCCAAGGGAACAA-1 -1 RET_AGONIST NaN RET_AGONIST CTL04E 508
AAACCCAAGGTTAAAC-1-508 AAACCCAAGGTTAAAC-1 -1 RET_AGONIST NaN RET_AGONIST CTL08A 508
AAACCCAAGTCGTTAC-1-508 AAACCCAAGTCGTTAC-1 -1 RET_AGONIST NaN RET_AGONIST CTL04E 508
In [88]:
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

axes = axes.flatten()

columns = ['run', 'line', 'condition', 'replicate']

for ax, column in zip(axes, columns):
    adata_filtered.obs[column].value_counts().plot.bar(ax=ax, color='gray')
    ax.set_title(column)
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=90)

plt.tight_layout()
plt.show()
In [89]:
adata_filtered
Out[89]:
AnnData object with n_obs × n_vars = 480500 × 36601
    obs: 'tags', 'cmo', 'condition', 'replicate', 'condition_clean', 'line', 'run'
    var: 'gene_ids', 'feature_types', 'genome'
In [90]:
adata_filtered.write_h5ad('./adatas/combined_scRNA_seq_data_with_genetic_demux_filtered_consistent.h5ad')