##############
## input: input-run-MTR_*.csv
## output: combined_scRNA_seq_data.h5ad, combined_scRNA_seq_data_39X.h5ad
##############
import os
import re
import csv
import glob
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
from functools import partial
from concurrent.futures import ProcessPoolExecutor, as_completed
# Cluster paths: analysis outputs, cellranger-multi metadata sheets,
# filtered count matrices, MULTI-seq classifications, and the CMO sheet.
output_dir = "/group/testa/Project/EndPoints/scRNASeq/preAnalysis/output"
meta_dir = "/group/testa/Project/EndPoints/2024_singleCell/nextflow/output/cellranger-multi"
data_dir = "/group/testa/Project/EndPoints/2024_singleCell/nextflow/output/cellranger-multi/filtered_matrix"
classification_dir = '/group/testa/Project/EndPoints/2024_singleCell/multiseq/classification'
cmo_dir = "/group/testa/Project/EndPoints/scRNASeq/preAnalysis/"
def load_csv_files(classification_dir):
    """Load every ``*_multiseq_class.csv`` (tab-separated) in a directory.

    The leading underscore-delimited token of each file name is taken as
    the run number; results are keyed ``run_<number>``.

    Returns:
        dict[str, pandas.DataFrame]: one DataFrame per classification file.
    """
    pattern = os.path.join(classification_dir, '*_multiseq_class.csv')
    dataframes = {}
    for path in glob.glob(pattern):
        stem = os.path.basename(path).split('.')[0]
        run_number = stem.split('_')[0]
        # Files carry a .csv extension but are actually tab-delimited.
        dataframes[f"run_{run_number}"] = pd.read_csv(path, sep='\t')
    return dataframes
def extract_condition_type(file_path):
    """Parse a cellranger-multi sample sheet into per-sample metadata lists.

    Rows are scanned until one whose first field is ``sample_id``; every
    subsequent non-empty row is a sample whose first field has the form
    ``<condition>_<line>_<replicate>`` and whose second field is the CMO tag.

    Returns:
        list: ``[conditions, lines, replicates, cmos]`` — four parallel lists.
    """
    conditions, lines, replicates, cmos = [], [], [], []
    with open(file_path, 'r') as handle:
        in_samples = False
        for row in csv.reader(handle):
            if row and row[0] == 'sample_id':
                # Header row of the [samples] section; data starts next row.
                in_samples = True
            elif in_samples and row:
                parts = row[0].split('_')
                # Condition may itself contain underscores, so take it as
                # everything up to the last two tokens (line, replicate).
                conditions.append('_'.join(parts[:-2]))
                lines.append(parts[-2])
                replicates.append(parts[-1])
                cmos.append(row[1])
    return [conditions, lines, replicates, cmos]
def process_run(run_dir):
    """Load one run's ``matrix.h5ad`` and suffix cell names with the run id.

    The obs index becomes ``<barcode>-<run>`` so cells remain unique after
    runs are concatenated. Returns the AnnData, or ``None`` on any failure
    (errors are printed, not raised, so one bad run doesn't kill the batch).
    """
    try:
        adata = sc.read_h5ad(os.path.join(run_dir, 'matrix.h5ad'))
        run_name = os.path.basename(run_dir).split('_')[-1]
        adata.obs["run"] = run_name
        obs = adata.obs.reset_index().rename(columns={'index': 'tags'})
        obs['index'] = obs['tags'].astype(str) + '-' + obs['run']
        adata.obs = obs.set_index('index')
        adata.var_names_make_unique()
        adata.obs_names_make_unique()
        print(f"Processed run: {run_name}, Shape: {adata.shape}")
        return adata
    except Exception as e:
        print(f"Error processing {run_dir}: {str(e)}")
        return None
def process_runs(run_dirs, max_workers=None):
    """Load all run directories in parallel worker processes.

    Each directory is handed to :func:`process_run`; failed runs are
    reported and skipped so the rest of the batch still completes.

    Returns:
        list: the successfully loaded AnnData objects (arbitrary order).
    """
    loaded = []
    with ProcessPoolExecutor(max_workers=max_workers) as pool:
        pending = {pool.submit(process_run, d): d for d in run_dirs}
        for fut in as_completed(pending):
            source_dir = pending[fut]
            try:
                adata = fut.result()
            except Exception as e:
                print(f"Error processing {source_dir}: {str(e)}")
            else:
                if adata is not None:
                    loaded.append(adata)
    print(f"Number of processed runs: {len(loaded)}")
    return loaded
final_call | RescuedStatus | CMO | tags | run | |
---|---|---|---|---|---|
AAACCCAAGAAACCCG-1 | CMO303 | Not_Resc | 303 | AAACCCAAGAAACCCG-1 | 501 |
AAACCCAAGGACGCAT-1 | CMO306 | Not_Resc | 306 | AAACCCAAGGACGCAT-1 | 501 |
AAACCCAAGTTGAAAC-1 | CMO305 | Not_Resc | 305 | AAACCCAAGTTGAAAC-1 | 501 |
AAACCCACAACGGGTA-1 | CMO306 | Not_Resc | 306 | AAACCCACAACGGGTA-1 | 501 |
AAACCCACAAGATCCT-1 | CMO303 | Not_Resc | 303 | AAACCCACAAGATCCT-1 | 501 |
# Load all per-run MULTI-seq classification tables, keyed 'run_<n>'.
classification_data = load_csv_files(classification_dir)
print(sorted(list(classification_data.keys())))
['run_491', 'run_492', 'run_493', 'run_494', 'run_495', 'run_496', 'run_497', 'run_498', 'run_499', 'run_500', 'run_501', 'run_502', 'run_503', 'run_504', 'run_505', 'run_506', 'run_507', 'run_509', 'run_510', 'run_511', 'run_512', 'run_513']
classification_data["run_501"].head()
final.calls.rescued | RescuedStatus | |
---|---|---|
AAACCCAAGAAACCCG-1 | CMO303 | Not_Resc |
AAACCCAAGAACTGAT-1 | Doublet | Not_Resc |
AAACCCAAGCGCCGTT-1 | Doublet | Not_Resc |
AAACCCAAGGACGCAT-1 | CMO306 | Not_Resc |
AAACCCAAGTACGAGC-1 | Doublet | Not_Resc |
# Normalise each per-run table: rename the call column, drop Doublet /
# Negative cells, and derive numeric CMO plus tags / run columns.
for run_name, df in classification_data.items():
    if 'final.calls.rescued' in df.columns:
        df = df.rename(columns={'final.calls.rescued': 'final_call'})
        # .copy() so the later column assignments don't hit a slice view.
        df = df[~df.final_call.isin(['Doublet', 'Negative'])].copy()
        # Raw string: '\d' in a plain literal is an invalid escape (py3.12+).
        df['CMO'] = df['final_call'].str.extract(r'CMO(\d+)')
        df['CMO'] = pd.to_numeric(df['CMO'], errors='coerce').astype('Int64')
        df['tags'] = df.index
        df['run'] = run_name.split('_')[-1]
        classification_data[run_name] = df
    else:
        # BUG FIX: the original message named 'final_call', but the column
        # actually being checked is 'final.calls.rescued'.
        print(f"Warning: 'final.calls.rescued' column not found in DataFrame for {run_name}")
Warning: 'final_call' column not found in DataFrame for run_509
classification_data["run_509"].head()
x | |
---|---|
AAACCCAAGACACACG-1 | Doublet |
AAACCCAAGATTGGGC-1 | CMO306 |
AAACCCAAGCACACAG-1 | CMO303 |
AAACCCAAGCCGCTTG-1 | CMO303 |
AAACCCAAGCCTATTG-1 | CMO302 |
# Run 509's table names its call column 'x' instead of
# 'final.calls.rescued'; normalise it the same way as the other runs.
to_fix = ["run_509"]
for run_name in to_fix:
    print(run_name)
    df = classification_data[run_name]
    if 'x' in df.columns:
        df = df.rename(columns={'x': 'final_call'})
        # .copy() so the later column assignments don't hit a slice view.
        df = df[~df.final_call.isin(['Doublet', 'Negative'])].copy()
        # Raw string: '\d' in a plain literal is an invalid escape (py3.12+).
        df['CMO'] = df['final_call'].str.extract(r'CMO(\d+)')
        df['CMO'] = pd.to_numeric(df['CMO'], errors='coerce').astype('Int64')
        df['tags'] = df.index
        df['run'] = run_name.split('_')[-1]
        classification_data[run_name] = df
    else:
        # BUG FIX: the original message named 'final_call', but the column
        # actually being checked here is 'x'.
        print(f"Warning: 'x' column not found in DataFrame for {run_name}")
run_509
# Confirm run_509 now matches the common layout.
classification_data["run_509"].head()
# del classification_data["run_509"]
final_call | CMO | tags | run | |
---|---|---|---|---|
AAACCCAAGATTGGGC-1 | CMO306 | 306 | AAACCCAAGATTGGGC-1 | 509 |
AAACCCAAGCACACAG-1 | CMO303 | 303 | AAACCCAAGCACACAG-1 | 509 |
AAACCCAAGCCGCTTG-1 | CMO303 | 303 | AAACCCAAGCCGCTTG-1 | 509 |
AAACCCAAGCCTATTG-1 | CMO302 | 302 | AAACCCAAGCCTATTG-1 | 509 |
AAACCCAAGGACAAGA-1 | CMO304 | 304 | AAACCCAAGGACAAGA-1 | 509 |
classification_data["run_501"].head()
final_call | RescuedStatus | CMO | tags | run | |
---|---|---|---|---|---|
AAACCCAAGAAACCCG-1 | CMO303 | Not_Resc | 303 | AAACCCAAGAAACCCG-1 | 501 |
AAACCCAAGGACGCAT-1 | CMO306 | Not_Resc | 306 | AAACCCAAGGACGCAT-1 | 501 |
AAACCCAAGTTGAAAC-1 | CMO305 | Not_Resc | 305 | AAACCCAAGTTGAAAC-1 | 501 |
AAACCCACAACGGGTA-1 | CMO306 | Not_Resc | 306 | AAACCCACAACGGGTA-1 | 501 |
AAACCCACAAGATCCT-1 | CMO303 | Not_Resc | 303 | AAACCCACAAGATCCT-1 | 501 |
# Stack all per-run tables into one DataFrame; the concat keys only label
# the outer index level and are immediately discarded by reset_index.
classification_data = pd.concat(classification_data.values(), keys=classification_data.keys(), ignore_index=False)
classification_data = classification_data.reset_index(drop=True)
classification_data.head()
final_call | RescuedStatus | CMO | tags | run | |
---|---|---|---|---|---|
0 | CMO309 | Not_Resc | 309 | AAACCCAAGCTCAGAG-1 | 510 |
1 | CMO310 | Not_Resc | 310 | AAACCCAAGGAGCTGT-1 | 510 |
2 | CMO312 | Not_Resc | 312 | AAACCCAAGGCACAAC-1 | 510 |
3 | CMO309 | Not_Resc | 309 | AAACCCAAGGGTTGCA-1 | 510 |
4 | CMO309 | Not_Resc | 309 | AAACCCACAAGTGGAC-1 | 510 |
# Only 'CMO' exists at this point; 'CONDITION'/'CELL LINE' are included
# defensively for sheets that might carry them.
classification_data = classification_data.rename(columns={'CONDITION': 'condition', 'CELL LINE': 'line', 'CMO': 'cmo'})
classification_data['run'] = pd.to_numeric(classification_data['run'], errors='coerce').astype('Int64')
classification_data.shape
(492061, 5)
# Compare runs seen in the data against the expected 491-513 range
# (run 508 is absent from the classifications).
extracted_runs = sorted(list(classification_data.run.unique()))
print(extracted_runs)
valid_range = list(range(491, 514))
print(valid_range)
[491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 509, 510, 511, 512, 513] [491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513]
# Bar plot of classified-cell counts per run.
sorted_runs = sorted(classification_data['run'].unique())
classification_data['run'].value_counts().reindex(sorted_runs).plot.bar(color=['#279e68', '#d62728', '#ff7f0e', '#1f77b4'])
<Axes: xlabel='run'>
classification_data.head()
final_call | RescuedStatus | cmo | tags | run | |
---|---|---|---|---|---|
0 | CMO309 | Not_Resc | 309 | AAACCCAAGCTCAGAG-1 | 510 |
1 | CMO310 | Not_Resc | 310 | AAACCCAAGGAGCTGT-1 | 510 |
2 | CMO312 | Not_Resc | 312 | AAACCCAAGGCACAAC-1 | 510 |
3 | CMO309 | Not_Resc | 309 | AAACCCAAGGGTTGCA-1 | 510 |
4 | CMO309 | Not_Resc | 309 | AAACCCACAAGTGGAC-1 | 510 |
# Build the '<barcode>-<run>' key used later to align cells with the
# combined AnnData, keeping it as a regular (first) column.
# The original re-checked index.name and conditionally called set_index
# again — dead code (set_index consumes the column, and the condition can
# never be true right after set_index); removed.
classification_data['unique_index'] = classification_data['tags'].astype(str) + '-' + classification_data['run'].astype(str)
classification_data = classification_data.set_index('unique_index').reset_index()
classification_data.head()
unique_index | final_call | RescuedStatus | cmo | tags | run | |
---|---|---|---|---|---|---|
0 | AAACCCAAGCTCAGAG-1-510 | CMO309 | Not_Resc | 309 | AAACCCAAGCTCAGAG-1 | 510 |
1 | AAACCCAAGGAGCTGT-1-510 | CMO310 | Not_Resc | 310 | AAACCCAAGGAGCTGT-1 | 510 |
2 | AAACCCAAGGCACAAC-1-510 | CMO312 | Not_Resc | 312 | AAACCCAAGGCACAAC-1 | 510 |
3 | AAACCCAAGGGTTGCA-1-510 | CMO309 | Not_Resc | 309 | AAACCCAAGGGTTGCA-1 | 510 |
4 | AAACCCACAAGTGGAC-1-510 | CMO309 | Not_Resc | 309 | AAACCCACAAGTGGAC-1 | 510 |
# Duplicate keys would make later index-based joins ambiguous; report
# them (actual deduplication is intentionally left disabled).
if classification_data.index.duplicated().any():
    duplicate_count = classification_data.index.duplicated().sum()
    print(f"Warning: {duplicate_count} duplicate indices found in classification_data.")
CONDITION | CELL LINE | MIX Name | CMO | |
---|---|---|---|---|
0 | ESTROGEN AGONIST | CTL04E | MIX 1 | 301 |
1 | ESTROGEN AGONIST | CTL04E | MIX 1 | 302 |
2 | ESTROGEN AGONIST | CTL04E | MIX 1 | 303 |
3 | ESTROGEN AGONIST | CTL08A | MIX 1 | 304 |
4 | ESTROGEN AGONIST | CTL08A | MIX 1 | 305 |
# Load the CMO -> condition / cell-line mapping sheet.
cmo_file_path = os.path.join(cmo_dir, "single_cell_CMO_sheet.xlsx")
cmo_mapping = pd.read_excel(cmo_file_path)
cmo_mapping['CMO'] = pd.to_numeric(cmo_mapping['CMO'], errors='coerce').astype('Int64')
cmo_mapping.head()
CONDITION | CELL LINE | MIX Name | CMO | |
---|---|---|---|---|
0 | ESTROGEN AGONIST | CTL04E | MIX 1 | 301 |
1 | ESTROGEN AGONIST | CTL04E | MIX 1 | 302 |
2 | ESTROGEN AGONIST | CTL04E | MIX 1 | 303 |
3 | ESTROGEN AGONIST | CTL08A | MIX 1 | 304 |
4 | ESTROGEN AGONIST | CTL08A | MIX 1 | 305 |
# Quick look at the sheet's categories; note one CMO value parses to <NA>.
cmo_mapping["MIX Name"].unique()
cmo_mapping["CMO"].unique()
<IntegerArray> [301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, <NA>] Length: 13, dtype: Int64
# Collect one metadata sheet per sequencing run.
all_data = []
meta_dirs = glob.glob(os.path.join(meta_dir, 'input-run-MTR_*.csv'))
# Print just the trailing 'MTR_<run>.csv' portion of each path.
print(sorted([path.split('/')[-1].split('-')[-1] for path in meta_dirs]))
['MTR_491.csv', 'MTR_492.csv', 'MTR_493.csv', 'MTR_494.csv', 'MTR_495.csv', 'MTR_496.csv', 'MTR_497.csv', 'MTR_498.csv', 'MTR_499.csv', 'MTR_500.csv', 'MTR_501.csv', 'MTR_502.csv', 'MTR_503.csv', 'MTR_504.csv', 'MTR_505.csv', 'MTR_506.csv', 'MTR_507.csv', 'MTR_508.csv', 'MTR_509.csv', 'MTR_510.csv', 'MTR_511.csv', 'MTR_512.csv', 'MTR_513.csv']
# Build a per-(run, cmo) metadata table from every input-run-MTR_*.csv.
# BUG FIX: the original looped with the name `meta_dir`, shadowing the
# directory path defined at the top of the file; a distinct name is used.
# Rows are accumulated in a list and the DataFrame built once, instead of
# the original quadratic pd.concat-per-row. (pandas is already imported.)
rows = []
for meta_file in meta_dirs:
    print(f"Processing file: {meta_file}")
    run = meta_file.split('_')[-1].split('.')[0]
    print(f"Run: {run}")
    # Read metadata and extract condition / line / replicate / cmo lists.
    meta_data = extract_condition_type(meta_file)
    print(f"Meta data: {meta_data}")
    # Ensure meta_data is a list of lists
    if isinstance(meta_data[0], str):
        conditions, lines, replicates, cmos = [meta_data[0]], [meta_data[1]], [meta_data[2]], [meta_data[3]]
    else:
        conditions, lines, replicates, cmos = meta_data
    if not conditions:
        print("Warning: No condition per run")
        continue
    elif len(dict.fromkeys(conditions)) > 1:
        # Mixed conditions within one run are not representable; blank them.
        print("Warning: More than one condition per run")
        conditions = [None] * len(conditions)
    for c, l, r, cm in zip(conditions, lines, replicates, cmos):
        rows.append({"run": run, "condition": c, "line": l, "replicate": r, "cmo": cm})
run_condition = pd.DataFrame(rows, columns=["run", "condition", "line", "replicate", "cmo"])
print(run_condition)
Processing file: /group/testa/Project/EndPoints/2024_singleCell/nextflow/output/cellranger-multi/input-run-MTR_493.csv Run: 493 Meta data: [['ANDROGEN_AGONIST', 'ANDROGEN_AGONIST', 'ANDROGEN_AGONIST', 'ANDROGEN_AGONIST', 'ANDROGEN_AGONIST', 'ANDROGEN_AGONIST'], ['CTL04E', 'CTL04E', 'CTL04E', 'CTL08A', 'CTL08A', 'CTL08A'], ['1', '2', '3', '1', '2', '3'], ['CMO301', 'CMO302', 'CMO303', 'CMO304', 'CMO305', 'CMO306']] Processing file: /group/testa/Project/EndPoints/2024_singleCell/nextflow/output/cellranger-multi/input-run-MTR_507.csv Run: 507 Meta data: [['MEP', 'MEP', 'MEP', 'MEP', 'MEP', 'MEP'], ['CTL04E', 'CTL04E', 'CTL04E', 'CTL08A', 'CTL08A', 'CTL08A'], ['1', '2', '3', '1', '2', '3'], ['CMO301', 'CMO302', 'CMO303', 'CMO304', 'CMO305', 'CMO306']] Processing file: /group/testa/Project/EndPoints/2024_singleCell/nextflow/output/cellranger-multi/input-run-MTR_491.csv Run: 491 Meta data: [['ESTROGEN_AGONIST', 'ESTROGEN_AGONIST', 'ESTROGEN_AGONIST', 'ESTROGEN_AGONIST', 'ESTROGEN_AGONIST', 'ESTROGEN_AGONIST'], ['CTL04E', 'CTL04E', 'CTL04E', 'CTL08A', 'CTL08A', 'CTL08A'], ['1', '2', '3', '1', '2', '3'], ['CMO301', 'CMO302', 'CMO303', 'CMO304', 'CMO305', 'CMO306']] Processing file: /group/testa/Project/EndPoints/2024_singleCell/nextflow/output/cellranger-multi/input-run-MTR_498.csv Run: 498 Meta data: [['LIVER-X_ANTAGONIST', 'LIVER-X_ANTAGONIST', 'LIVER-X_ANTAGONIST', 'LIVER-X_ANTAGONIST', 'LIVER-X_ANTAGONIST', 'LIVER-X_ANTAGONIST'], ['CTL04E', 'CTL04E', 'CTL04E', 'CTL08A', 'CTL08A', 'CTL08A'], ['1', '2', '3', '1', '2', '3'], ['CMO307', 'CMO308', 'CMO309', 'CMO310', 'CMO311', 'CMO312']] Processing file: /group/testa/Project/EndPoints/2024_singleCell/nextflow/output/cellranger-multi/input-run-MTR_492.csv Run: 492 Meta data: [['ESTROGEN_ANTAGONIST', 'ESTROGEN_ANTAGONIST', 'ESTROGEN_ANTAGONIST', 'ESTROGEN_ANTAGONIST', 'ESTROGEN_ANTAGONIST', 'ESTROGEN_ANTAGONIST'], ['CTL04E', 'CTL04E', 'CTL04E', 'CTL08A', 'CTL08A', 'CTL08A'], ['1', '2', '3', '1', '2', '3'], ['CMO301', 
'CMO302', 'CMO303', 'CMO304', 'CMO305', 'CMO306']] Processing file: /group/testa/Project/EndPoints/2024_singleCell/nextflow/output/cellranger-multi/input-run-MTR_511.csv Run: 511 Meta data: [['MBzP', 'MBzP', 'MBzP', 'MBzP', 'MBzP', 'MBzP'], ['CTL04E', 'CTL04E', 'CTL04E', 'CTL08A', 'CTL08A', 'CTL08A'], ['1', '2', '3', '1', '2', '3'], ['CMO307', 'CMO308', 'CMO309', 'CMO310', 'CMO311', 'CMO312']] Processing file: /group/testa/Project/EndPoints/2024_singleCell/nextflow/output/cellranger-multi/input-run-MTR_510.csv Run: 510 Meta data: [['BPF', 'BPF', 'BPF', 'BPF', 'BPF', 'BPF'], ['CTL04E', 'CTL04E', 'CTL04E', 'CTL08A', 'CTL08A', 'CTL08A'], ['1', '2', '3', '1', '2', '3'], ['CMO307', 'CMO308', 'CMO309', 'CMO310', 'CMO311', 'CMO312']] Processing file: /group/testa/Project/EndPoints/2024_singleCell/nextflow/output/cellranger-multi/input-run-MTR_497.csv Run: 497 Meta data: [['LIVER-X_AGONIST', 'LIVER-X_AGONIST', 'LIVER-X_AGONIST', 'LIVER-X_AGONIST', 'LIVER-X_AGONIST', 'LIVER-X_AGONIST'], ['CTL04E', 'CTL04E', 'CTL04E', 'CTL08A', 'CTL08A', 'CTL08A'], ['1', '2', '3', '1', '2', '3'], ['CMO307', 'CMO308', 'CMO309', 'CMO310', 'CMO311', 'CMO312']] Processing file: /group/testa/Project/EndPoints/2024_singleCell/nextflow/output/cellranger-multi/input-run-MTR_512.csv Run: 512 Meta data: [['TCP', 'TCP', 'TCP', 'TCP', 'TCP', 'TCP'], ['CTL04E', 'CTL04E', 'CTL04E', 'CTL08A', 'CTL08A', 'CTL08A'], ['1', '2', '3', '1', '2', '3'], ['CMO307', 'CMO308', 'CMO309', 'CMO310', 'CMO311', 'CMO312']] Processing file: /group/testa/Project/EndPoints/2024_singleCell/nextflow/output/cellranger-multi/input-run-MTR_509.csv Run: 509 Meta data: [['RET_ANTAGONIST', 'RET_ANTAGONIST', 'RET_ANTAGONIST', 'RET_ANTAGONIST', 'RET_ANTAGONIST', 'RET_ANTAGONIST'], ['CTL04E', 'CTL04E', 'CTL04E', 'CTL08A', 'CTL08A', 'CTL08A'], ['1', '2', '3', '1', '2', '3'], ['CMO301', 'CMO302', 'CMO303', 'CMO304', 'CMO305', 'CMO306']] Processing file: 
/group/testa/Project/EndPoints/2024_singleCell/nextflow/output/cellranger-multi/input-run-MTR_502.csv Run: 502 Meta data: [['THYROID_ANTAGONIST', 'THYROID_ANTAGONIST', 'THYROID_ANTAGONIST', 'THYROID_ANTAGONIST', 'THYROID_ANTAGONIST', 'THYROID_ANTAGONIST'], ['CTL04E', 'CTL04E', 'CTL04E', 'CTL08A', 'CTL08A', 'CTL08A'], ['1', '2', '3', '1', '2', '3'], ['CMO301', 'CMO302', 'CMO303', 'CMO304', 'CMO305', 'CMO306']] Processing file: /group/testa/Project/EndPoints/2024_singleCell/nextflow/output/cellranger-multi/input-run-MTR_496.csv Run: 496 Meta data: [['ARYL_HYD_ANTAGONIST', 'ARYL_HYD_ANTAGONIST', 'ARYL_HYD_ANTAGONIST', 'ARYL_HYD_ANTAGONIST', 'ARYL_HYD_ANTAGONIST', 'ARYL_HYD_ANTAGONIST'], ['CTL04E', 'CTL04E', 'CTL04E', 'CTL08A', 'CTL08A', 'CTL08A'], ['1', '2', '3', '1', '2', '3'], ['CMO307', 'CMO308', 'CMO309', 'CMO310', 'CMO311', 'CMO312']] Processing file: /group/testa/Project/EndPoints/2024_singleCell/nextflow/output/cellranger-multi/input-run-MTR_500.csv Run: 500 Meta data: [['GLUCOCORT_ANTAGONIST', 'GLUCOCORT_ANTAGONIST', 'GLUCOCORT_ANTAGONIST', 'GLUCOCORT_ANTAGONIST', 'GLUCOCORT_ANTAGONIST', 'GLUCOCORT_ANTAGONIST'], ['CTL04E', 'CTL04E', 'CTL04E', 'CTL08A', 'CTL08A', 'CTL08A'], ['1', '2', '3', '1', '2', '3'], ['CMO301', 'CMO302', 'CMO303', 'CMO304', 'CMO305', 'CMO306']] Processing file: /group/testa/Project/EndPoints/2024_singleCell/nextflow/output/cellranger-multi/input-run-MTR_495.csv Run: 495 Meta data: [['ARYL_HYD_AGONIST', 'ARYL_HYD_AGONIST', 'ARYL_HYD_AGONIST', 'ARYL_HYD_AGONIST', 'ARYL_HYD_AGONIST', 'ARYL_HYD_AGONIST'], ['CTL04E', 'CTL04E', 'CTL04E', 'CTL08A', 'CTL08A', 'CTL08A'], ['1', '2', '3', '1', '2', '3'], ['CMO307', 'CMO308', 'CMO309', 'CMO310', 'CMO311', 'CMO312']] Processing file: /group/testa/Project/EndPoints/2024_singleCell/nextflow/output/cellranger-multi/input-run-MTR_506.csv Run: 506 Meta data: [['BPA', 'BPA', 'BPA', 'BPA', 'BPA', 'BPA'], ['CTL04E', 'CTL04E', 'CTL04E', 'CTL08A', 'CTL08A', 'CTL08A'], ['1', '2', '3', '1', '2', '3'], ['CMO301', 
'CMO302', 'CMO303', 'CMO304', 'CMO305', 'CMO306']] Processing file: /group/testa/Project/EndPoints/2024_singleCell/nextflow/output/cellranger-multi/input-run-MTR_503.csv Run: 503 Meta data: [['3-PBA', '3-PBA', '3-PBA', '3-PBA', '3-PBA', '3-PBA'], ['CTL04E', 'CTL04E', 'CTL04E', 'CTL08A', 'CTL08A', 'CTL08A'], ['1', '2', '3', '1', '2', '3'], ['CMO307', 'CMO308', 'CMO309', 'CMO310', 'CMO311', 'CMO312']] Processing file: /group/testa/Project/EndPoints/2024_singleCell/nextflow/output/cellranger-multi/input-run-MTR_508.csv Run: 508 Meta data: [[], [], [], []] Warning: No condition per run Processing file: /group/testa/Project/EndPoints/2024_singleCell/nextflow/output/cellranger-multi/input-run-MTR_499.csv Run: 499 Meta data: [['GLUCOCORT_AGONIST', 'GLUCOCORT_AGONIST', 'GLUCOCORT_AGONIST', 'GLUCOCORT_AGONIST', 'GLUCOCORT_AGONIST', 'GLUCOCORT_AGONIST'], ['CTL04E', 'CTL04E', 'CTL04E', 'CTL08A', 'CTL08A', 'CTL08A'], ['1', '2', '3', '1', '2', '3'], ['CMO301', 'CMO302', 'CMO303', 'CMO304', 'CMO305', 'CMO306']] Processing file: /group/testa/Project/EndPoints/2024_singleCell/nextflow/output/cellranger-multi/input-run-MTR_501.csv Run: 501 Meta data: [['THYROID_AGONIST', 'THYROID_AGONIST', 'THYROID_AGONIST', 'THYROID_AGONIST', 'THYROID_AGONIST', 'THYROID_AGONIST'], ['CTL04E', 'CTL04E', 'CTL04E', 'CTL08A', 'CTL08A', 'CTL08A'], ['1', '2', '3', '1', '2', '3'], ['CMO301', 'CMO302', 'CMO303', 'CMO304', 'CMO305', 'CMO306']] Processing file: /group/testa/Project/EndPoints/2024_singleCell/nextflow/output/cellranger-multi/input-run-MTR_505.csv Run: 505 Meta data: [['DMSO', 'DMSO', 'DMSO', 'DMSO', 'DMSO', 'DMSO'], ['CTL04E', 'CTL04E', 'CTL04E', 'CTL08A', 'CTL08A', 'CTL08A'], ['1', '2', '3', '1', '2', '3'], ['CMO307', 'CMO308', 'CMO309', 'CMO310', 'CMO311', 'CMO312']] Processing file: /group/testa/Project/EndPoints/2024_singleCell/nextflow/output/cellranger-multi/input-run-MTR_504.csv Run: 504 Meta data: [['DPHP', 'DPHP', 'DPHP', 'DPHP', 'DPHP', 'DPHP'], ['CTL04E', 'CTL04E', 'CTL04E', 
'CTL08A', 'CTL08A', 'CTL08A'], ['1', '2', '3', '1', '2', '3'], ['CMO307', 'CMO308', 'CMO309', 'CMO310', 'CMO311', 'CMO312']] Processing file: /group/testa/Project/EndPoints/2024_singleCell/nextflow/output/cellranger-multi/input-run-MTR_513.csv Run: 513 Meta data: [['MIX', 'MIX', 'MIX', 'MIX', 'MIX', 'MIX'], ['CTL04E', 'CTL04E', 'CTL04E', 'CTL08A', 'CTL08A', 'CTL08A'], ['1', '2', '3', '1', '2', '3'], ['CMO307', 'CMO308', 'CMO309', 'CMO310', 'CMO311', 'CMO312']] Processing file: /group/testa/Project/EndPoints/2024_singleCell/nextflow/output/cellranger-multi/input-run-MTR_494.csv Run: 494 Meta data: [['ANDROGEN_ANTAGONIST', 'ANDROGEN_ANTAGONIST', 'ANDROGEN_ANTAGONIST', 'ANDROGEN_ANTAGONIST', 'ANDROGEN_ANTAGONIST', 'ANDROGEN_ANTAGONIST'], ['CTL04E', 'CTL04E', 'CTL04E', 'CTL08A', 'CTL08A', 'CTL08A'], ['1', '2', '3', '1', '2', '3'], ['CMO301', 'CMO302', 'CMO303', 'CMO304', 'CMO305', 'CMO306']] run condition line replicate cmo 0 493 ANDROGEN_AGONIST CTL04E 1 CMO301 1 493 ANDROGEN_AGONIST CTL04E 2 CMO302 2 493 ANDROGEN_AGONIST CTL04E 3 CMO303 3 493 ANDROGEN_AGONIST CTL08A 1 CMO304 4 493 ANDROGEN_AGONIST CTL08A 2 CMO305 .. ... ... ... ... ... 127 494 ANDROGEN_ANTAGONIST CTL04E 2 CMO302 128 494 ANDROGEN_ANTAGONIST CTL04E 3 CMO303 129 494 ANDROGEN_ANTAGONIST CTL08A 1 CMO304 130 494 ANDROGEN_ANTAGONIST CTL08A 2 CMO305 131 494 ANDROGEN_ANTAGONIST CTL08A 3 CMO306 [132 rows x 5 columns]
run_condition.head()
run | condition | line | replicate | cmo | |
---|---|---|---|---|---|
0 | 493 | ANDROGEN_AGONIST | CTL04E | 1 | CMO301 |
1 | 493 | ANDROGEN_AGONIST | CTL04E | 2 | CMO302 |
2 | 493 | ANDROGEN_AGONIST | CTL04E | 3 | CMO303 |
3 | 493 | ANDROGEN_AGONIST | CTL08A | 1 | CMO304 |
4 | 493 | ANDROGEN_AGONIST | CTL08A | 2 | CMO305 |
# Convert 'CMO301'-style labels to plain integers.
# Raw string: '\d' in a plain literal is an invalid escape (py3.12+).
run_condition['cmo'] = run_condition['cmo'].str.extract(r'(\d+)').astype(int)
run_condition.head()
run | condition | line | replicate | cmo | |
---|---|---|---|---|---|
0 | 493 | ANDROGEN_AGONIST | CTL04E | 1 | 301 |
1 | 493 | ANDROGEN_AGONIST | CTL04E | 2 | 302 |
2 | 493 | ANDROGEN_AGONIST | CTL04E | 3 | 303 |
3 | 493 | ANDROGEN_AGONIST | CTL08A | 1 | 304 |
4 | 493 | ANDROGEN_AGONIST | CTL08A | 2 | 305 |
run_condition.isnull().values.any()
False
print(sorted(run_condition.run.unique()))
['491', '492', '493', '494', '495', '496', '497', '498', '499', '500', '501', '502', '503', '504', '505', '506', '507', '509', '510', '511', '512', '513']
run_condition.shape
(132, 5)
# Cast run ids to nullable ints, then report any expected run (491-513)
# with no metadata.
run_condition['run'] = pd.to_numeric(run_condition['run'], errors='coerce').astype('Int64')
extracted_runs = sorted(run_condition.run.unique())
valid_range = list(range(491, 514))
missing_runs = [candidate for candidate in valid_range if candidate not in extracted_runs]
print(f"Missing runs: {missing_runs}")
Missing runs: [508]
# Cast join keys to plain ints so tuple comparisons match exactly.
classification_data['cmo'] = classification_data['cmo'].astype(int)
run_condition['cmo'] = run_condition['cmo'].astype(int)
classification_data['run'] = classification_data['run'].astype(int)
run_condition['run'] = run_condition['run'].astype(int)
# Create lists of tuples from run_condition and classification_data
run_condition_list = list(run_condition[['run', 'cmo']].itertuples(index=False, name=None))
classification_data_list = list(classification_data[['run', 'cmo']].itertuples(index=False, name=None))
# Set membership makes this O(n + m) instead of O(n * m) over ~500k rows.
run_condition_pairs = set(run_condition_list)
inconsistencies = [item for item in classification_data_list if item not in run_condition_pairs]
if inconsistencies:
    print("Inconsistencies found:")
    for inc in inconsistencies:
        # BUG FIX: the tuples only hold (run, cmo); the original printed
        # inc[2]/inc[3], which would raise IndexError when ever reached.
        print(f"run: {inc[0]}, CMO: {inc[1]}")
else:
    print("No inconsistencies found. All triples are consistent between the dataframes.")
No inconsistencies found. All triples are consistent between the dataframes.
# Each (run, cmo) pair must identify exactly one sample in run_condition.
run_condition_duplicates = run_condition[run_condition[['run', 'cmo']].duplicated()]
if run_condition_duplicates.empty:
    print("\nAll pairs in run_condition are unique.")
else:
    print("\nNon-unique pairs found in run_condition:")
    print(run_condition_duplicates)
All pairs in run_condition are unique.
# Cross-check (run, cmo) coverage in both directions. Set lookups replace
# the original O(n * m) list membership tests over ~500k tuples.
classification_pairs = set(classification_data_list)
run_condition_pairs = set(run_condition_list)
# Pairs observed in cells but absent from the metadata.
missing_pairs = [item for item in classification_data_list if item not in run_condition_pairs]
if missing_pairs:
    print("\nPairs in classification_data not found in run_condition:")
    for pair in missing_pairs:
        print(f"Run: {pair[0]}, CMO: {pair[1]}")
else:
    print("\nAll pairs in classification_data are present in run_condition.")
# Pairs declared in the metadata but with no classified cells.
extra_pairs = [item for item in run_condition_list if item not in classification_pairs]
if extra_pairs:
    print("\nPairs in run_condition not found in classification_data:")
    for pair in extra_pairs:
        print(f"Run: {pair[0]}, CMO: {pair[1]}")
else:
    print("\nAll pairs in run_condition are present in classification_data.")
All pairs in classification_data are present in run_condition. Pairs in run_condition not found in classification_data: Run: 509, CMO: 305
extra_pairs
[(509, 305)]
print([item for item in run_condition_list if item not in classification_data_list])
[(509, 305)]
# Run 509 / CMO305 has no classified cells; drop it from the metadata.
print(run_condition.shape)
run_condition = run_condition[~((run_condition.run == 509) & (run_condition.cmo == 305))]
print(run_condition.shape)
(132, 5) (131, 5)
# Build a single join key by concatenating run and CMO digits
# (both are 3-digit here, so e.g. '510309' is unambiguous).
run_condition['run_cmo'] = run_condition['run'].astype(str) + run_condition['cmo'].astype(str)
classification_data['run_cmo'] = classification_data['run'].astype(str) + classification_data['cmo'].astype(str)
run_condition.head()
run | condition | line | replicate | cmo | run_cmo | |
---|---|---|---|---|---|---|
0 | 493 | ANDROGEN_AGONIST | CTL04E | 1 | 301 | 493301 |
1 | 493 | ANDROGEN_AGONIST | CTL04E | 2 | 302 | 493302 |
2 | 493 | ANDROGEN_AGONIST | CTL04E | 3 | 303 | 493303 |
3 | 493 | ANDROGEN_AGONIST | CTL08A | 1 | 304 | 493304 |
4 | 493 | ANDROGEN_AGONIST | CTL08A | 2 | 305 | 493305 |
classification_data.head()
unique_index | final_call | RescuedStatus | cmo | tags | run | run_cmo | |
---|---|---|---|---|---|---|---|
0 | AAACCCAAGCTCAGAG-1-510 | CMO309 | Not_Resc | 309 | AAACCCAAGCTCAGAG-1 | 510 | 510309 |
1 | AAACCCAAGGAGCTGT-1-510 | CMO310 | Not_Resc | 310 | AAACCCAAGGAGCTGT-1 | 510 | 510310 |
2 | AAACCCAAGGCACAAC-1-510 | CMO312 | Not_Resc | 312 | AAACCCAAGGCACAAC-1 | 510 | 510312 |
3 | AAACCCAAGGGTTGCA-1-510 | CMO309 | Not_Resc | 309 | AAACCCAAGGGTTGCA-1 | 510 | 510309 |
4 | AAACCCACAAGTGGAC-1-510 | CMO309 | Not_Resc | 309 | AAACCCACAAGTGGAC-1 | 510 | 510309 |
# Join per-cell classifications with per-sample metadata on run_cmo
# (left join keeps every classified cell, matched or not).
classification_data['run_cmo'] = classification_data['run_cmo'].astype(int)
run_condition['run_cmo'] = run_condition['run_cmo'].astype(int)
classification_data_combined = classification_data.merge(
run_condition[['run_cmo', 'condition', 'line', 'replicate']],
on='run_cmo',
how='left'
)
classification_data_combined.shape
(492061, 10)
classification_data_combined.head()
unique_index | final_call | RescuedStatus | cmo | tags | run | run_cmo | condition | line | replicate | |
---|---|---|---|---|---|---|---|---|---|---|
0 | AAACCCAAGCTCAGAG-1-510 | CMO309 | Not_Resc | 309 | AAACCCAAGCTCAGAG-1 | 510 | 510309 | BPF | CTL04E | 3 |
1 | AAACCCAAGGAGCTGT-1-510 | CMO310 | Not_Resc | 310 | AAACCCAAGGAGCTGT-1 | 510 | 510310 | BPF | CTL08A | 1 |
2 | AAACCCAAGGCACAAC-1-510 | CMO312 | Not_Resc | 312 | AAACCCAAGGCACAAC-1 | 510 | 510312 | BPF | CTL08A | 3 |
3 | AAACCCAAGGGTTGCA-1-510 | CMO309 | Not_Resc | 309 | AAACCCAAGGGTTGCA-1 | 510 | 510309 | BPF | CTL04E | 3 |
4 | AAACCCACAAGTGGAC-1-510 | CMO309 | Not_Resc | 309 | AAACCCACAAGTGGAC-1 | 510 | 510309 | BPF | CTL04E | 3 |
# A left merge leaves NaN in 'condition' wherever no (run, cmo) matched.
unmatched_rows = classification_data_combined[classification_data_combined['condition'].isna()]
if unmatched_rows.empty:
    print("All rows in classification_data_combined found a match in run_condition.")
else:
    print("Warning: Some rows in classification_data_combined didn't find a match in run_condition:")
    print(unmatched_rows[['run', 'cmo']])
All rows in classification_data_combined found a match in run_condition.
# Adopt the merged table as the canonical classification data.
classification_data = classification_data_combined
data_dir
'/group/testa/Project/EndPoints/2024_singleCell/nextflow/output/cellranger-multi/filtered_matrix'
all_data = []
# Find all run directories holding filtered count matrices (run-MTR_*).
run_dirs = glob.glob(os.path.join(data_dir, 'run-MTR_*'))
runs = [run_dir.split('/')[-1].split('_')[-1] for run_dir in run_dirs]
print(sorted(runs))
['491', '492', '493', '494', '495', '496', '497', '498', '499', '500', '501', '502', '503', '504', '505', '506', '507', '509', '510', '511', '512', '513']
all_data = process_runs(run_dirs)
Processed run: 501, Shape: (30472, 36601) Processed run: 506, Shape: (33800, 36601) Processed run: 505, Shape: (31503, 36601) Processed run: 510, Shape: (34286, 36601) Processed run: 512, Shape: (30454, 36601) Processed run: 498, Shape: (27504, 36601) Processed run: 499, Shape: (37685, 36601) Processed run: 497, Shape: (35626, 36601) Processed run: 509, Shape: (38982, 36601)Processed run: 503, Shape: (38015, 36601)Processed run: 513, Shape: (35133, 36601) Processed run: 511, Shape: (34897, 36601)Processed run: 491, Shape: (35618, 36601)Processed run: 495, Shape: (36867, 36601) Processed run: 493, Shape: (38899, 36601) Processed run: 502, Shape: (41429, 36601) Processed run: 500, Shape: (43971, 36601)Processed run: 507, Shape: (36806, 36601) Processed run: 492, Shape: (39679, 36601)Processed run: 494, Shape: (42089, 36601)Processed run: 504, Shape: (42043, 36601) Processed run: 496, Shape: (42509, 36601) Number of processed runs: 22
# Concatenate all runs into one AnnData (outer join keeps all genes).
combined_data = ad.concat(all_data, join='outer')
# Make variable and observation names unique in the combined dataset.
combined_data.var_names_make_unique()
combined_data.obs_names_make_unique()
combined_data.obs.head()
tags | run | |
---|---|---|
index | ||
AAACCCAAGAAACCCG-1-501 | AAACCCAAGAAACCCG-1 | 501 |
AAACCCAAGAACTGAT-1-501 | AAACCCAAGAACTGAT-1 | 501 |
AAACCCAAGCCTGTGC-1-501 | AAACCCAAGCCTGTGC-1 | 501 |
AAACCCAAGCGATGAC-1-501 | AAACCCAAGCGATGAC-1 | 501 |
AAACCCAAGCGCCGTT-1-501 | AAACCCAAGCGCCGTT-1 | 501 |
# Persist the raw combined object before any filtering.
combined_data.obs['run'] = combined_data.obs['run'].astype(int)
combined_data.write_h5ad(os.path.join(output_dir, 'combined_raw_scRNA_seq_data.h5ad'))
# combined_data = ad.read_h5ad(os.path.join(output_dir, 'combined_raw_scRNA_seq_data.h5ad'))
# identify and remove duplicated tags
duplicate_obs = combined_data.obs_names.duplicated(keep=False)
duplicate_count = duplicate_obs.sum()
print(f"Number of duplicate observations: {duplicate_count}")
Number of duplicate observations: 0
# Remove duplicate observations (left disabled: none were found above).
# combined_data = combined_data[~duplicate_obs].copy()
combined_data.obs.head()
tags | run | |
---|---|---|
index | ||
AAACCCAAGAAACCCG-1-501 | AAACCCAAGAAACCCG-1 | 501 |
AAACCCAAGAACTGAT-1-501 | AAACCCAAGAACTGAT-1 | 501 |
AAACCCAAGCCTGTGC-1-501 | AAACCCAAGCCTGTGC-1 | 501 |
AAACCCAAGCGATGAC-1-501 | AAACCCAAGCGATGAC-1 | 501 |
AAACCCAAGCGCCGTT-1-501 | AAACCCAAGCGCCGTT-1 | 501 |
classification_data.head()
unique_index | final_call | RescuedStatus | cmo | tags | run | run_cmo | condition | line | replicate | |
---|---|---|---|---|---|---|---|---|---|---|
0 | AAACCCAAGCTCAGAG-1-510 | CMO309 | Not_Resc | 309 | AAACCCAAGCTCAGAG-1 | 510 | 510309 | BPF | CTL04E | 3 |
1 | AAACCCAAGGAGCTGT-1-510 | CMO310 | Not_Resc | 310 | AAACCCAAGGAGCTGT-1 | 510 | 510310 | BPF | CTL08A | 1 |
2 | AAACCCAAGGCACAAC-1-510 | CMO312 | Not_Resc | 312 | AAACCCAAGGCACAAC-1 | 510 | 510312 | BPF | CTL08A | 3 |
3 | AAACCCAAGGGTTGCA-1-510 | CMO309 | Not_Resc | 309 | AAACCCAAGGGTTGCA-1 | 510 | 510309 | BPF | CTL04E | 3 |
4 | AAACCCACAAGTGGAC-1-510 | CMO309 | Not_Resc | 309 | AAACCCACAAGTGGAC-1 | 510 | 510309 | BPF | CTL04E | 3 |
# Index classifications by the same '<barcode>-<run>' key as the AnnData.
classification_data = classification_data.set_index('unique_index')
combined_data.obs_names[:5]
Index(['AAACCCAAGAAACCCG-1-501', 'AAACCCAAGAACTGAT-1-501', 'AAACCCAAGCCTGTGC-1-501', 'AAACCCAAGCGATGAC-1-501', 'AAACCCAAGCGCCGTT-1-501'], dtype='object', name='index')
# Both indexes share the '<barcode>-<run>' format, so label alignment works.
print(classification_data.index[:5])
print(combined_data.obs.index[:5])
Index(['AAACCCAAGCTCAGAG-1-510', 'AAACCCAAGGAGCTGT-1-510', 'AAACCCAAGGCACAAC-1-510', 'AAACCCAAGGGTTGCA-1-510', 'AAACCCACAAGTGGAC-1-510'], dtype='object', name='unique_index') Index(['AAACCCAAGAAACCCG-1-501', 'AAACCCAAGAACTGAT-1-501', 'AAACCCAAGCCTGTGC-1-501', 'AAACCCAAGCGATGAC-1-501', 'AAACCCAAGCGCCGTT-1-501'], dtype='object', name='index')
# Map classification metadata onto the combined AnnData by cell barcode.
# reindex aligns classification rows to combined_data.obs_names in one pass;
# barcodes absent from classification_data come back as NaN (those cells are
# reported and dropped in the cells below). This replaces the original
# intersection + per-column alignment loop with identical semantics.
columns_to_add = ['cmo', 'run', 'condition', 'line', 'replicate']
new_obs_data = classification_data.reindex(combined_data.obs_names)[columns_to_add]
for col in columns_to_add:
    combined_data.obs[col] = new_obs_data[col]
combined_data.obs.head()
tags | run | cmo | condition | line | replicate | |
---|---|---|---|---|---|---|
index | ||||||
AAACCCAAGAAACCCG-1-501 | AAACCCAAGAAACCCG-1 | 501.0 | 303.0 | THYROID_AGONIST | CTL04E | 3 |
AAACCCAAGAACTGAT-1-501 | AAACCCAAGAACTGAT-1 | NaN | NaN | NaN | NaN | NaN |
AAACCCAAGCCTGTGC-1-501 | AAACCCAAGCCTGTGC-1 | NaN | NaN | NaN | NaN | NaN |
AAACCCAAGCGATGAC-1-501 | AAACCCAAGCGATGAC-1 | NaN | NaN | NaN | NaN | NaN |
AAACCCAAGCGCCGTT-1-501 | AAACCCAAGCGCCGTT-1 | NaN | NaN | NaN | NaN | NaN |
# Report cells that failed to pick up classification metadata (any NaN
# in the freshly mapped columns means the barcode had no classification row).
unmatched_cells = new_obs_data[new_obs_data.isnull().any(axis=1)]
if unmatched_cells.empty:
    print("All cells in combined_data.obs found a match in processed_data_combined")
else:
    print(f"Warning: {len(unmatched_cells)} cells in combined_data.obs didn't find a match in processed_data_combined")
    print("First few unmatched cells:")
    print(unmatched_cells.head())
Warning: 316206 cells in combined_data.obs didn't find a match in processed_data_combined First few unmatched cells: cmo run condition line replicate index AAACCCAAGAACTGAT-1-501 NaN NaN NaN NaN NaN AAACCCAAGCCTGTGC-1-501 NaN NaN NaN NaN NaN AAACCCAAGCGATGAC-1-501 NaN NaN NaN NaN NaN AAACCCAAGCGCCGTT-1-501 NaN NaN NaN NaN NaN AAACCCAAGTACGAGC-1-501 NaN NaN NaN NaN NaN
# Drop cells that could not be assigned classification metadata.
print(f"Number of cells before removing unmapped and duplicates: {combined_data.n_obs}")
mapped_cells_mask = combined_data.obs[['cmo', 'run', 'condition', 'line', 'replicate']].notna().all(axis=1)
unmapped_cells = (~mapped_cells_mask).sum()
print(f"Number of unmapped cells to be removed: {unmapped_cells}")
# .copy() materialises the subset; without it combined_data stays an AnnData
# *view*, and the later obs assignments (e.g. condition_clean) would trigger
# anndata's ImplicitModificationWarning / implicit copy-on-write.
combined_data = combined_data[mapped_cells_mask].copy()
Number of cells before removing unmapped and duplicates: 808267 Number of unmapped cells to be removed: 316206
# Check if there are any remaining NaN values
# (should be impossible after the notna().all(axis=1) filter above).
if combined_data.obs[['cmo', 'run', 'condition', 'line', 'replicate']].isna().any().any():
    print("Warning: There are still some NaN values in the mapped columns.")
else:
    print("All remaining cells are fully mapped with no NaN values.")
# Check if there are any remaining duplicates
if combined_data.obs_names.duplicated().any():
    print("Warning: There are still some duplicate observation names.")
else:
    print("All observation names are now unique.")
All remaining cells are fully mapped with no NaN values. All observation names are now unique.
combined_data.shape
(492061, 36601)
# NOTE(review): only four colours are supplied; pandas recycles them across
# however many runs appear in the bar chart — confirm per-run colouring
# was not intended.
combined_data.obs['run'].value_counts().plot.bar(color=['#279e68', '#d62728', '#ff7f0e', '#1f77b4'])
<Axes: xlabel='run'>
# Run 508 is loaded directly from its per-sample filtered matrix and
# annotated by hand (presumably it has no multiseq classification output —
# TODO confirm).
dir_508 = "/group/testa/Project/EndPoints/2024_singleCell/nextflow/output/cellranger-multi/run-MTR_508/per_sample_outs/tmp-MTR_508/count"
adata = sc.read_10x_h5(os.path.join(dir_508, 'sample_filtered_feature_bc_matrix.h5'))
# NOTE(review): run is an int here but a string elsewhere; the types are
# harmonised later with astype(int) after the final concat.
adata.obs["run"] = 508
adata.obs["condition"] = "RET_AGONIST"
# Rebuild the index as '<barcode>-<run>' to match the combined naming scheme.
adata.obs = adata.obs.reset_index().rename(columns={'index': 'tags'})
adata.obs['index'] = adata.obs['tags'].astype(str) + '-' + adata.obs['run'].astype(str)
adata.obs.set_index('index', inplace=True)
adata.var_names_make_unique()
adata.obs_names_make_unique()
print(f"Existing combined data shape: {combined_data.shape}")
print(f"Additional data shape: {adata.shape}")
/home/michal.kubacki/miniconda3/envs/rapids-24.06/lib/python3.11/site-packages/anndata/_core/anndata.py:1820: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`. utils.warn_names_duplicates("var")
Existing combined data shape: (492061, 36601) Additional data shape: (38652, 36601)
/home/michal.kubacki/miniconda3/envs/rapids-24.06/lib/python3.11/site-packages/anndata/_core/anndata.py:1820: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`. utils.warn_names_duplicates("var")
# Append run 508 to the combined object. join='outer' keeps the union of
# genes; merge='first' keeps var metadata from the first object that has it.
try:
    updated_combined_data = ad.concat(
        [combined_data, adata],
        join='outer',
        merge='first'
    )
    print(f"Updated combined data shape: {updated_combined_data.shape}")
except Exception as e:
    print(f"Error during data combination: {str(e)}")
    # Re-raise: swallowing the error would only defer the failure to the
    # next cell as a confusing NameError on updated_combined_data.
    raise
Updated combined data shape: (530713, 36601)
combined_data = updated_combined_data
# Persist, then immediately re-load from disk so downstream cells operate on
# exactly what was serialised (surfaces any h5ad round-trip issues early).
combined_data.write_h5ad(os.path.join(output_dir, 'combined_scRNA_seq_data.h5ad'))
combined_data = ad.read_h5ad(os.path.join(output_dir, 'combined_scRNA_seq_data.h5ad'))
combined_data.obs.head()
tags | run | cmo | condition | line | replicate | |
---|---|---|---|---|---|---|
index | ||||||
AAACCCAAGAAACCCG-1-501 | AAACCCAAGAAACCCG-1 | 501.0 | 303.0 | THYROID_AGONIST | CTL04E | 3 |
AAACCCAAGGACGCAT-1-501 | AAACCCAAGGACGCAT-1 | 501.0 | 306.0 | THYROID_AGONIST | CTL08A | 3 |
AAACCCAAGTTGAAAC-1-501 | AAACCCAAGTTGAAAC-1 | 501.0 | 305.0 | THYROID_AGONIST | CTL08A | 2 |
AAACCCACAACGGGTA-1-501 | AAACCCACAACGGGTA-1 | 501.0 | 306.0 | THYROID_AGONIST | CTL08A | 3 |
AAACCCACAAGATCCT-1-501 | AAACCCACAAGATCCT-1 | 501.0 | 303.0 | THYROID_AGONIST | CTL04E | 3 |
import pandas as pd
import numpy as np

# Build a 'condition_clean' column in which NaN conditions (stringified to
# 'nan' by astype(str)) are mapped to an explicit 'Unknown' category, so no
# cell is silently dropped by downstream categorical grouping.
print("Current categories:", combined_data.obs['condition'].cat.categories)
print("Number of NaN values:", combined_data.obs['condition'].isna().sum())

current_categories = list(combined_data.obs['condition'].cat.categories)
if 'Unknown' not in current_categories:
    current_categories = current_categories + ['Unknown']

cleaned = combined_data.obs['condition'].astype(str).replace('nan', 'Unknown')
combined_data.obs['condition_clean'] = pd.Categorical(cleaned, categories=current_categories)

# List every condition present, alphabetically, as a quick visual check.
sorted_conditions = sorted(combined_data.obs['condition_clean'].astype(str).unique())
print("Conditions in alphabetical order:")
for condition in sorted_conditions:
    print(condition)
Current categories: Index(['3-PBA', 'ANDROGEN_AGONIST', 'ANDROGEN_ANTAGONIST', 'ARYL_HYD_AGONIST', 'ARYL_HYD_ANTAGONIST', 'BPA', 'BPF', 'DMSO', 'DPHP', 'ESTROGEN_AGONIST', 'ESTROGEN_ANTAGONIST', 'GLUCOCORT_AGONIST', 'GLUCOCORT_ANTAGONIST', 'LIVER-X_AGONIST', 'LIVER-X_ANTAGONIST', 'MBzP', 'MEP', 'MIX', 'RET_AGONIST', 'RET_ANTAGONIST', 'TCP', 'THYROID_AGONIST', 'THYROID_ANTAGONIST'], dtype='object') Number of NaN values: 0 Conditions in alphabetical order: 3-PBA ANDROGEN_AGONIST ANDROGEN_ANTAGONIST ARYL_HYD_AGONIST ARYL_HYD_ANTAGONIST BPA BPF DMSO DPHP ESTROGEN_AGONIST ESTROGEN_ANTAGONIST GLUCOCORT_AGONIST GLUCOCORT_ANTAGONIST LIVER-X_AGONIST LIVER-X_ANTAGONIST MBzP MEP MIX RET_AGONIST RET_ANTAGONIST TCP THYROID_AGONIST THYROID_ANTAGONIST
# Load the two non-multiplexed 3'GEX runs (393, 394) and tag them with a
# run id and a condition equal to the sample key.
data_dirs = {
    "MTR_393_3GEX" : "/group/testa/Project/EndPoints/2024_singleCell/nextflow_RITM0022624/outputs_count/MTR_393_3GEX/outs",
    "MTR_394_3GEX" : "/group/testa/Project/EndPoints/2024_singleCell/nextflow_RITM0022624/outputs_count/MTR_394_3GEX/outs"
}
all_data = []
# Loop variable renamed from `data_dir` so it no longer shadows (and
# clobbers) the module-level `data_dir` defined at the top of the file.
for key, matrix_dir in data_dirs.items():
    print(f"Loading {key}")
    file_path = os.path.join(matrix_dir, 'filtered_feature_bc_matrix.h5')
    adata = sc.read_10x_h5(file_path)
    adata.obs["condition"] = key
    adata.obs["run"] = key.split('_')[1]
    # Rebuild the index as '<barcode>-<run>' to match the combined scheme.
    adata.obs = adata.obs.reset_index()
    adata.obs = adata.obs.rename(columns={'index': 'tags'})
    adata.obs['index'] = adata.obs['tags'].astype(str) + '-' + adata.obs['run'].astype(str)
    adata.obs = adata.obs.set_index('index')
    adata.obs = adata.obs.rename_axis(None)
    adata.obs_names_make_unique()
    adata.var_names_make_unique()
    print(adata.obs.head())
    n_cells, n_genes = adata.shape
    print(f"Number of cells: {n_cells}")
    print(f"Number of genes: {n_genes}")
    all_data.append(adata)
Loading MTR_393_3GEX
/home/michal.kubacki/miniconda3/envs/rapids-24.06/lib/python3.11/site-packages/anndata/_core/anndata.py:1820: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`. utils.warn_names_duplicates("var") /home/michal.kubacki/miniconda3/envs/rapids-24.06/lib/python3.11/site-packages/anndata/_core/anndata.py:1820: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`. utils.warn_names_duplicates("var")
tags condition run AAACCCAAGACCAAAT-1-393 AAACCCAAGACCAAAT-1 MTR_393_3GEX 393 AAACCCAAGACCCTTA-1-393 AAACCCAAGACCCTTA-1 MTR_393_3GEX 393 AAACCCAAGCAAGGAA-1-393 AAACCCAAGCAAGGAA-1 MTR_393_3GEX 393 AAACCCAAGCACTCAT-1-393 AAACCCAAGCACTCAT-1 MTR_393_3GEX 393 AAACCCAAGCCTGCCA-1-393 AAACCCAAGCCTGCCA-1 MTR_393_3GEX 393 Number of cells: 37830 Number of genes: 36601 Loading MTR_394_3GEX
/home/michal.kubacki/miniconda3/envs/rapids-24.06/lib/python3.11/site-packages/anndata/_core/anndata.py:1820: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`. utils.warn_names_duplicates("var")
tags condition run AAACCCAAGACGGAAA-1-394 AAACCCAAGACGGAAA-1 MTR_394_3GEX 394 AAACCCAAGAGGGTGG-1-394 AAACCCAAGAGGGTGG-1 MTR_394_3GEX 394 AAACCCAAGATACCAA-1-394 AAACCCAAGATACCAA-1 MTR_394_3GEX 394 AAACCCAAGCCTGTCG-1-394 AAACCCAAGCCTGTCG-1 MTR_394_3GEX 394 AAACCCAAGCGCCCAT-1-394 AAACCCAAGCGCCCAT-1 MTR_394_3GEX 394 Number of cells: 53030 Number of genes: 36601
/home/michal.kubacki/miniconda3/envs/rapids-24.06/lib/python3.11/site-packages/anndata/_core/anndata.py:1820: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`. utils.warn_names_duplicates("var")
# Concatenate all data
# (outer join keeps the union of genes across the two 3'GEX runs).
to_add_combined = ad.concat(all_data, join='outer')
to_add_combined.obs.head()
tags | condition | run | |
---|---|---|---|
AAACCCAAGACCAAAT-1-393 | AAACCCAAGACCAAAT-1 | MTR_393_3GEX | 393 |
AAACCCAAGACCCTTA-1-393 | AAACCCAAGACCCTTA-1 | MTR_393_3GEX | 393 |
AAACCCAAGCAAGGAA-1-393 | AAACCCAAGCAAGGAA-1 | MTR_393_3GEX | 393 |
AAACCCAAGCACTCAT-1-393 | AAACCCAAGCACTCAT-1 | MTR_393_3GEX | 393 |
AAACCCAAGCCTGCCA-1-393 | AAACCCAAGCCTGCCA-1 | MTR_393_3GEX | 393 |
combined_data.obs.head()
tags | run | cmo | condition | line | replicate | condition_clean | |
---|---|---|---|---|---|---|---|
index | |||||||
AAACCCAAGAAACCCG-1-501 | AAACCCAAGAAACCCG-1 | 501.0 | 303.0 | THYROID_AGONIST | CTL04E | 3 | THYROID_AGONIST |
AAACCCAAGGACGCAT-1-501 | AAACCCAAGGACGCAT-1 | 501.0 | 306.0 | THYROID_AGONIST | CTL08A | 3 | THYROID_AGONIST |
AAACCCAAGTTGAAAC-1-501 | AAACCCAAGTTGAAAC-1 | 501.0 | 305.0 | THYROID_AGONIST | CTL08A | 2 | THYROID_AGONIST |
AAACCCACAACGGGTA-1-501 | AAACCCACAACGGGTA-1 | 501.0 | 306.0 | THYROID_AGONIST | CTL08A | 3 | THYROID_AGONIST |
AAACCCACAAGATCCT-1-501 | AAACCCACAAGATCCT-1 | 501.0 | 303.0 | THYROID_AGONIST | CTL04E | 3 | THYROID_AGONIST |
# Final merge: multiplexed runs + 3'GEX runs. merge='first' keeps the var
# annotations (gene_ids, feature_types, genome) from the first object that
# carries them.
combined = ad.concat([combined_data, to_add_combined], join='outer', merge='first')
combined
AnnData object with n_obs × n_vars = 621573 × 36601 obs: 'tags', 'run', 'cmo', 'condition', 'line', 'replicate', 'condition_clean' var: 'gene_ids', 'feature_types', 'genome'
# Normalise `run` to int (source runs arrive as str '393'/'394' and float
# 501.0 etc.). Item assignment rather than attribute assignment is the
# pandas-recommended form and avoids the new-column/attribute ambiguity.
combined.obs['run'] = combined.obs['run'].astype(int)
combined.obs.head()
tags | run | cmo | condition | line | replicate | condition_clean | |
---|---|---|---|---|---|---|---|
AAACCCAAGAAACCCG-1-501 | AAACCCAAGAAACCCG-1 | 501 | 303.0 | THYROID_AGONIST | CTL04E | 3 | THYROID_AGONIST |
AAACCCAAGGACGCAT-1-501 | AAACCCAAGGACGCAT-1 | 501 | 306.0 | THYROID_AGONIST | CTL08A | 3 | THYROID_AGONIST |
AAACCCAAGTTGAAAC-1-501 | AAACCCAAGTTGAAAC-1 | 501 | 305.0 | THYROID_AGONIST | CTL08A | 2 | THYROID_AGONIST |
AAACCCACAACGGGTA-1-501 | AAACCCACAACGGGTA-1 | 501 | 306.0 | THYROID_AGONIST | CTL08A | 3 | THYROID_AGONIST |
AAACCCACAAGATCCT-1-501 | AAACCCACAAGATCCT-1 | 501 | 303.0 | THYROID_AGONIST | CTL04E | 3 | THYROID_AGONIST |
combined.obs[combined.obs.run.isin([393, 394])].head()
tags | run | cmo | condition | line | replicate | condition_clean | |
---|---|---|---|---|---|---|---|
AAACCCAAGACCAAAT-1-393 | AAACCCAAGACCAAAT-1 | 393 | NaN | MTR_393_3GEX | NaN | NaN | NaN |
AAACCCAAGACCCTTA-1-393 | AAACCCAAGACCCTTA-1 | 393 | NaN | MTR_393_3GEX | NaN | NaN | NaN |
AAACCCAAGCAAGGAA-1-393 | AAACCCAAGCAAGGAA-1 | 393 | NaN | MTR_393_3GEX | NaN | NaN | NaN |
AAACCCAAGCACTCAT-1-393 | AAACCCAAGCACTCAT-1 | 393 | NaN | MTR_393_3GEX | NaN | NaN | NaN |
AAACCCAAGCCTGCCA-1-393 | AAACCCAAGCCTGCCA-1 | 393 | NaN | MTR_393_3GEX | NaN | NaN | NaN |
# Cells per run, ordered by count.
combined.obs['run'].value_counts().plot.bar(color=['#279e68', '#d62728', '#ff7f0e', '#1f77b4'])
<Axes: xlabel='run'>
# Re-plot with runs in ascending numeric order rather than by cell count.
sorted_runs = sorted(combined.obs['run'].unique())
combined.obs['run'].value_counts().reindex(sorted_runs).plot.bar(color=['#279e68', '#d62728', '#ff7f0e', '#1f77b4'])
<Axes: xlabel='run'>
# Persist the full dataset including the 39X (3'GEX) runs.
combined.write_h5ad(os.path.join(output_dir, 'combined_scRNA_seq_data_39X.h5ad'))
# Check if each run is assigned to only one condition
# observed=True matches the future pandas default and silences the
# FutureWarning emitted by grouping on a categorical column; it differs
# from the old default only when `condition` has unobserved categories,
# which would spuriously fail the ==1 check anyway.
run_to_condition = combined.obs.groupby('run', observed=True)['condition'].nunique()
single_condition_per_run = (run_to_condition == 1).all()
# Check if each condition is assigned to only one run
condition_to_run = combined.obs.groupby('condition', observed=True)['run'].nunique()
single_run_per_condition = (condition_to_run == 1).all()
print(f"Each run is assigned to only one condition: {single_condition_per_run}")
print(f"Each condition is assigned to only one run: {single_run_per_condition}")
Each run is assigned to only one condition: True Each condition is assigned to only one run: True
/localscratch/13399779.michal.kubacki/ipykernel_1007946/2822496074.py:6: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. condition_to_run = combined.obs.groupby('condition')['run'].nunique()
combined
AnnData object with n_obs × n_vars = 621573 × 36601 obs: 'tags', 'run', 'cmo', 'condition', 'line', 'replicate', 'condition_clean' var: 'gene_ids', 'feature_types', 'genome'
# for run_dir in run_dirs:
# try:
# print(f"Processing run: {run_dir}")
# adata = sc.read_h5ad(os.path.join(run_dir, 'matrix.h5ad'))
# adata.obs["run"] = run_dir.split('/')[-1].split('_')[-1]
# adata.obs = adata.obs.reset_index()
# adata.obs = adata.obs.rename(columns={'index': 'tags'})
# adata.obs['index'] = adata.obs['tags'].astype(str) + '-' + adata.obs['run'].astype(str)
# adata.obs = adata.obs.set_index('index')
# adata.var_names_make_unique()
# adata.obs_names_make_unique()
# print(f"Loaded AnnData object: {adata.shape}")
# all_data.append(adata)
# except Exception as e:
# print(f"Error processing {run_dir}: {str(e)}")
# print(f"Number of processed runs: {len(all_data)}")