Skip to content

Commit

Permalink
Paper release
Browse files Browse the repository at this point in the history
  • Loading branch information
ArthurDondi committed Sep 19, 2024
1 parent 395066f commit 26a078e
Show file tree
Hide file tree
Showing 47 changed files with 3,148 additions and 1,294 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@
logs/

#Large files
CTAT_LR_Fusion/ctat_lr_fusion.v0.10.0.simg
CTAT_LR_Fusion/ctat_lr_fusion.v0.10.0.simg
2 changes: 2 additions & 0 deletions BnpC/libs/format_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ def filter_input(bin,vaf,scDNA,ctypes,min_cells_per_mut,min_pos_cov,out_prefix):

# Filter all input files:
vaf = vaf.loc[idx,cols]
if not 'chr' in idx[0]:
scDNA['INDEX'] = [i.split('chr')[1] if '--' not in i else i for i in list(scDNA['INDEX'])]
scDNA = scDNA[scDNA['INDEX'].isin(idx)]
ctypes = ctypes[ctypes['Index'].isin(cols)]
bin = pd.concat([bin,fusions_save[cols]])
Expand Down
20 changes: 16 additions & 4 deletions BnpC/libs/plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,18 @@ def plot_raw_data(data_in, data_raw_in=pd.DataFrame(), out_file=None,
cmap = plt.get_cmap('Reds', 2)
cmap.set_over('green')
cmap.set_bad('grey')
# if not 'CNV_Colors' in ctypes.columns:
# ctypes['CNV_Colors']='black'
# if not 'Cell_Reanno_Colors' in ctypes.columns:
# ctypes['Cell_Reanno_Colors']='black'
# if not 'CNV_Colors' in ctypes.columns:

# if not 'Seurat_color' in ctypes.columns:
# col_colors=[ctypes['CNV_Colors'],ctypes['Cell_Reanno_Colors'],cluster_cols]
# else:
# col_colors=[ctypes['CNV_Colors'],ctypes['Seurat_color'],ctypes['Cell_Reanno_Colors'],cluster_cols]

col_colors=[ctypes['Cancer_Color'],cluster_cols]

cm = sns.clustermap(
data,
Expand All @@ -150,7 +162,7 @@ def plot_raw_data(data_in, data_raw_in=pd.DataFrame(), out_file=None,
vmin=0, vmax=1,
cmap=cmap, fmt='',
linewidths=0, linecolor='lightgray',
col_colors=[ctypes['Cancer_Color'],cluster_cols],
col_colors=col_colors,
row_colors= [scDNASupport_Tumor,scDNASupport_NonTumor],
colors_ratio=(0.3, 0.3), col_cluster=False,
row_cluster=False, figsize=(width, height)
Expand All @@ -172,14 +184,14 @@ def plot_raw_data(data_in, data_raw_in=pd.DataFrame(), out_file=None,

try:
cm.gs.set_width_ratios([0, 1])
cm.gs.set_height_ratios([0, 0.05, 0.95])
cm.gs.set_height_ratios([0, 0.15, 0.85])
except ValueError:
try:
cm.gs.set_width_ratios([0, 0.08, 1])
cm.gs.set_height_ratios([0, 0, 0.05, 0.95])
cm.gs.set_height_ratios([0, 0, 0.15, 0.85])
except ValueError:
cm.gs.set_width_ratios([0, 0.08, 1])
cm.gs.set_height_ratios([0, 0.05, 0.95])
cm.gs.set_height_ratios([0, 0.15, 0.85])
cm.gs.update(left=0.05, bottom=0.05, right=0.95, top=0.95)


Expand Down
1,076 changes: 1,076 additions & 0 deletions QC/chrM_conta_valid/MitoContaFigures.ipynb

Large diffs are not rendered by default.

110 changes: 110 additions & 0 deletions QC/chrM_conta_valid/chrM_conta_valid.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import pandas as pd

workdir: config['specific']['workdir']
SAMPLES = config['samples'].keys()
BIN = config['specific']['scripts']

def get_mem_mb(wildcards, threads):
return threads * 1024

def get_cellranger_rawfiles(wildcards):
return '{}/raw_feature_bc_matrix/barcodes.tsv.gz'.format(config['samples'][wildcards.sample]["cellranger_folder"])

def get_cellranger_filteredfiles(wildcards):
return '{}/filtered_feature_bc_matrix/barcodes.tsv'.format(config['samples'][wildcards.sample]["cellranger_folder"])

def sample2ids(wildcards):
return expand('input_flntc/{{sample}}_{id}.fltnc.bam',
id = config['samples'][wildcards.sample]['ids'])

rule all:
input:
expand('results/{sample}/fraction_alt_emptydrops.done',sample=SAMPLES)


rule identify_emptydroplets:
input:
raw = get_cellranger_rawfiles,
filtered = get_cellranger_filteredfiles
output:
emptydrops = "emptydroplets/barcodes/{sample}.tsv"
params:
bin_path = BIN
shell:
"python {params.bin_path}/empty_droplets_listing.py --raw {input.raw} "
"--filtered {input.filtered} --sample {wildcards.sample}"


rule emptydroplets_fastq:
input:
spl = sample2ids,
emptydrops = "emptydroplets/barcodes/{sample}.tsv"
output:
fastq = 'emptydroplets/{sample}.fastq.gz'
params:
bin_path = BIN
conda:
"pysam"
threads:
8
resources:
time = 1200,
mem_mb = 32000
shell:
"python {params.bin_path}/empty_droplets_fastq.py --sample {wildcards.sample} "
"--emptydroplets {input.emptydrops} --cpu {threads} "


rule mapping:
input:
fastq = 'emptydroplets/{sample}.fastq.gz'
output:
sam = "emptydroplets/{sample}.sam",
params:
hg38 = config['specific']['genome']
conda:
"isoseq"
threads:
32
resources:
mem_mb = get_mem_mb
shell:
"minimap2 -t 30 -ax splice -uf --secondary=no -C5 "
"{params.hg38} {input.fastq} > {output.sam}"

rule sam_to_sortedbam:
input:
sam = ancient("emptydroplets/{sample}.sam")
output:
bam = "emptydroplets/{sample}.bam",
bai = "emptydroplets/{sample}.bam.bai"
conda:
"samtools"
threads:
8
resources:
mem_mb = get_mem_mb
shell:
"samtools sort -@ {threads} {input.sam} -o {output.bam}##idx##{output.bai} --write-index"

rule fraction_alt_emptydrops:
input:
bam = "emptydroplets/{sample}.bam",
bai = "emptydroplets/{sample}.bam.bai",
vcf = "longsom_muts/{sample}.BnpC_input.vcf"
output:
touch('results/{sample}/fraction_alt_emptydrops.done')
conda:
"pysam"
threads:
8
resources:
time = 1200,
mem_mb = 32000
params:
bin_path = BIN
shell:
"python {params.bin_path}/fraction_alt_emptydrops.py --bam {input.bam} "
"--vcf {input.vcf} --sample {wildcards.sample} --cpu {threads}"


22 changes: 22 additions & 0 deletions QC/chrM_conta_valid/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
specific:
workdir: "/cluster/work/bewi/members/dondia/projects/long_reads_tree/chrM_conta_valid"
scripts: "/cluster/work/bewi/members/dondia/projects/long_reads_tree/bin/ctat_mut/chrM_conta_valid"
genome: "/cluster/work/bewi/members/dondia/projects/ovarian_cancer/reference/hg38.fa"

samples:
Patient1_Tum:
ids: [1,2,3,4]
cellranger_folder: "/cluster/work/bewi/members/dondia/projects/ovarian_cancer/10x_data/B486_Tumor_1000cells/cellranger_run/B486_Tumor_1000cells/outs"
Patient1_Om:
ids: [1,2]
cellranger_folder: "/cluster/work/bewi/members/dondia/projects/ovarian_cancer/10x_data/B486_Omentum_Distal_1000cells/cellranger_run/B486_Omentum_Distal_1000cells/outs"
Patient2_Tum:
ids: [1,2,3,4]
cellranger_folder: "/cluster/work/bewi/members/dondia/projects/ovarian_cancer/10x_data/B497_Tumor/analysis/cellranger_run/B497_Tumor/outs"
Patient3_Tum:
ids: [1,2,3,4]
cellranger_folder: "/cluster/work/bewi/members/dondia/projects/ovarian_cancer/10x_data/B500_Tumor/analysis/cellranger_run/B500_Tumor/outs"
Patient3_Om:
ids: [1,2]
cellranger_folder: "/cluster/work/bewi/members/dondia/projects/ovarian_cancer/10x_data/B500_Distal/analysis/cellranger_run/B500_Distal/outs"

100 changes: 100 additions & 0 deletions QC/chrM_conta_valid/empty_droplets_fastq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
#!/cluster/work/bewi/members/dondia/Anaconda3/envs/pysam/bin/python

import pandas as pd
import numpy as np
import pysam
from pathlib import Path
import glob
import argparse
import gzip
from collections import defaultdict

def reverse_complement(seq):
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
bases = list(seq)
letters = [complement[base] for base in bases]
letters = ''.join(letters)
reverse = letters[::-1]
return reverse

def get_emptydrops(emptydroplets):
emptydrops = pd.read_csv(emptydroplets, sep = '\t').barcodes
emptydrops = [reverse_complement(i) for i in emptydrops.values]
return set(emptydrops)

def bam_to_fastq(read):
name = read.query_name
seq = read.query_sequence
qual = read.qual
return "@{}\n{}\n+\n{}\n".format(name,seq,qual)

def read_fltnc(sample,emptydrops):

dic_bam_per_cell=defaultdict(lambda:[])
dic_UMI_per_cell=defaultdict(lambda:defaultdict(lambda:True))

bams = glob.glob('input_flntc/{}*.bam'.format(sample))

for bamfile in bams:
samfile = pysam.AlignmentFile(bamfile, "rb", check_sq=False, threads=args.cpu)
for READ in samfile:
try:
BC = READ.get_tag('XC')
UMI = READ.get_tag('XM')
except KeyError:
continue
if str(BC) in emptydrops:
READ.query_name = READ.query_name + '_' + BC
if dic_UMI_per_cell[BC][UMI]:
dic_bam_per_cell[BC].append(READ)
dic_UMI_per_cell[BC][UMI]=False

print("{} mean reads per empty droplet: {} reads".format(sample,
np.mean([len(i) for i in dic_bam_per_cell.values()])))
print("{} # dead cell: {}".format(sample, len(dic_bam_per_cell.values())))
print()

return dic_bam_per_cell


def main(args):

sample = args.sample

emptydrops = get_emptydrops(args.emptydroplets)

dic_bam_per_cell= read_fltnc(sample,emptydrops)

path = 'emptydroplets/{}.fastq.gz'.format(sample)
with gzip.open(path, 'wt') as fastq:
for BC in dic_bam_per_cell:
for read in dic_bam_per_cell[BC]:
fastq.write(bam_to_fastq(read))


def parse_args():
parser = argparse.ArgumentParser(
prog='empty_droplets_listing.py',
usage='python3 empty_droplets_listing.py --emptydroplets <emptydrops.tsv> --sample <sample> ',
description='Divides bamfiles per cell prior to UMI deduplication'
)
parser.add_argument(
'--emptydroplets', type=str,
help='path(s) to directory containing barcodes of cellranger empty droplets, tsv format'
)
parser.add_argument(
'--sample', type=str,
help='sample name (should not contain ".")'
)
parser.add_argument(
'--cpu', type=int, default=8,
help='# CPUs to use'
)


args = parser.parse_args()
return args

if __name__ == '__main__':
args = parse_args()
main(args)
43 changes: 43 additions & 0 deletions QC/chrM_conta_valid/empty_droplets_listing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import pandas as pd
import argparse

def main(args):
filtered = pd.read_csv(args.filtered, names=['barcodes'], sep='\t')
raw = pd.read_csv(args.raw, names=['barcodes'],compression='gzip', encoding = "ISO-8859-1", sep='\t')
filtered_set = set(filtered['barcodes'])
raw_list = list(raw['barcodes'])


emptydrops = {cell[:-2] for cell in raw_list if cell not in filtered_set}

csv = pd.DataFrame({'barcodes': list(emptydrops)})
csv.to_csv('emptydroplets/barcodes/{}.tsv'.format(args.sample), sep='\t', index=False)

def parse_args():
parser = argparse.ArgumentParser(
prog='split_cells_bam.py',
usage='python3 split_cells_bam.py --bc_dir <bc_dir> --sample <sample> ',
description='Divides bamfiles per cell prior to UMI deduplication'
)
parser.add_argument(
'--filtered', type=str,
help='Absolute or relative path(s) to directory containing barcodes files sample.whatever.txt, tsv format'
)
parser.add_argument(
'--raw', type=str,
help='sample name (should not contain ".")'
)
parser.add_argument(
'--sample', type=str,
help='sample name (should not contain ".")'
)

args = parser.parse_args()
return args

if __name__ == '__main__':
args = parse_args()
main(args)



Loading

0 comments on commit 26a078e

Please sign in to comment.