-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
395066f
commit 26a078e
Showing
47 changed files
with
3,148 additions
and
1,294 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,4 +7,4 @@ | |
logs/ | ||
|
||
#Large files | ||
CTAT_LR_Fusion/ctat_lr_fusion.v0.10.0.simg | ||
CTAT_LR_Fusion/ctat_lr_fusion.v0.10.0.simg |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
import pandas as pd | ||
|
||
# Run the workflow from the working directory named in the config.
workdir: config['specific']['workdir']
# Sample names are the keys of the `samples` section of the config.
SAMPLES = config['samples'].keys()
# Directory containing the helper scripts called from the rules' shell commands.
BIN = config['specific']['scripts']
|
||
def get_mem_mb(wildcards, threads):
    """Return a memory request in MB: 1024 MB for each requested thread.

    ``wildcards`` is accepted for the resource-callable signature but is
    not used.
    """
    mb_per_thread = 1024
    return threads * mb_per_thread
|
||
def get_cellranger_rawfiles(wildcards):
    """Return the path of the raw-matrix barcode file for ``wildcards.sample``.

    The path is built from the sample's ``cellranger_folder`` config entry.
    """
    sample_cfg = config['samples'][wildcards.sample]
    cellranger_dir = sample_cfg["cellranger_folder"]
    return '{}/raw_feature_bc_matrix/barcodes.tsv.gz'.format(cellranger_dir)
|
||
def get_cellranger_filteredfiles(wildcards):
    """Return the path of the filtered-matrix barcode file for ``wildcards.sample``.

    The path is built from the sample's ``cellranger_folder`` config entry.
    """
    sample_cfg = config['samples'][wildcards.sample]
    cellranger_dir = sample_cfg["cellranger_folder"]
    return '{}/filtered_feature_bc_matrix/barcodes.tsv'.format(cellranger_dir)
|
||
def sample2ids(wildcards):
    """Return the FLNC BAM paths of every run id configured for the sample.

    One path ``input_flntc/<sample>_<id>.fltnc.bam`` is produced per entry
    of the sample's ``ids`` list in the config.

    The sample name is substituted here rather than left as a ``{sample}``
    placeholder in the returned strings, so the result is a list of
    concrete paths and does not depend on the caller formatting wildcards
    into it afterwards.
    """
    sample = wildcards.sample
    return ['input_flntc/{}_{}.fltnc.bam'.format(sample, run_id)
            for run_id in config['samples'][sample]['ids']]
|
||
# Default target: request the completion flag of the last step for every
# sample listed in SAMPLES.
rule all:
    input:
        expand('results/{sample}/fraction_alt_emptydrops.done',sample=SAMPLES)
|
||
|
||
# List the barcodes of one sample from its cellranger raw and filtered
# barcode files, using the empty_droplets_listing.py helper script.
rule identify_emptydroplets:
    input:
        raw = get_cellranger_rawfiles,
        filtered = get_cellranger_filteredfiles
    output:
        emptydrops = "emptydroplets/barcodes/{sample}.tsv"
    params:
        bin_path = BIN
    shell:
        # The output path is not passed on the command line; the script is
        # expected to write it itself from the --sample value.
        "python {params.bin_path}/empty_droplets_listing.py --raw {input.raw} "
        "--filtered {input.filtered} --sample {wildcards.sample}"
|
||
|
||
# Produce one gzipped FASTQ per sample from its FLNC BAM files and its
# empty-droplet barcode list, using the empty_droplets_fastq.py helper script.
rule emptydroplets_fastq:
    input:
        # Declared so the BAM files are tracked as dependencies; they are not
        # passed on the command line below.
        spl = sample2ids,
        emptydrops = "emptydroplets/barcodes/{sample}.tsv"
    output:
        # Not passed to the script either; it is expected to derive this path
        # from the --sample value.
        fastq = 'emptydroplets/{sample}.fastq.gz'
    params:
        bin_path = BIN
    conda:
        "pysam"
    threads:
        8
    resources:
        time = 1200,
        mem_mb = 32000
    shell:
        "python {params.bin_path}/empty_droplets_fastq.py --sample {wildcards.sample} "
        "--emptydroplets {input.emptydrops} --cpu {threads} "
|
||
|
||
# Align the empty-droplet FASTQ of one sample against the configured genome
# with minimap2 and write the alignments as SAM.
rule mapping:
    input:
        fastq = 'emptydroplets/{sample}.fastq.gz'
    output:
        sam = "emptydroplets/{sample}.sam",
    params:
        hg38 = config['specific']['genome']
    conda:
        "isoseq"
    threads:
        32
    resources:
        # 1024 MB per thread, see get_mem_mb.
        mem_mb = get_mem_mb
    shell:
        # Use the rule's thread count rather than a hard-coded value so that
        # minimap2 follows the number of threads actually granted to the job.
        "minimap2 -t {threads} -ax splice -uf --secondary=no -C5 "
        "{params.hg38} {input.fastq} > {output.sam}"
|
||
# Sort the SAM alignments of one sample into a BAM file and write its index
# in the same samtools call.
rule sam_to_sortedbam:
    input:
        sam = ancient("emptydroplets/{sample}.sam")
    output:
        bam = "emptydroplets/{sample}.bam",
        bai = "emptydroplets/{sample}.bam.bai"
    conda:
        "samtools"
    threads:
        8
    resources:
        # 1024 MB per thread, see get_mem_mb.
        mem_mb = get_mem_mb
    shell:
        # NOTE(review): "<bam>##idx##<index>" together with --write-index is
        # presumably samtools' way of naming the index file explicitly;
        # confirm against the samtools manual.
        "samtools sort -@ {threads} {input.sam} -o {output.bam}##idx##{output.bai} --write-index"
|
||
# Run the fraction_alt_emptydrops.py helper script on the sorted BAM and the
# VCF of one sample, then create a flag file marking the step as done.
rule fraction_alt_emptydrops:
    input:
        bam = "emptydroplets/{sample}.bam",
        # The index is required as an input but is not passed on the command line.
        bai = "emptydroplets/{sample}.bam.bai",
        vcf = "longsom_muts/{sample}.BnpC_input.vcf"
    output:
        # Flag file created by Snakemake's touch() rather than by the script.
        touch('results/{sample}/fraction_alt_emptydrops.done')
    conda:
        "pysam"
    threads:
        8
    resources:
        time = 1200,
        mem_mb = 32000
    params:
        bin_path = BIN
    shell:
        "python {params.bin_path}/fraction_alt_emptydrops.py --bam {input.bam} "
        "--vcf {input.vcf} --sample {wildcards.sample} --cpu {threads}"
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# Workflow-wide settings.
specific:
    # Directory the workflow runs in.
    workdir: "/cluster/work/bewi/members/dondia/projects/long_reads_tree/chrM_conta_valid"
    # Directory containing the helper Python scripts.
    scripts: "/cluster/work/bewi/members/dondia/projects/long_reads_tree/bin/ctat_mut/chrM_conta_valid"
    # Reference genome FASTA used for the alignment step.
    genome: "/cluster/work/bewi/members/dondia/projects/ovarian_cancer/reference/hg38.fa"

# One entry per sample.
#   ids:               suffixes of the per-sample input BAM files
#                      (input_flntc/<sample>_<id>.fltnc.bam)
#   cellranger_folder: directory containing the raw_feature_bc_matrix and
#                      filtered_feature_bc_matrix folders
samples:
    Patient1_Tum:
        ids: [1,2,3,4]
        cellranger_folder: "/cluster/work/bewi/members/dondia/projects/ovarian_cancer/10x_data/B486_Tumor_1000cells/cellranger_run/B486_Tumor_1000cells/outs"
    Patient1_Om:
        ids: [1,2]
        cellranger_folder: "/cluster/work/bewi/members/dondia/projects/ovarian_cancer/10x_data/B486_Omentum_Distal_1000cells/cellranger_run/B486_Omentum_Distal_1000cells/outs"
    Patient2_Tum:
        ids: [1,2,3,4]
        cellranger_folder: "/cluster/work/bewi/members/dondia/projects/ovarian_cancer/10x_data/B497_Tumor/analysis/cellranger_run/B497_Tumor/outs"
    Patient3_Tum:
        ids: [1,2,3,4]
        cellranger_folder: "/cluster/work/bewi/members/dondia/projects/ovarian_cancer/10x_data/B500_Tumor/analysis/cellranger_run/B500_Tumor/outs"
    Patient3_Om:
        ids: [1,2]
        cellranger_folder: "/cluster/work/bewi/members/dondia/projects/ovarian_cancer/10x_data/B500_Distal/analysis/cellranger_run/B500_Distal/outs"
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
#!/cluster/work/bewi/members/dondia/Anaconda3/envs/pysam/bin/python | ||
|
||
import pandas as pd | ||
import numpy as np | ||
import pysam | ||
from pathlib import Path | ||
import glob | ||
import argparse | ||
import gzip | ||
from collections import defaultdict | ||
|
||
def reverse_complement(seq):
    """Return the reverse complement of an upper-case A/C/G/T sequence.

    Raises:
        KeyError: if ``seq`` contains any character other than A, C, G or T.
    """
    pairing = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    # Complement every base first (so an invalid base raises immediately),
    # then flip the order.
    complemented = [pairing[nucleotide] for nucleotide in seq]
    complemented.reverse()
    return ''.join(complemented)
|
||
def get_emptydrops(emptydroplets):
    """Load the barcode list and return the reverse-complemented barcodes as a set.

    Args:
        emptydroplets: path to a tab-separated file with a ``barcodes`` column.
    """
    table = pd.read_csv(emptydroplets, sep = '\t')
    return {reverse_complement(barcode) for barcode in table.barcodes.values}
|
||
def bam_to_fastq(read):
    """Format one BAM record as a four-line FASTQ entry.

    Args:
        read: object exposing ``query_name``, ``query_sequence`` and ``qual``
            (e.g. a pysam aligned segment).

    Returns:
        The record as ``"@name\\nsequence\\n+\\nqualities\\n"``.

    A record without stored base qualities has ``qual`` set to ``None``.
    Formatting that directly would write the literal text "None" as the
    quality line and corrupt the FASTQ, so a placeholder quality string of
    the same length as the sequence is written instead.
    """
    seq = read.query_sequence or ''
    qual = read.qual
    if qual is None:
        # '!' is the lowest printable FASTQ quality character.
        qual = '!' * len(seq)
    return "@{}\n{}\n+\n{}\n".format(read.query_name, seq, qual)
|
||
def read_fltnc(sample, emptydrops, cpu=1):
    """Collect the UMI-deduplicated reads of empty droplets for one sample.

    Every file matching ``input_flntc/<sample>*.bam`` is scanned. A read is
    kept when its ``XC`` tag (barcode) is in ``emptydrops`` and its ``XM``
    tag (UMI) has not been seen before for that barcode. Reads missing
    either tag are skipped. Matching reads get ``_<barcode>`` appended to
    their name.

    Args:
        sample: sample name used to locate the input BAM files.
        emptydrops: container of barcodes to keep (membership is tested
            with ``in``).
        cpu: number of threads handed to pysam for reading the BAM files.

    Returns:
        A ``defaultdict`` mapping each barcode to the list of reads kept
        for it.
    """
    reads_per_cell = defaultdict(list)
    umis_per_cell = defaultdict(set)

    # Sorted so that the read retained for a duplicated UMI does not depend
    # on the arbitrary order in which glob returns the files.
    bams = sorted(glob.glob('input_flntc/{}*.bam'.format(sample)))

    for bamfile in bams:
        # The context manager closes the file handle once the file is read.
        with pysam.AlignmentFile(bamfile, "rb", check_sq=False, threads=cpu) as samfile:
            for read in samfile:
                try:
                    barcode = read.get_tag('XC')
                    umi = read.get_tag('XM')
                except KeyError:
                    continue
                if str(barcode) not in emptydrops:
                    continue
                read.query_name = read.query_name + '_' + barcode
                # Keep only the first read seen for each (barcode, UMI) pair.
                if umi not in umis_per_cell[barcode]:
                    umis_per_cell[barcode].add(umi)
                    reads_per_cell[barcode].append(read)

    reads_per_barcode = [len(reads) for reads in reads_per_cell.values()]
    # Without any matching barcode the mean is undefined; report nan without
    # triggering numpy's empty-slice warning.
    mean_reads = np.mean(reads_per_barcode) if reads_per_barcode else float('nan')
    print("{} mean reads per empty droplet: {} reads".format(sample, mean_reads))
    print("{} # dead cell: {}".format(sample, len(reads_per_barcode)))
    print()

    return reads_per_cell


def main(args):
    """Write the deduplicated empty-droplet reads of a sample as gzipped FASTQ.

    Args:
        args: parsed command line providing ``sample``, ``emptydroplets``
            and ``cpu``.

    The output is written to ``emptydroplets/<sample>.fastq.gz``.
    """
    sample = args.sample

    emptydrops = get_emptydrops(args.emptydroplets)

    reads_per_cell = read_fltnc(sample, emptydrops, cpu=args.cpu)

    path = 'emptydroplets/{}.fastq.gz'.format(sample)
    with gzip.open(path, 'wt') as fastq:
        for reads in reads_per_cell.values():
            fastq.writelines(bam_to_fastq(read) for read in reads)
|
||
|
||
def parse_args():
    """Parse the command line of empty_droplets_fastq.py.

    Returns:
        The parsed namespace with ``emptydroplets`` (str), ``sample`` (str)
        and ``cpu`` (int, default 8).

    ``--emptydroplets`` and ``--sample`` are required, so a missing option
    is reported by argparse instead of surfacing later as a ``None`` value.
    """
    parser = argparse.ArgumentParser(
        prog='empty_droplets_fastq.py',
        usage='python3 empty_droplets_fastq.py --emptydroplets <emptydrops.tsv> --sample <sample> [--cpu <n>]',
        description='Writes the UMI-deduplicated reads of empty droplets to a gzipped fastq file'
    )
    parser.add_argument(
        '--emptydroplets', type=str, required=True,
        help='path to the tsv file listing the barcodes of cellranger empty droplets'
    )
    parser.add_argument(
        '--sample', type=str, required=True,
        help='sample name (should not contain ".")'
    )
    parser.add_argument(
        '--cpu', type=int, default=8,
        help='# CPUs to use'
    )

    args = parser.parse_args()
    return args
|
||
# Script entry point: parse the command line, then run main().
# NOTE: `args` is bound at module level here, so it is visible as a global
# to the functions of this file when run as a script.
if __name__ == '__main__':
    args = parse_args()
    main(args)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
import pandas as pd | ||
import argparse | ||
|
||
def main(args):
    """Write the barcodes found in the raw list but not in the filtered list.

    Args:
        args: parsed command line providing ``filtered`` (path to the
            filtered barcode file), ``raw`` (path to the gzipped raw
            barcode file) and ``sample``.

    The barcodes, with their trailing ``-<suffix>`` removed, are written as
    a one-column tsv (header ``barcodes``) to
    ``emptydroplets/barcodes/<sample>.tsv``.
    """
    import os

    filtered = pd.read_csv(args.filtered, names=['barcodes'], sep='\t')
    raw = pd.read_csv(args.raw, names=['barcodes'], compression='gzip', encoding="ISO-8859-1", sep='\t')
    filtered_set = set(filtered['barcodes'])

    # Strip the "-<suffix>" part by splitting on the last dash rather than
    # dropping a fixed two characters, so suffixes of any length (or an
    # absent suffix) are handled.
    emptydrops = {
        cell.rsplit('-', 1)[0]
        for cell in raw['barcodes']
        if cell not in filtered_set
    }

    out_path = 'emptydroplets/barcodes/{}.tsv'.format(args.sample)
    # Make the script usable on its own, without relying on the caller to
    # have created the output directory.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    # Sorted so that the file content is identical from run to run.
    csv = pd.DataFrame({'barcodes': sorted(emptydrops)})
    csv.to_csv(out_path, sep='\t', index=False)
|
||
def parse_args():
    """Parse the command line of empty_droplets_listing.py.

    Returns:
        The parsed namespace with ``filtered``, ``raw`` and ``sample``
        (all str).

    All three options are required, so a missing one is reported by
    argparse instead of surfacing later as a ``None`` path.
    """
    parser = argparse.ArgumentParser(
        prog='empty_droplets_listing.py',
        usage='python3 empty_droplets_listing.py --raw <raw_barcodes.tsv.gz> --filtered <filtered_barcodes.tsv> --sample <sample>',
        description='Lists the barcodes present in the raw barcode file but absent from the filtered one'
    )
    parser.add_argument(
        '--filtered', type=str, required=True,
        help='Absolute or relative path to the filtered barcodes file, tsv format'
    )
    parser.add_argument(
        '--raw', type=str, required=True,
        help='Absolute or relative path to the gzipped raw barcodes file, tsv format'
    )
    parser.add_argument(
        '--sample', type=str, required=True,
        help='sample name (should not contain ".")'
    )

    args = parser.parse_args()
    return args
|
||
# Script entry point: hand the parsed command line straight to main().
if __name__ == '__main__':
    main(parse_args())
|
||
|
||
|
Oops, something went wrong.