diff --git a/.gitignore b/.gitignore index 9c5db4e..f13532b 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,9 @@ #snakemake logs folders .snakemake/ logs/ +benchmarks/ #Large files -CTAT_LR_Fusion/ctat_lr_fusion.v0.10.0.simg +ctat_lr_fusion.v0.13.0.simg + +*test* \ No newline at end of file diff --git a/config/config.yaml b/config/config.yaml index 0d574d0..c4a9a01 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,134 +1,123 @@ # LongSom config yaml file +# Change those values to yours +User: + input_dir: /path/to/input_dir + output_dir: /path/to/output_dir + sample_map: /path/to/sample_map.tsv + cancer_cell_type: HGSOC + +# Change if you use a custom reference +Reference: + genome: /../ref/GRCh38_gencode_v44_CTAT_lib_Oct292023.plug-n-play/ctat_genome_lib_build_dir/ref_genome.fa + isoforms: /../ref/GRCh38_gencode_v44_CTAT_lib_Oct292023.plug-n-play/ctat_genome_lib_build_dir/ref_annot.gtf + gnomAD_db: /../ref/gnomAD_v4.1 + RNA_editing: /../ref/AllEditingSites.hg38.txt.gz + PoN_SR: /../ref/PoN.scRNAseq.hg38.tsv.gz + +# Change any of those to True if you want to run it, else False Run: + # Create a PoN based on Normals + PoN: False # Run cell type reannotation - reanno: True + CellTypeReannotation: True # Run SNV calling with SComatic - scomatic: True - # Run SNV annotation with annovar - annovar: False + SNVCalling: True # Run cell clustering with BnpC - bnpc: True + CellClustering: True # Run fusion calling with CTAT-LR-Fusion - ctatfusion: True + FusionCalling: True # Run CNV calling with InferCNV - infercnv: False - # Generate various plots - plot: False - # Run scDNA validation (DEV only) - scdna: False - -Global: - outdir: /cluster/work/bewi/members/dondia/projects/long_reads_tree/LongSom_out/OvCa_LR - data: /cluster/work/bewi/members/dondia/projects/long_reads_tree/data/SComatic_input/OvCa_LR - qc: /cluster/work/bewi/members/dondia/projects/long_reads_tree/bin/LongSom/QC/scripts - scomatic: /cluster/work/bewi/members/dondia/projects/long_reads_tree/bin/LongSom/SComatic/scripts - bnpc: /cluster/work/bewi/members/dondia/projects/long_reads_tree/bin/LongSom/BnpC - annovar: /cluster/work/bewi/members/dondia/projects/long_reads_tree/bin/annovar - subread: /cluster/work/bewi/members/dondia/projects/long_reads_tree/bin/subread-2.0.6-source - infercnv: /cluster/work/bewi/members/dondia/projects/long_reads_tree/bin/LongSom/InferCNV - genome: /cluster/work/bewi/members/dondia/projects/ovarian_cancer/reference/hg38.fa - isoforms: /cluster/work/bewi/members/dondia/projects/ovarian_cancer/reference/gencode.v36.annotation.gtf - ids: - B486: "" - B497: "" - B500: "" + CNACalling: False PoN: - celltypes: - Normal: "" - ids: - B486_Norm: "" - B500_Norm: "" min_ac_cells: 1 min_ac_reads: 1 min_cells: 1 min_cell_types: 1 -CellTypeReannotation: - cancer_ctype: HGSOC - noncancer_ctype: Non-HGSOC - ctypes: ['HGSOC', 'Non-HGSOC'] - celltypes: - HGSOC: "" - Non-HGSOC: "" - ctype_column: Automated_Cell_type - chrM_contaminant: 'False' +### Cell Type Reannotation +Reanno: + + BaseCellCounter: + min_mapping_quality: 60 + chromosomes: all + + BaseCellCalling: + Min_cell_types: 2 + min_distance: 0 + max_gnomAD_VAF: 0.01 + min_ac_cells: 5 + min_ac_reads: 20 + alpha1: 0.21356677091082193 + beta1: 104.95163748636298 + alpha2: 0.2474528917555431 + beta2: 162.03696139428595 + HCCV: - min_depth: 20 - deltaVAF: 0.25 - deltaCCF: 0.4 + min_depth: 50 + deltaVAF: 0.2 + deltaMCF: 0.25 + clust_dist: 10000 + chrM_contaminant: 'False' + alt_flag: All + pvalue: 0.01 + Reannotation: min_variants: 3 min_fraction: 0.25 +### SNV Calling SNVCalling: - cancer_ctype: Cancer - celltypes: - Cancer: "" - NonCancer: "" - min_ac_reads: 3 - min_ac_cells: 2 - clust_dist: 10000 - clustermap: - height: 10 - width: 15 - -scDNA: - Tumor_clone: 'Clone_Tum' - NonTumor_clone: 'Clone_NonTum' - BaseCellCounter: - min_mapping_quality: 30 - min_dp: 5 - min_cc: 1 - BaseCellCalling: - min_ac_cells: 2 - min_ac_reads: 3 - -scDNAValidation: - min_depth: 17 - -SComatic: - chrM_contaminant: 'True' + BaseCellCounter: min_mapping_quality: 60 - chromosomes: all + chromosomes: all BaseCellCalling: Min_cell_types: 2 - RNA_editing: /cluster/work/bewi/members/dondia/projects/long_reads_tree/bin/SComatic/RNAediting/AllEditingSites.hg38.txt min_distance: 0 - PoN_SR: /cluster/work/bewi/members/dondia/projects/long_reads_tree/bin/SComatic/PoNs/PoN.scRNAseq.hg38.tsv - gnomAD_db: /cluster/work/bewi/members/dondia/projects/long_reads_tree/ref/gnomAD_v4.1 max_gnomAD_VAF: 0.01 deltaVAF: 0.1 - deltaCCF: 0.3 - min_ac_cells: 1 - min_ac_reads: 1 + deltaMCF: 0.3 + min_ac_reads: 3 + min_ac_cells: 2 + clust_dist: 10000 + chrM_contaminant: 'True' + alpha1: 0.21356677091082193 + beta1: 104.95163748636298 + alpha2: 0.2474528917555431 + beta2: 162.03696139428595 + +### Fusion Calling +FusionCalling: + SomaticFusions: + min_ac_reads: 3 + min_ac_cells: 2 + max_MCF_noncancer: 0.1 + deltaMCF: 0.3 + +### Cell Clustering +CellClust: SingleCellGenotype: alt_flag: All pvalue: 0.01 -BnpC: - outdir: '3kSteps.0.001-5dpa' - min_cells_per_mut: 5 - min_pos_cov: 3 - mcmc_steps: 1000 - cup: 0 - eup: 0 - FP: -1 - FN: -1 - estimator: 'posterior' - pp: [1,1] - dpa: - B486: [0.001,5] - B497: [1,1] - B500: [1,1] - mut_order: - B486: '' #/cluster/work/bewi/members/dondia/projects/long_reads_tree/LongSom_out/OvCa_LR/BnpC/BnpC_input/B486.order.tsv' - B497: '' - B500: '' - + FormatInput: + min_cells_per_mut: 5 + min_pos_cov: 3 + + BnpC: + mcmc_steps: 1000 + cup: 0 + eup: 0 + FP: -1 + FN: -1 + estimator: 'posterior' + pp: [1,1] + dpa: [1,1] + + ### CNA Calling inferCNV: - ordering: /cluster/work/bewi/members/dondia/projects/long_reads_tree/bin/LongSom/InferCNV/gene_ordering_file.txt + ordering: ../scripts/CNACalling/gene_ordering_file.txt diff --git a/profile/config.yaml b/profile/config.yaml index 488b189..1924afa 100644 --- a/profile/config.yaml +++ b/profile/config.yaml @@ -1,19 +1,11 @@ executor: slurm -latency-wait: 60 +latency-wait: 30 jobs: 500 -cores: 100 +cores: 500 default-resources: - cpus_per_task: 8 - mem_mb_per_cpu: 1024 + mem_mb_per_cpu: max(2*input.size_mb, 4096) + runtime: 240 slurm_account: "'es_beere'" - tmpdir: "'/scratch'" - time: 1200 - runtime: 1200 - -set-threads: - BaseCellCounter_scDNACalling: 64 -set-resources: - BaseCellCounter_scDNACalling: - cpus_per_task: 64 \ No newline at end of file + #tmpdir: /cluster/scratch/ diff --git a/ref/.gitignore b/ref/.gitignore new file mode 100644 index 0000000..4a23999 --- /dev/null +++ b/ref/.gitignore @@ -0,0 +1,2 @@ +GRCh38_gencode_v44_CTAT_lib_Oct292023.plug-n-play +gnomAD_v4.1 \ No newline at end of file diff --git a/tools/SComatic/RNAediting/AllEditingSites.hg38.txt.gz b/ref/AllEditingSites.hg38.txt.gz similarity index 100% rename from tools/SComatic/RNAediting/AllEditingSites.hg38.txt.gz rename to ref/AllEditingSites.hg38.txt.gz diff --git a/tools/SComatic/PoNs/PoN.scRNAseq.hg38.tsv.gz b/ref/PoN.scRNAseq.hg38.tsv.gz similarity index 100% rename from tools/SComatic/PoNs/PoN.scRNAseq.hg38.tsv.gz rename to ref/PoN.scRNAseq.hg38.tsv.gz diff --git a/run_LongSom.sh b/run_LongSom.sh new file mode 100755 index 0000000..23da61a --- /dev/null +++ b/run_LongSom.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +OUTPUT_DIR=/path/to/output_dir +REF_DIR=path/to/LongSom/ref/GRCh38_gencode_v44_CTAT_lib_Oct292023.plug-n-play/ctat_genome_lib_build_dir + +snakemake \ + -s workflow/LongSom.smk \ + --configfile config/config.yaml \ + --use-conda \ + --use-singularity \ + --singularity-args "-B ${OUTPUT_DIR}:/output -B /tmp:/tmp -B ${REF_DIR}:/ref" \ + --show-failed-logs \ + --rerun-incomplete \ + -p \ + + diff --git a/run_LongSom_slurm.sh b/run_LongSom_slurm.sh new file mode 100755 index 0000000..0a49dfd --- /dev/null +++ b/run_LongSom_slurm.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +OUTPUT_DIR=/path/to/output_dir +REF_DIR=REF_DIR=path/to/LongSom/ref/GRCh38_gencode_v44_CTAT_lib_Oct292023.plug-n-play/ctat_genome_lib_build_dir + +mkdir -p logs +mkdir -p ${OUTPUT_DIR} +sbatch \ + --time=20:00:00 \ + -o logs/snakelog.out \ + -e logs/snakelog.err \ +snakemake \ + -s workflow/Snakefile \ + --configfile workflow/test.yaml \ + --profile profile/ \ + --use-conda \ + --use-singularity \ + --singularity-args "-B ${OUTPUT_DIR}:/output -B /tmp:/tmp -B ${REF_DIR}:/ref" \ + --show-failed-logs \ + --rerun-incomplete \ + -p \ + "$@" + diff --git a/run_SComatic.sh b/run_SComatic.sh deleted file mode 100755 index d6dca34..0000000 --- a/run_SComatic.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/env bash -mkdir -p logs -sbatch \ - --mem-per-cpu=2000 \ - --time=20:00:00 \ - -o logs/snakelog.$(date +%Y-%m-%d.%H-%M-%S).out \ - -e logs/snakelog.$(date +%Y-%m-%d.%H-%M-%S).err \ -snakemake \ - -s snakefiles/LongSom.smk \ - --configfile config/config_OvCa_LR.yml \ - --profile profile_simple/ \ - --sdm conda \ - --latency-wait 30 \ - --show-failed-logs \ - --rerun-incomplete \ - -p \ - --rerun-triggers mtime \ - "$@" diff --git a/snakefiles/BnpC.smk b/snakefiles/BnpC.smk deleted file mode 100644 index 5541bd9..0000000 --- a/snakefiles/BnpC.smk +++ /dev/null @@ -1,80 +0,0 @@ -import pandas as pd -CTYPES = config['SNVCalling']['celltypes'] -OUTDIR=config['Global']['outdir'] -OUTBNPC=config['BnpC']['outdir'] -DATA=config['Global']['data'] -IDS=config['Global']['ids'] -BNPC_PATH=config['Global']['bnpc'] -SCDNA=config['Run']['scdna'] -#include: 'SNVCalling.smk' - -rule all_BnpC: - input: - expand(f"{OUTDIR}/BnpC/{OUTBNPC}/{{id}}/genoCluster_posterior_mean_raw.pdf", - id=IDS), - expand(f"{OUTDIR}/SNVCalling/ClusterMap/{{id}}.ClusterMap.Reannotation.pdf", - id=IDS), - expand(f"{OUTDIR}/SNVCalling/Annotations/{{id}}.hg38_multianno.txt", - id=IDS), - -rule FormatInputBnpC: - input: - bin=f"{OUTDIR}/SNVCalling/SingleCellGenotype/{{id}}.BinaryMatrix.tsv", - vaf=f"{OUTDIR}/SNVCalling/SingleCellGenotype/{{id}}.VAFMatrix.tsv", - scDNA=f"{OUTDIR}/scDNAValidation/CloneGenotype/LongSom/{{id}}.CloneGenotype.tsv", #if SCDNA else [], - ctypes=f"{OUTDIR}/CellTypeReannotation/ReannotatedCellTypes/{{id}}.tsv", - output: - bin=f"{OUTDIR}/BnpC/BnpC_input/{{id}}.BinaryMatrix.tsv", - vaf=f"{OUTDIR}/BnpC/BnpC_input/{{id}}.VAFMatrix.tsv", - scDNA=f"{OUTDIR}/BnpC/BnpC_input/{{id}}.scDNACloneGenotype.tsv", - ctypes=f"{OUTDIR}/BnpC/BnpC_input/{{id}}.Barcodes.tsv", - conda: - "envs/BnpC.yml" - resources: - time = 1200, - mem_mb=2000 - params: - bnpc = BNPC_PATH, - outdir=f"{OUTDIR}/BnpC/BnpC_input/", - min_cells = config['BnpC']['min_cells_per_mut'], - min_cov = config['BnpC']['min_pos_cov'] - shell: - "python {params.bnpc}/libs/format_input.py --bin {input.bin} " - "--vaf {input.vaf} --scDNA {input.scDNA} --ctypes {input.ctypes} " - "--min_pos_cov {params.min_cov} --min_cells_per_mut {params.min_cells} " - "--outfile {params.outdir}/{wildcards.id}" - - -rule BnpC_clones: - input: - bin=f"{OUTDIR}/BnpC/BnpC_input/{{id}}.BinaryMatrix.tsv", - vaf=f"{OUTDIR}/BnpC/BnpC_input/{{id}}.VAFMatrix.tsv", - scDNA=f"{OUTDIR}/BnpC/BnpC_input/{{id}}.scDNACloneGenotype.tsv", - ctypes=f"{OUTDIR}/BnpC/BnpC_input/{{id}}.Barcodes.tsv", - output: - f"{OUTDIR}/BnpC/{OUTBNPC}/{{id}}/genoCluster_posterior_mean_raw.pdf" - threads: - 16 - resources: - time = 1200, - mem_mb=2000 - params: - bnpc = BNPC_PATH, - outdir=f"{OUTDIR}/BnpC/{OUTBNPC}/", - mcmc_steps = config['BnpC']['mcmc_steps'], - estimator = config['BnpC']['estimator'], - dpa = lambda w: config['BnpC']['dpa'][w.get("id")], - cup = config['BnpC']['cup'], - eup = config['BnpC']['eup'], - FP = config['BnpC']['FP'], - FN = config['BnpC']['FN'], - pp= config['BnpC']['pp'], - mut_order= lambda w: config['BnpC']['mut_order'][w.get("id")], - conda: - "envs/BnpC.yml" - shell: - "python {params.bnpc}/run_BnpC.py {input.bin} -n {threads} " - "-o {params.outdir}/{wildcards.id} -s {params.mcmc_steps} " - "-e {params.estimator} -cup {params.cup} -eup {params.eup} " - "-FP {params.FP} -FN {params.FN} -pp {params.pp} -ap {params.dpa} " - "--ctypes {input.ctypes} --scDNA {input.scDNA} --mut_order {params.mut_order}" diff --git a/snakefiles/CTATFusion.smk b/snakefiles/CTATFusion.smk deleted file mode 100644 index 7f9cead..0000000 --- a/snakefiles/CTATFusion.smk +++ /dev/null @@ -1,27 +0,0 @@ -OUTDIR=config['Global']['outdir'] -DATA=config['Global']['data'] -IDS=config['Global']['ids'] - - -rule all_CTATFusion: - input: - expand(f'{OUTDIR}/CTATFusion/{{id}}.fusion_of_interest.tsv', id=IDS) - - -rule CTATFusion: - input: - fastq = f'{DATA}/fastq/{{id}}.fastq.gz' - output: - f'{OUTDIR}/CTATFusion/{{id}}.fusion_of_interest.tsv' - threads: 32 - resources: - time = 1200, - mem_mb=97000 - singularity: - 'ctat_lr_fusion.v0.10.0.simg' - shell: - "ctat-LR-fusion " - "-T /data/fastq/{wildcards.id}_rawdedup.test.fa " - "--output {wildcards.id} " - "--genome_lib_dir /ref " - "--CPU {threads} --vis" diff --git a/snakefiles/CellTypeReannotation.smk b/snakefiles/CellTypeReannotation.smk deleted file mode 100644 index 90eb4ae..0000000 --- a/snakefiles/CellTypeReannotation.smk +++ /dev/null @@ -1,285 +0,0 @@ -### This snakemake pipeline first finds the High Confidence Cancer variants (HCCV) -### Then cells are reannotated based on their SNV/fusion HCCV mutation status - -import pandas as pd -CTYPES_REANNO = config['CellTypeReannotation']['celltypes'] -OUTDIR=config['Global']['outdir'] -DATA=config['Global']['data'] -IDS=config['Global']['ids'] -SCOMATIC_PATH=config['Global']['scomatic'] -CTATFUSION=config['Run']['ctatfusion'] - -#include: 'SComaticPoN.smk' -#include: 'CTATFusion.smk' - -def get_BetaBinEstimates(input, value): - df = pd.read_csv(input, sep='\t') - d = df.squeeze().to_dict() - return d[value] - -rule all_reanno: - input: - expand(f"{OUTDIR}/CellTypeReannotation/ReannotatedCellTypes/{{id}}.tsv", id=IDS), - expand(f"{OUTDIR}/CellTypeReannotation/BaseCellCalling/{{id}}.calling.step3.tsv", id=IDS), - -rule SplitBam_Reanno: - input: - bam = f"{DATA}/bam/{{id}}.bam", - bai = f"{DATA}/bam/{{id}}.bam.bai", - barcodes = f"{DATA}/ctypes/{{id}}.txt" - output: - expand("{OUTDIR}/CellTypeReannotation/SplitBam/{{id}}.{celltype}.bam", - celltype=CTYPES_REANNO, OUTDIR=[OUTDIR]) - resources: - mem_mb = 4096 - conda: - "envs/SComatic.yml" - params: - scomatic=SCOMATIC_PATH, - outdir=f"{OUTDIR}/CellTypeReannotation/SplitBam", - mapq=config['SComatic']['BaseCellCounter']['min_mapping_quality'] - shell: - "python {params.scomatic}/SplitBam/SplitBamCellTypes.py " - "--bam {input.bam} --meta {input.barcodes} --id {wildcards.id} " - "--outdir {params.outdir} --min_MQ {params.mapq}" - -rule BaseCellCounter_Reanno: - input: - bam=f"{OUTDIR}/CellTypeReannotation/SplitBam/{{id}}.{{celltype}}.bam" - output: - tsv=f"{OUTDIR}/CellTypeReannotation/BaseCellCounter/{{id}}/{{id}}.{{celltype}}.tsv", - tmp=temp(directory(f"{OUTDIR}/CellTypeReannotation/BaseCellCounter/{{id}}/temp_{{celltype}}/")) - threads: - 32 - resources: - time = 1200, - mem_mb = 1024 - conda: - "envs/SComatic.yml" - params: - outdir=f"{OUTDIR}/CellTypeReannotation/BaseCellCounter", - scomatic=SCOMATIC_PATH, - hg38=config['Global']['genome'], - chrom=config['SComatic']['BaseCellCounter']['chromosomes'], - mapq=config['SComatic']['BaseCellCounter']['min_mapping_quality'], - shell: - "python {params.scomatic}/BaseCellCounter/BaseCellCounter.py " - "--bam {input.bam} --ref {params.hg38} --chrom {params.chrom} " - "--out_folder {params.outdir}/{wildcards.id}/ --nprocs {threads} " - "--min_mq {params.mapq} --tmp_dir {output.tmp} " - -rule MergeCounts_Reanno: - input: - expand("{OUTDIR}/CellTypeReannotation/BaseCellCounter/{{id}}/{{id}}.{celltype}.tsv", - celltype=CTYPES_REANNO, OUTDIR=[OUTDIR]) - output: - tsv = f"{OUTDIR}/CellTypeReannotation/MergeCounts/{{id}}.BaseCellCounts.AllCellTypes.tsv" - resources: - time = 120, - mem_mb = 4096 - conda: - "envs/SComatic.yml" - params: - scomatic=SCOMATIC_PATH, - outdir=f"{OUTDIR}/CellTypeReannotation/BaseCellCounter", - shell: - "python {params.scomatic}/MergeCounts/MergeBaseCellCounts.py " - "--tsv_folder {params.outdir}/{wildcards.id}/ --outfile {output.tsv}" - -rule BaseCellCalling_step1_Reanno: - input: - bb = f"{OUTDIR}/PoN/PoN/BetaBinEstimates.txt", - tsv = f"{OUTDIR}/CellTypeReannotation/MergeCounts/{{id}}.BaseCellCounts.AllCellTypes.tsv" - output: - f"{OUTDIR}/CellTypeReannotation/BaseCellCalling/{{id}}.calling.step1.tsv" - conda: - "envs/SComatic.yml" - resources: - time = 120, - mem_mb = 4096 - params: - scomatic=SCOMATIC_PATH, - outdir=f"{OUTDIR}/CellTypeReannotation/BaseCellCalling", - hg38=config['Global']['genome'], - min_cell_types = config['SComatic']['BaseCellCalling']['Min_cell_types'], - alpha1 = lambda w, input: get_BetaBinEstimates(input.bb, 'alpha1'), - beta1 = lambda w, input: get_BetaBinEstimates(input.bb, 'beta1'), - alpha2 = lambda w, input: get_BetaBinEstimates(input.bb, 'alpha2'), - beta2 = lambda w, input: get_BetaBinEstimates(input.bb, 'beta2'), - shell: - "python {params.scomatic}/BaseCellCalling/BaseCellCalling.step1.py " - "--infile {input.tsv} --outfile {params.outdir}/{wildcards.id} " - "--ref {params.hg38} --min_cell_types {params.min_cell_types} " - "--alpha1 {params.alpha1} --beta1 {params.beta1} " - "--alpha2 {params.alpha2} --beta2 {params.beta2} " - -rule BaseCellCalling_step2_Reanno: - input: - tsv = f"{OUTDIR}/CellTypeReannotation/BaseCellCalling/{{id}}.calling.step1.tsv", - pon_LR = f"{OUTDIR}/PoN/PoN/PoN_LR.tsv" - output: - f"{OUTDIR}/CellTypeReannotation/BaseCellCalling/{{id}}.calling.step2.tsv" - conda: - "envs/SComatic.yml" - resources: - time = 120, - mem_mb = 4096 - params: - scomatic=SCOMATIC_PATH, - outdir=f"{OUTDIR}/CellTypeReannotation/BaseCellCalling", - RNA_editing = config['SComatic']['BaseCellCalling']['RNA_editing'], - min_distance = config['SComatic']['BaseCellCalling']['min_distance'], - pon_SR = config['SComatic']['BaseCellCalling']['PoN_SR'], - gnomAD_db = config['SComatic']['BaseCellCalling']['gnomAD_db'], - max_gnomAD_VAF = config['SComatic']['BaseCellCalling']['max_gnomAD_VAF'], - shell: - "python {params.scomatic}/BaseCellCalling/BaseCellCalling.step2.py " - "--infile {input.tsv} --outfile {params.outdir}/{wildcards.id} " - "--editing {params.RNA_editing} --min_distance {params.min_distance} " - "--pon_SR {params.pon_SR} --pon_LR {input.pon_LR} " - "--gnomAD_db {params.gnomAD_db} --gnomAD_max {params.max_gnomAD_VAF}" - - -rule HighConfidenceCancerVariants: - input: - tsv = f"{OUTDIR}/CellTypeReannotation/BaseCellCalling/{{id}}.calling.step2.tsv" - output: - f"{OUTDIR}/CellTypeReannotation/HCCV/{{id}}.HCCV.tsv" - conda: - "envs/SComatic.yml" - resources: - time = 120, - mem_mb = 4096 - params: - scomatic=SCOMATIC_PATH, - outdir=f"{OUTDIR}/CellTypeReannotation/HCCV", - min_dp=config['CellTypeReannotation']['HCCV']['min_depth'], - deltaVAF=config['CellTypeReannotation']['HCCV']['deltaVAF'], - deltaCCF=config['CellTypeReannotation']['HCCV']['deltaCCF'], - cancer = config['CellTypeReannotation']['cancer_ctype'], - noncancer = config['CellTypeReannotation']['noncancer_ctype'], - shell: - "python {params.scomatic}/HighConfidenceCancerVariants/HighConfidenceCancerVariants.py " - "--infile {input.tsv} --outfile {params.outdir}/{wildcards.id} " - "--min_dp {params.min_dp} --deltaVAF {params.deltaVAF} --deltaCCF {params.deltaCCF} " - "--cancer_ctype {params.cancer} --noncancer_ctype {params.noncancer}" - -rule HCCVSingleCellGenotype: - input: - tsv = f"{OUTDIR}/CellTypeReannotation/HCCV/{{id}}.HCCV.tsv", - bam = f"{DATA}/bam/{{id}}.bam", - barcodes = f"{DATA}/ctypes/{{id}}.txt", - bb = f"{OUTDIR}/PoN/PoN/BetaBinEstimates.txt", - output: - tsv=f"{OUTDIR}/CellTypeReannotation/HCCV/{{id}}.SingleCellGenotype.tsv", - tmp=temp(directory(f"{OUTDIR}/CellTypeReannotation/HCCV/{{id}}/")) - conda: - "envs/SComatic.yml" - threads: - 32 - resources: - time = 120, - mem_mb = 4096 - params: - scomatic=SCOMATIC_PATH, - hg38=config['Global']['genome'], - alt_flag= config['SComatic']['SingleCellGenotype']['alt_flag'], - mapq=config['SComatic']['BaseCellCounter']['min_mapping_quality'], - alpha2 = lambda w, input: get_BetaBinEstimates(input.bb, 'alpha2'), - beta2 = lambda w, input: get_BetaBinEstimates(input.bb, 'beta2'), - pval = config['SComatic']['SingleCellGenotype']['pvalue'], - chrm_conta = config['SComatic']['chrM_contaminant'] - shell: - "python {params.scomatic}/HighConfidenceCancerVariants/HCCVSingleCellGenotype.py " - "--bam {input.bam} --infile {input.tsv} --outfile {output.tsv} " - "--meta {input.barcodes} --alt_flag {params.alt_flag} --ref {params.hg38} " - "--nprocs {threads} --min_mq {params.mapq} --pvalue {params.pval} " - "--alpha2 {params.alpha2} --beta2 {params.beta2} " - "--chrM_contaminant {params.chrm_conta} --tmp_dir {output.tmp}" - -rule CellTypeReannotation: - input: - SNVs = f"{OUTDIR}/CellTypeReannotation/HCCV/{{id}}.SingleCellGenotype.tsv", - fusions = f'{OUTDIR}/CTATFusion/{{id}}.fusion_of_interest.tsv' if CTATFUSION else [], - barcodes = f"{DATA}/ctypes/{{id}}.txt" - output: - f"{OUTDIR}/CellTypeReannotation/ReannotatedCellTypes/{{id}}.tsv" - resources: - time = 1200, - mem_mb = 4096 - conda: - "envs/SComatic.yml" - params: - scomatic=SCOMATIC_PATH, - min_variants = config['CellTypeReannotation']['Reannotation']['min_variants'], - min_frac = config['CellTypeReannotation']['Reannotation']['min_fraction'] - shell: - "python {params.scomatic}/CellTypeReannotation/CellTypeReannotation.py " - "--SNVs {input.SNVs} --fusions {input.fusions} " - "--outfile {output} --meta {input.barcodes} " - "--min_variants {params.min_variants} --min_frac {params.min_frac}" - -rule BaseCellCalling_step3_Reanno: - input: - tsv = f"{OUTDIR}/CellTypeReannotation/BaseCellCalling/{{id}}.calling.step2.tsv" - output: - f"{OUTDIR}/CellTypeReannotation/BaseCellCalling/{{id}}.calling.step3.tsv" - conda: - "envs/SComatic.yml" - resources: - time = 120, - mem_mb = 4096 - params: - scomatic=SCOMATIC_PATH, - outdir=f"{OUTDIR}/CellTypeReannotation/BaseCellCalling", - deltaVAF=config['SComatic']['BaseCellCalling']['deltaVAF'], - deltaCCF=config['SComatic']['BaseCellCalling']['deltaCCF'], - cancer = config['CellTypeReannotation']['cancer_ctype'], - chrm_conta = config['CellTypeReannotation']['chrM_contaminant'], - min_ac_reads = config['SNVCalling']['min_ac_reads'], - clust_dist = config['SNVCalling']['clust_dist'], - shell: - "python {params.scomatic}/BaseCellCalling/BaseCellCalling.step3.py " - "--infile {input.tsv} --outfile {params.outdir}/{wildcards.id} --chrM_contaminant {params.chrm_conta} " - "--deltaVAF {params.deltaVAF} --deltaCCF {params.deltaCCF} --cancer_ctype {params.cancer} " - "--min_ac_reads {params.min_ac_reads} --clust_dist {params.clust_dist} " - - -rule SingleCellGenotype_Reanno: - input: - tsv = f"{OUTDIR}/CellTypeReannotation/BaseCellCalling/{{id}}.calling.step3.tsv", - bam = f"{DATA}/bam/{{id}}.bam", - barcodes = f"{OUTDIR}/CellTypeReannotation/ReannotatedCellTypes/{{id}}.tsv", - bb = f"{OUTDIR}/PoN/PoN/BetaBinEstimates.txt", - fusions = f'{OUTDIR}/CTATFusion/{{id}}.fusion_of_interest.tsv' if CTATFUSION else [], - output: - tsv=f"{OUTDIR}/CellTypeReannotation/SingleCellGenotype/{{id}}.SingleCellGenotype.tsv", - dp=f"{OUTDIR}/CellTypeReannotation/SingleCellGenotype/{{id}}.DpMatrix.tsv", - alt=f"{OUTDIR}/CellTypeReannotation/SingleCellGenotype/{{id}}.AltMatrix.tsv", - vaf=f"{OUTDIR}/CellTypeReannotation/SingleCellGenotype/{{id}}.VAFMatrix.tsv", - bin=f"{OUTDIR}/CellTypeReannotation/SingleCellGenotype/{{id}}.BinaryMatrix.tsv", - tmp=temp(directory(f"{OUTDIR}/CellTypeReannotation/SingleCellGenotype/{{id}}/")) - conda: - "envs/SComatic.yml" - threads: - 32 - resources: - time = 120, - mem_mb = 4096 - params: - scomatic=SCOMATIC_PATH, - outdir=f"{OUTDIR}/CellTypeReannotation/SingleCellGenotype", - hg38=config['Global']['genome'], - alt_flag= config['SComatic']['SingleCellGenotype']['alt_flag'], - mapq=config['SComatic']['BaseCellCounter']['min_mapping_quality'], - alpha2 = lambda w, input: get_BetaBinEstimates(input.bb, 'alpha2'), - beta2 = lambda w, input: get_BetaBinEstimates(input.bb, 'beta2'), - pval = config['SComatic']['SingleCellGenotype']['pvalue'], - chrm_conta = config['SComatic']['chrM_contaminant'], - shell: - "python {params.scomatic}/SingleCellGenotype/SingleCellGenotype.py " - "--infile {input.tsv} --outfile {params.outdir}/{wildcards.id} " - "--bam {input.bam} --meta {input.barcodes} --ref {params.hg38} --fusions {input.fusions} " - "--nprocs {threads} --min_mq {params.mapq} --pvalue {params.pval} " - "--alpha2 {params.alpha2} --beta2 {params.beta2} --alt_flag {params.alt_flag} " - "--chrM_contaminant {params.chrm_conta} --tmp_dir {output.tmp}" diff --git a/snakefiles/LongSom.smk b/snakefiles/LongSom.smk deleted file mode 100644 index 5923b8f..0000000 --- a/snakefiles/LongSom.smk +++ /dev/null @@ -1,40 +0,0 @@ -include: 'BnpC.smk' -include: 'CellTypeReannotation.smk' -include: 'CTATFusion.smk' -#include: 'FastqBarcoding.smk' -include: 'InferCopyNumbers.smk' -include: 'SComaticPoN.smk' -include: 'SNVCalling.smk' -# Global -OUTDIR=config['Global']['outdir'] -OUTBNPC=config['BnpC']['outdir'] -IDS=config['Global']['ids'] -# Run -REANNO=config['Run']['reanno'] -SCOMATIC=config['Run']['scomatic'] -ANNOVAR=config['Run']['annovar'] -BNPC=config['Run']['bnpc'] -CTATFUSION=config['Run']['ctatfusion'] -INFERCNV=config['Run']['infercnv'] -PLOT=config['Run']['plot'] -SCDNA=config['Run']['scdna'] - -rule all: - input: - # Reannotated cell types - expand(f"{OUTDIR}/CellTypeReannotation/ReannotatedCellTypes/{{id}}.tsv", id=IDS) if REANNO else [], - # Final SNV set - expand(f"{OUTDIR}/SNVCalling/BaseCellCalling/{{id}}.calling.step3.tsv", id=IDS) if SCOMATIC else [], - # SNVs annotation - expand(f"{OUTDIR}/SNVCalling/Annotations/{{id}}.hg38_multianno.txt", id=IDS) if ANNOVAR else [], - # Cell-Mutation matrix - expand(f"{OUTDIR}/SNVCalling/SingleCellGenotype/{{id}}.BinaryMatrix.tsv", id=IDS)if SCOMATIC else [], - # Clonal reconstruction - expand(f"{OUTDIR}/BnpC/{OUTBNPC}/{{id}}/genoCluster_posterior_mean_raw.pdf", id=IDS) if BNPC else [], - # Fusions - expand(f'{OUTDIR}/CTATFusion/{{id}}.fusion_of_interest.tsv', id=IDS) if CTATFUSION else [], - # CNVs - expand(f"{OUTDIR}/InferCNV/InferCNV/{{id}}/infercnv.17_HMM_predHMMi6.leiden.hmm_mode-subclusters.png", id = IDS) if INFERCNV else [], - # Plots - expand(f"{OUTDIR}/SNVCalling/ClusterMap/{{id}}.ClusterMap.Reannotation.pdf", id=IDS) if PLOT else [], - default_target: True diff --git a/snakefiles/SNVCalling.smk b/snakefiles/SNVCalling.smk deleted file mode 100644 index 53eeef7..0000000 --- a/snakefiles/SNVCalling.smk +++ /dev/null @@ -1,237 +0,0 @@ -import pandas as pd -CTYPES = config['SNVCalling']['celltypes'] -OUTDIR=config['Global']['outdir'] -DATA=config['Global']['data'] -IDS=config['Global']['ids'] -SCOMATIC_PATH=config['Global']['scomatic'] -CTATFUSION =config['Run']['ctatfusion'] - -#include: 'CellTypeReannotation.smk' - -import pandas as pd -def get_BetaBinEstimates(input, value): - df = pd.read_csv(input, sep='\t') - d = df.squeeze().to_dict() - return d[value] - -rule all_SNVCalling: - input: - expand(f"{OUTDIR}/SNVCalling/ClusterMap/{{id}}.ClusterMap.Reannotation.pdf", - id=IDS), - expand(f"{OUTDIR}/SNVCalling/BaseCellCalling/{{id}}.calling.step3.tsv", - id=IDS), - -rule PatientSpecificPoN: - input: - f"{OUTDIR}/PoN/PoN/PoN_LR.tsv" - output: - f"{OUTDIR}/PoN/PoN/{{id}}.PoN_LR.tsv" - shell: - "grep -v {wildcards.id} {input} > {output}" - -rule SplitBam: - input: - bam = f"{DATA}/bam/{{id}}.bam", - bai = f"{DATA}/bam/{{id}}.bam.bai", - barcodes = f"{OUTDIR}/CellTypeReannotation/ReannotatedCellTypes/{{id}}.tsv", - output: - expand("{OUTDIR}/SNVCalling/SplitBam/{{id}}.{celltype}.bam", - celltype=CTYPES, OUTDIR=[OUTDIR]) - resources: - mem_mb = 32000 - conda: - "envs/SComatic.yml" - params: - scomatic=SCOMATIC_PATH, - outdir=f"{OUTDIR}/SNVCalling/SplitBam", - mapq=config['SComatic']['BaseCellCounter']['min_mapping_quality'] - shell: - "python {params.scomatic}/SplitBam/SplitBamCellTypes.py " - "--bam {input.bam} --meta {input.barcodes} --id {wildcards.id} " - "--outdir {params.outdir} --min_MQ {params.mapq}" - -rule BaseCellCounter: - input: - bam=f"{OUTDIR}/SNVCalling/SplitBam/{{id}}.{{celltype}}.bam" - output: - tsv=f"{OUTDIR}/SNVCalling/BaseCellCounter/{{id}}/{{id}}.{{celltype}}.tsv", - tmp=temp(directory(f"{OUTDIR}/SNVCalling/BaseCellCounter/{{id}}/temp_{{celltype}}/")) - threads: - 32 - resources: - time = 1200, - mem_mb = 1000 - conda: - "envs/SComatic.yml" - params: - outdir=f"{OUTDIR}/SNVCalling/BaseCellCounter", - scomatic=SCOMATIC_PATH, - hg38=config['Global']['genome'], - chrom=config['SComatic']['BaseCellCounter']['chromosomes'], - mapq=config['SComatic']['BaseCellCounter']['min_mapping_quality'], - shell: - "python {params.scomatic}/BaseCellCounter/BaseCellCounter.py " - "--bam {input.bam} --ref {params.hg38} --chrom {params.chrom} " - "--out_folder {params.outdir}/{wildcards.id}/ --nprocs {threads} " - "--min_mq {params.mapq} --tmp_dir {output.tmp} " - -rule MergeCounts: - input: - expand("{OUTDIR}/SNVCalling/BaseCellCounter/{{id}}/{{id}}.{celltype}.tsv", - celltype=CTYPES, OUTDIR=[OUTDIR]) - output: - tsv = f"{OUTDIR}/SNVCalling/MergeCounts/{{id}}.BaseCellCounts.AllCellTypes.tsv" - resources: - time = 120, - mem_mb = 8000 - conda: - "envs/SComatic.yml" - params: - scomatic=SCOMATIC_PATH, - outdir=f"{OUTDIR}/SNVCalling/BaseCellCounter", - shell: - "python {params.scomatic}/MergeCounts/MergeBaseCellCounts.py " - "--tsv_folder {params.outdir}/{wildcards.id}/ --outfile {output.tsv}" - -rule BaseCellCalling_step1: - input: - bb = f"{OUTDIR}/PoN/PoN/BetaBinEstimates.txt", - tsv = f"{OUTDIR}/SNVCalling/MergeCounts/{{id}}.BaseCellCounts.AllCellTypes.tsv" - output: - f"{OUTDIR}/SNVCalling/BaseCellCalling/{{id}}.calling.step1.tsv" - conda: - "envs/SComatic.yml" - resources: - time = 120, - mem_mb = 8000 - params: - scomatic=SCOMATIC_PATH, - outdir=f"{OUTDIR}/SNVCalling/BaseCellCalling", - hg38=config['Global']['genome'], - min_cell_types = config['SComatic']['BaseCellCalling']['Min_cell_types'], - alpha1 = lambda w, input: get_BetaBinEstimates(input.bb, 'alpha1'), - beta1 = lambda w, input: get_BetaBinEstimates(input.bb, 'beta1'), - alpha2 = lambda w, input: get_BetaBinEstimates(input.bb, 'alpha2'), - beta2 = lambda w, input: get_BetaBinEstimates(input.bb, 'beta2'), - min_ac_reads = config['SComatic']['BaseCellCalling']['min_ac_reads'], - min_ac_cells = config['SComatic']['BaseCellCalling']['min_ac_cells'], - shell: - "python {params.scomatic}/BaseCellCalling/BaseCellCalling.step1.py " - "--infile {input.tsv} --outfile {params.outdir}/{wildcards.id} " - "--ref {params.hg38} --min_cell_types {params.min_cell_types} " - "--min_ac_reads {params.min_ac_reads} --min_ac_cells {params.min_ac_cells} " - "--alpha1 {params.alpha1} --beta1 {params.beta1} " - "--alpha2 {params.alpha2} --beta2 {params.beta2} " - -rule BaseCellCalling_step2: - input: - tsv = f"{OUTDIR}/SNVCalling/BaseCellCalling/{{id}}.calling.step1.tsv", - pon_LR = f"{OUTDIR}/PoN/PoN/{{id}}.PoN_LR.tsv" - output: - f"{OUTDIR}/SNVCalling/BaseCellCalling/{{id}}.calling.step2.tsv" - conda: - "envs/SComatic.yml" - resources: - time = 120, - mem_mb = 8000 - params: - scomatic=SCOMATIC_PATH, - outdir=f"{OUTDIR}/SNVCalling/BaseCellCalling", - RNA_editing = config['SComatic']['BaseCellCalling']['RNA_editing'], - min_distance = config['SComatic']['BaseCellCalling']['min_distance'], - pon_SR = config['SComatic']['BaseCellCalling']['PoN_SR'], - gnomAD_db = config['SComatic']['BaseCellCalling']['gnomAD_db'], - max_gnomAD_VAF = config['SComatic']['BaseCellCalling']['max_gnomAD_VAF'], - shell: - "python {params.scomatic}/BaseCellCalling/BaseCellCalling.step2.py " - "--infile {input.tsv} --outfile {params.outdir}/{wildcards.id} " - "--editing {params.RNA_editing} --min_distance {params.min_distance} " - "--pon_SR {params.pon_SR} --pon_LR {input.pon_LR} " - "--gnomAD_db {params.gnomAD_db} --gnomAD_max {params.max_gnomAD_VAF}" - - -rule BaseCellCalling_step3: - input: - tsv = f"{OUTDIR}/SNVCalling/BaseCellCalling/{{id}}.calling.step2.tsv" - output: - f"{OUTDIR}/SNVCalling/BaseCellCalling/{{id}}.calling.step3.tsv" - conda: - "envs/SComatic.yml" - resources: - time = 120, - mem_mb = 8000 - params: - scomatic=SCOMATIC_PATH, - outdir=f"{OUTDIR}/SNVCalling/BaseCellCalling", - deltaVAF=config['SComatic']['BaseCellCalling']['deltaVAF'], - deltaCCF=config['SComatic']['BaseCellCalling']['deltaCCF'], - cancer = config['SNVCalling']['cancer_ctype'], - chrm_conta = config['SComatic']['chrM_contaminant'], - min_ac_reads = config['SNVCalling']['min_ac_reads'], - min_ac_cells = config['SNVCalling']['min_ac_cells'], - clust_dist = config['SNVCalling']['clust_dist'], - shell: - "python {params.scomatic}/BaseCellCalling/BaseCellCalling.step3.py " - "--infile {input.tsv} --outfile {params.outdir}/{wildcards.id} --chrM_contaminant {params.chrm_conta} " - "--deltaVAF {params.deltaVAF} --deltaCCF {params.deltaCCF} --cancer_ctype {params.cancer} " - "--min_ac_reads {params.min_ac_reads} --min_ac_cells {params.min_ac_cells} " - "--clust_dist {params.clust_dist} " - -rule SingleCellGenotype: - input: - tsv = f"{OUTDIR}/SNVCalling/BaseCellCalling/{{id}}.calling.step3.tsv", - bam = f"{DATA}/bam/{{id}}.bam", - barcodes = f"{OUTDIR}/CellTypeReannotation/ReannotatedCellTypes/{{id}}.tsv", - bb = f"{OUTDIR}/PoN/PoN/BetaBinEstimates.txt", - fusions = f'{OUTDIR}/CTATFusion/{{id}}.fusion_of_interest.tsv' if CTATFUSION else [], - output: - tsv=f"{OUTDIR}/SNVCalling/SingleCellGenotype/{{id}}.SingleCellGenotype.tsv", - dp=f"{OUTDIR}/SNVCalling/SingleCellGenotype/{{id}}.DpMatrix.tsv", - alt=f"{OUTDIR}/SNVCalling/SingleCellGenotype/{{id}}.AltMatrix.tsv", - vaf=f"{OUTDIR}/SNVCalling/SingleCellGenotype/{{id}}.VAFMatrix.tsv", - bin=f"{OUTDIR}/SNVCalling/SingleCellGenotype/{{id}}.BinaryMatrix.tsv", - tmp=temp(directory(f"{OUTDIR}/SNVCalling/SingleCellGenotype/{{id}}/")) - conda: - "envs/SComatic.yml" - threads: - 32 - resources: - time = 120, - mem_mb = 1000 - params: - scomatic=SCOMATIC_PATH, - outdir=f"{OUTDIR}/SNVCalling/SingleCellGenotype", - hg38=config['Global']['genome'], - alt_flag= config['SComatic']['SingleCellGenotype']['alt_flag'], - mapq=config['SComatic']['BaseCellCounter']['min_mapping_quality'], - alpha2 = lambda w, input: get_BetaBinEstimates(input.bb, 'alpha2'), - beta2 = lambda w, input: get_BetaBinEstimates(input.bb, 'beta2'), - pval = config['SComatic']['SingleCellGenotype']['pvalue'], - chrm_conta = config['SComatic']['chrM_contaminant'], - shell: - "python {params.scomatic}/SingleCellGenotype/SingleCellGenotype.py " - "--infile {input.tsv} --outfile {params.outdir}/{wildcards.id} " - "--bam {input.bam} --meta {input.barcodes} --ref {params.hg38} --fusions {input.fusions} " - "--nprocs {threads} --min_mq {params.mapq} --pvalue {params.pval} " - "--alpha2 {params.alpha2} --beta2 {params.beta2} --alt_flag {params.alt_flag} " - "--chrM_contaminant {params.chrm_conta} --tmp_dir {output.tmp}" - -rule clustermap: - input: - bin = f"{OUTDIR}/SNVCalling/SingleCellGenotype/{{id}}.BinaryMatrix.tsv", - ctypes = f"{OUTDIR}/CellTypeReannotation/ReannotatedCellTypes/{{id}}.tsv" - output: - f"{OUTDIR}/SNVCalling/ClusterMap/{{id}}.ClusterMap.Reannotation.pdf", - f"{OUTDIR}/SNVCalling/ClusterMap/{{id}}.ClusterMap.NoReannotation.pdf", - conda: - "envs/BnpC.yml" - params: - scomatic=SCOMATIC_PATH, - outdir=f"{OUTDIR}/SNVCalling/ClusterMap", - height = config['SNVCalling']['clustermap']['height'], - width = config['SNVCalling']['clustermap']['width'], - shell: - "python {params.scomatic}/ClusterMap/ClusterMap.py " - "--bin {input.bin} --ctypes {input.ctypes} " - "--height {params.height} --width {params.width} " - "--out_dir {params.outdir}/{wildcards.id}" diff --git a/tools/BnpC/.gitignore b/tools/BnpC/.gitignore deleted file mode 100644 index bb989c2..0000000 --- a/tools/BnpC/.gitignore +++ /dev/null @@ -1,132 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -pip-wheel-metadata/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -.python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# BnpC specific -example_data/* \ No newline at end of file diff --git a/tools/BnpC/README.md b/tools/BnpC/README.md deleted file mode 100644 index 61d7a06..0000000 --- a/tools/BnpC/README.md +++ /dev/null @@ -1,130 +0,0 @@ -# BnpC -Bayesian non-parametric clustering (BnpC) of binary data with missing values and uneven error rates. - -BnpC is a novel non-parametric method to cluster individual cells into clones and infer their genotypes based on their noisy mutation profiles. -BnpC employs a Chinese Restaurant Process prior to handle the unknown number of clonal populations. The model introduces a combination of Gibbs sampling, a modified non-conjugate split-merge move and Metropolis-Hastings updates to explore the joint posterior space of all parameters. Furthermore, it employs a novel estimator, which accounts for the shape of the posterior distribution, to predict the clones and genotypes. - -The corresponsing paper can be found in [Bioinformatics](https://doi.org/10.1093/bioinformatics/btaa599 "Borgsmueller et al.") - -# Contents -- [Installation](#Installation) -- [Usage](#Usage) -- [Example data](#Example-data) - -# Requirements -- Python 3.X - -# Installation -## Clone repository -First, download BnpC from github and change to the directory: -```bash -git clone https://github.com/cbg-ethz/BnpC -cd BnpC -``` - -## Create conda environment (optional) -First, create a new environment named "BnpC": -```bash -conda create --name BnpC python=3 -``` - -Second, source it: -```bash -conda activate BnpC -``` - -## Install requirements -Use pip to install the requirements: -```bash -python -m pip install -r requirements.txt -``` - -Now you are ready to run **BnpC**! - -# Usage -The BnpC wrapper script `run_BnpC.py` can be run with the following shell command: -```bash -python run_BnpC.py [-t] [-FN] [-FP] [-FN_m] [-FN_sd] [-FP_m] [-FP_sd] [-dpa] [-pp] [-n] [-s] [-r] [-ls] [-b] [-smp] [-cup] [-e] [-sc] [--seed] [-o] [-v] [-np] [-tr] [-tc] [-td]] -``` - -## Input -BnpC requires a binary matrix as input, where each row corresponds with a mutations and each columns with a cell. -All matrix entries must be of the following: 0|1|3/" ", where 0 indicates the absence of a mutation, 1 the presence, and a 3 or empty element a missing value. - -> ## Note -> If your data is arranged in the transposed way (cells = columns, rows = mutations), use the `-t` argument. - -## Arguments -### Input Data Arguments -- ``, Path to the input data. -- `-t `, If set, the input matrix is transposed. - -### Optional input arguments (for simulated data) -- `-tr `, Path to the mutation tree file (in .gv format) used for data generation. -- `-tc `, Path to the true clusters assignments to compare clustering methods. -- `-td `, Path to the true/raw data/genotypes. - -### Model Arguments -- `-FN `, Replace with the fixed error rate for false negatives. -- `-FP `, Replace with the fixed error rate for false positives. -- `-FN_m `, Replace with the mean for the prior for the false negative rate. -- `-FN_sd `, Replace with the standard deviation for the prior for the false negative rate. -- `-FP_m `, Replace with the mean for the prior for the false positive rate. -- `-FP_sd `, Replace with the standard deviation for the prior for the false positive rate. -- `-ap `, Alpha value of the Beta function used as prior for the concentration parameter of the CRP. -- `-pp `, Beta function shape parameters used for the cluster parameter prior. - -> ## Note -> If you run BnpC on panel data with **few mutation** only or on **error free** data, we recommend changing the `-pp` argument to beta distribution closer to uniform, like `-pp 0.75 0.75` or even `-pp 1 1`. Otherwise, BnpC will incorrectly report many singleton clusters. - -### MCMC Arguments -- `-n `, Number of MCMC chains to run in parallel (1 chain per thread). -- `-s `, Number of MCMC steps. -- `-r `, Runtime in minutes. If set, steps argument is overwritten. -- `-ls `, Lugsail batch means estimator as convergence diagnostics [Vats and Flegal, 2018]. -- `-b `, Ratio of MCMC steps discarded as burn-in. -- `-cup `, Probability of updating the CRP concentration parameter. -- `-eup `, Probability to do update the error rates in An MCMC step. -- `-smp `, Probability to do a split/merge step instead of Gibbs sampling. -- `-sms `, Number of intermediate, restricted Gibbs steps in the split-merge move. -- `-smr `, Ratio of splits/merges in the split merge move. -- `-e +`, Estimator(s) for inferrence. If more than one, seperate by space. Options = posterior|ML|MAP. -- `-sc `, If set, infer a result for each chain individually (instead of from all chains together). -- `--seed `, Seed used for random number generation. - -### Output Arguments -- `-o `, Path to an output directory. -- `-np `, If set, no plots are generated. -- `-v `, Stdout verbosity level. Options = 0|1|2. - -# Example data - -Lets employ the toy dataset that one can find in the `data` folder (data.csv) to understand the functionality of the different arguments. First go to the folder and activate the environment: - - cd /path/to/crp_clustering - conda activate environment_name - -BnpC can run in three different settings: -1. Number of steps. Runs for the given number of MCMC steps. Arument: -s -2. Running time limit. Every MCMC the time is tracked and the method stops after the introduced time is achieved. Argument: -r -3. Lugsail for convergence diagnosis. The chain is terminated if the estimator undercuts a threshold defined by a significance level of 0.05 and a user defined float between [0,1], comparable to the half-width of the confidence interval in sample size calculation for a one sample t-test. Reasonal values = 0.1, 0.2, 0.3. Argument: -ls - -The simplest way to run the BnpC is to leave every argument as default and hence only the path to the data needs to be given. In this case BnpC runs in the setting 1. -```bash -python run_BnpC.py example_data/data.csv -``` -If the error rates are known for a particular sequenced data (e.g FP = 0.0001 and FN = 0.3), one can run BnpC with fixed error rates by: -```bash -python run_BnpC.py example_data/data.csv -FP 0.0001 -FN 0.3 -``` -On the other hand, if errors are not known one can leave it blank as in the first case or if there is some intuition add the mean and standard deviation priors for the method to learn them: -```bash -python run_BnpC.py example_data/data.csv -FP_m 0.0001 -FN_m 0.3 -FP_sd 0.000001 -FN_sd 0.05 -``` -Additional MCMC arguments can be employed to allow faster convergence. Among other options: -- Reduce burnin to include more posterior samples in the estimation. Example: -b 0.2, discard 20 % of the total MCMC steps. -- Adapt split-merge probability to better explore the posterior landscape. Example: -smp 0.33, 1 out of every 3 steps will be a split-merge move on average. -- Adjust the Dirchlet Process alpha which accounts for the probability of starting a new cluster. Example: -dpa 10. Increasing the value, leads to a larger probability of starting a new cluster in the cell assignment step. - - - diff --git a/tools/BnpC/requirements.txt b/tools/BnpC/requirements.txt deleted file mode 100644 index 419dfd7..0000000 --- a/tools/BnpC/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -numpy==1.26.4 -Bottleneck==1.3.5 -pandas==2.2.2 -scikit-learn==1.5.0 -scipy==1.14.0 -seaborn==0.12.0 diff --git a/tools/BnpC/requirements_plotting.txt b/tools/BnpC/requirements_plotting.txt deleted file mode 100644 index bb854aa..0000000 --- a/tools/BnpC/requirements_plotting.txt +++ /dev/null @@ -1,4 +0,0 @@ -numpy==1.26.4 -pandas==2.2.2 -seaborn==0.12.0 -pywaffle \ No newline at end of file diff --git a/tools/SComatic/PoNs/PoN.scRNAseq.hg38.no_chr_prefix.tsv.gz b/tools/SComatic/PoNs/PoN.scRNAseq.hg38.no_chr_prefix.tsv.gz deleted file mode 100644 index c34e3dc..0000000 Binary files a/tools/SComatic/PoNs/PoN.scRNAseq.hg38.no_chr_prefix.tsv.gz and /dev/null differ diff --git a/tools/SComatic/docs/Algorithm.jpeg b/tools/SComatic/docs/Algorithm.jpeg deleted file mode 100644 index 82cac0d..0000000 Binary files a/tools/SComatic/docs/Algorithm.jpeg and /dev/null differ diff --git a/tools/SComatic/docs/Granularity_plot.png b/tools/SComatic/docs/Granularity_plot.png deleted file mode 100644 index c39f6d5..0000000 Binary files a/tools/SComatic/docs/Granularity_plot.png and /dev/null differ diff --git a/tools/SComatic/docs/OtherFunctionalities.md b/tools/SComatic/docs/OtherFunctionalities.md deleted file mode 100644 index 8552b3a..0000000 --- a/tools/SComatic/docs/OtherFunctionalities.md +++ /dev/null @@ -1,174 +0,0 @@ -# Other SComatic functionalities - -## Computing the number of callable sites per cell type - -``` -python scripts/GetCallableSites/GetAllCallableSites.py --help -usage: GetAllCallableSites.py [-h] --infile INFILE --outfile OUTFILE - [--max_cov MAX_COV] - [--min_cell_types MIN_CELL_TYPES] - -Script to calculate the number of callable sites per cell type - -optional arguments: - -h, --help show this help message and exit - --infile INFILE Tsv file obtained by BaseCellCalling.step1.py to be - analysed - --outfile OUTFILE Out file prefix - --max_cov MAX_COV Maximum coverage to record in the callable sites - table. Greater values will be collapsed to the - provided one. [Default: 150] - --min_cell_types MIN_CELL_TYPES - Minimum number of cell types with enough - coverage/cells at a site to be considered as a - callable [Default: 2] -``` - -**Example:** check [here](/docs/SComaticExample.md) to see how to run this script with an example sample. - -## Computing the number of callable sites per single cell - -``` -python scripts/SitesPerCell/SitesPerCell.py --help -usage: SitesPerCell.py [-h] --bam BAM --ref REF [--infile INFILE] - [--min_ct1 MIN_CT1] [--min_ct2 MIN_CT2] - [--out_folder OUT_FOLDER] [--id ID] [--nprocs NPROCS] - [--bin BIN] [--min_dp MIN_DP] [--min_cc MIN_CC] - [--min_bq MIN_BQ] [--min_mq MIN_MQ] [--tmp_dir TMP_DIR] - -Script to calculate the number of callable sites per unique cell - -optional arguments: - -h, --help show this help message and exit - --bam BAM Tumor bam file to be analysed - --ref REF Reference genome. *fai must be available in the same - folder as reference - --infile INFILE Base calling file (obtained by - BaseCellCalling.step1.py) - --min_ct1 MIN_CT1 Minimum number of cell types with enough reads to - consider a genomic site. Default = 2 - --min_ct2 MIN_CT2 Minimum number of cell types with enough unique cell - counts to consider a genomic site. Default = 2 - --out_folder OUT_FOLDER - Out folder - --id ID Prefix of out file. If provided, please use next - format: *.[cell_type] . Example: sample1.t_cell. If - not provided, it is taken from bam file - --nprocs NPROCS Number of processes [Default = 1] - --bin BIN Bin size for running the analysis [Default 50000] - --min_dp MIN_DP Minimum coverage to consider the genomic site. Default - = 5 - --min_cc MIN_CC Minimum number of cells required to consider a genomic - site. Default = 5 - --min_bq MIN_BQ Minimum base quality permited for the base counts. - Default = 20 - --min_mq MIN_MQ Minimum mapping quality required to analyse read. - Default = 255 - --tmp_dir TMP_DIR Temporary folder for tmp files -``` - -**Example:** check [here](/docs/SComaticExample.md) to see how to run this script with an example sample. - -## Computing the genotype for each cell at the variant sites - -``` -python scripts/SingleCellGenotype/SingleCellGenotype.py --help -usage: SingleCellGenotype.py [-h] --bam BAM --infile INFILE --ref REF --meta - META [--out_file OUT_FILE] [--alt_flag {Alt,All}] - [--nprocs NPROCS] [--bin BIN] [--min_bq MIN_BQ] - [--min_mq MIN_MQ] [--tissue TISSUE] - [--tmp_dir TMP_DIR] - -Script to get the alleles observed in each unique cell for the variant sites - -optional arguments: - -h, --help show this help message and exit - --bam BAM Tumor bam file to be analysed - --infile INFILE Base calling file (obtained by - BaseCellCalling.step2.py), ideally only the PASS - variants - --ref REF Reference genome. *fai must be available in the same - folder as reference - --meta META Metadata with cell barcodes per cell type - --out_file OUT_FILE Out file - --alt_flag {Alt,All} Flag to search for cells carrying the expected alt - variant (Alt) or all cells independent of the alt - allele observed (All) - --nprocs NPROCS Number of processes [Default = 1] - --bin BIN Bin size for running the analysis [Default 50000] - --min_bq MIN_BQ Minimum base quality permited for the base counts. - Default = 30 - --min_mq MIN_MQ Minimum mapping quality required to analyse read. - Default = 255 - --tissue TISSUE Tissue of the sample - --tmp_dir TMP_DIR Temporary folder for tmp files -``` - -**Example:** check [here](/docs/SComaticExample.md) to see how to run this script with an example sample. - -Description of the output [here](https://github.com/cortes-ciriano-lab/SComatic/blob/main/docs/faqs.md#7-how-to-interpret-the-singlecellgenotypepy-output) - -## Computing the trinucleotide context background - -``` -python scripts/TrinucleotideBackground/TrinucleotideContextBackground.py --help -usage: TrinucleotideContextBackground.py [-h] --in_tsv IN_TSV --out_file - OUT_FILE - -Script to obtain trincucleotide context background - -optional arguments: - -h, --help show this help message and exit - --in_tsv IN_TSV File listing the tsv files to be used for the - trinucleotide context background computation (files - obtained in BaseCellCalling.step1.py) - --out_file OUT_FILE Output file - -``` - -**Example:** check [here](/docs/SComaticExample.md) to see how to run this script with an example sample. - -## Computing germline genotypes for known variants in single-cell datasets - -This script is developed to genotype known polymorphic sites in the single-cell data. It takes as input the output of step 4.1 of the SComatic tool (obtained by the BaseCellCalling.step1.py script) and produces, by default, a new TSV file (*GermlineCalls.SComatic.tsv) with the germline variant status per site and sample. If the *--ethnicity Yes* parameter is specified, it produces an extra TSV file (*GermlineCalls.SComatic.ethnicity.tsv) required for running other tools, such as scAI-SNP. - -The commands for running this tool are as follows: - -``` -python scripts/GermlineGenotyper/GermlineGenotyper.py --help -usage: scripts/GermlineGenotyper/GermlineGenotyper.py [-h] --snp SNP --tsv TSV --prefix PREFIX - [--min_reads MIN_READS] - [--min_cells MIN_CELLS] - [--min_cells_types_exp MIN_CELLS_TYPES_EXP] - [--min_cov MIN_COV] - [--min_total_cells MIN_TOTAL_CELLS] - [--ethnicity {Yes,No}] - -Script to call germline variants in a list of polymorphic sites - -optional arguments: - -h, --help show this help message and exit - --snp SNP File with SNPs to check. Format: chr:pos:ref:alt - --tsv TSV TSV file obtained by the - BaseCellCalling/BaseCellCalling.step1.py script - --prefix PREFIX Prefix for the out files. Full path and prefix - recommended - --min_reads MIN_READS - Minimum number of reads supporting the alternative - allele [Default = 3] - --min_cells MIN_CELLS - Minimum number of cells harbouring the alternative - allele [Default = 3] - --min_cells_types_exp MIN_CELLS_TYPES_EXP - Minimum number of cell types with enough expression to - get a calls [Default = 1] - --min_cov MIN_COV Minimum number of reads covering the variant site. - [Default = 5 ] - --min_total_cells MIN_TOTAL_CELLS - Minimum number of cells with at least one read - covering the variant site. [Default = 5] - --ethnicity {Yes,No} Specify if ethnicity format file should be created. - [Default: No] -``` - - diff --git a/tools/SComatic/docs/SComaticExample.md b/tools/SComatic/docs/SComaticExample.md deleted file mode 100644 index 635904e..0000000 --- a/tools/SComatic/docs/SComaticExample.md +++ /dev/null @@ -1,192 +0,0 @@ -# Example of how to run SComatic -This document shows an example of how to run SComatic in a scRNA-seq sample (10x). The provided bam file expands a small fraction of human chromosome 10 (Hg38) and harbours 5 somatic mutations (5 SNVs). The scripts used here are all available at the SComatic/scripts folder, and the example files are located in SComatic/example_data. - -## Prepare files and environment - -Activate conda environment if needed -``` -conda activate SComatic -``` - -Create an output folder and go to the main SComatic folder -``` -SCOMATIC=$PWD -output_dir=$SCOMATIC/example_data/results -mkdir -p $output_dir -``` - -If reference genome is not unpacked and indexed (.fai), you have to do it before running SComatic -``` -gunzip $SCOMATIC/example_data/chr10.fa.gz -samtools faidx $SCOMATIC/example_data/chr10.fa -``` - -## Step 1: Splitting alignment file in cell type specific bams -``` -sample=Example -output_dir1=$output_dir/Step1_BamCellTypes -mkdir -p $output_dir1 - -python $SCOMATIC/scripts/SplitBam/SplitBamCellTypes.py --bam $SCOMATIC/example_data/Example.scrnaseq.bam \ - --meta $SCOMATIC/example_data/Example.cell_barcode_annotations.tsv \ - --id ${sample} \ - --n_trim 5 \ - --max_nM 5 \ - --max_NH 1 \ - --outdir $output_dir1 -``` - -## Step 2: Collecting base count information - -``` -REF=$SCOMATIC/example_data/chr10.fa - -output_dir2=$output_dir/Step2_BaseCellCounts -mkdir -p $output_dir2 - -for bam in $(ls -d $output_dir1/*bam);do - - # Cell type - cell_type=$(basename $bam | awk -F'.' '{print $(NF-1)}') - - # Temp folder - temp=$output_dir2/temp_${cell_type} - mkdir -p $temp - - # Command line to submit to cluster - python $SCOMATIC/scripts/BaseCellCounter/BaseCellCounter.py --bam $bam \ - --ref $REF \ - --chrom all \ - --out_folder $output_dir2 \ - --min_bq 30 \ - --tmp_dir $temp \ - --nprocs 1 - - rm -rf $temp -done -``` - -In our experience, when running SComatic in an HPC cluster it is most efficient to compute the base count information for all cell types at once, specially when submitting each python line independently and at the same time to the cluster, and using multiple processors (p.e. --nprocs 16) - -## Step 3: Merging base count matrices -``` -sample=Example -output_dir3=$output_dir/Step3_BaseCellCountsMerged -mkdir -p $output_dir3 - -python $SCOMATIC/scripts/MergeCounts/MergeBaseCellCounts.py --tsv_folder ${output_dir2} \ - --outfile ${output_dir3}/${sample}.BaseCellCounts.AllCellTypes.tsv -``` - -## Step 4: Detection of somatic mutations - -#### Step 4.1 & Step 4.2 -``` -# Step 4.1 -output_dir4=$output_dir/Step4_VariantCalling -mkdir -p $output_dir4 - -sample=Example -REF=$SCOMATIC/example_data/chr10.fa - -python $SCOMATIC/scripts/BaseCellCalling/BaseCellCalling.step1.py \ - --infile ${output_dir3}/${sample}.BaseCellCounts.AllCellTypes.tsv \ - --outfile ${output_dir4}/${sample} \ - --ref $REF - -# Step 4.2 -editing=$SCOMATIC/RNAediting/AllEditingSites.hg38.txt -PON=$SCOMATIC/PoNs/PoN.scRNAseq.hg38.tsv - -python $SCOMATIC/scripts/BaseCellCalling/BaseCellCalling.step2.py \ - --infile ${output_dir4}/${sample}.calling.step1.tsv \ - --outfile ${output_dir4}/${sample} \ - --editing $editing \ - --pon $PON -``` - -As done in the datasets analysed in our study, we suggest intersecting the variants with the high quality regions of the human genome (provided in *SComatic/bed_files_of_interest/UCSC.k100_umap.without.repeatmasker.bed*). Hence, if you want to do this intersection and keep the somatic mutations that passed all filters, you can do it with a simple command: - -``` -bedtools intersect -header -a ${output_dir4}/${sample}.calling.step2.tsv -b $SCOMATIC/bed_files_of_interest/UCSC.k100_umap.without.repeatmasker.bed | awk '$1 ~ /^#/ || $6 == "PASS"' > ${output_dir4}/${sample}.calling.step2.pass.tsv -``` - -## Other SComatic functionalities - -#### Computing the number of callable sites per cell type -``` -sample=Example -output_dir5=$output_dir/CellTypeCallableSites -mkdir -p $output_dir5 - -python $SCOMATIC/scripts/GetCallableSites/GetAllCallableSites.py --infile $output_dir4/${sample}.calling.step1.tsv \ - --outfile $output_dir5/${sample} \ - --max_cov 150 --min_cell_types 2 -``` - -#### Computing the number of callable sites per cell -``` -sample=Example -REF=$SCOMATIC/example_data/chr10.fa -STEP4_1=$output_dir4/${sample}.calling.step1.tsv - -output_dir6=$output_dir/UniqueCellCallableSites -mkdir -p $output_dir6 - -for bam in $(ls -d $output_dir1/*bam);do - cell_type=$(basename $bam | awk -F'.' '{print $(NF-1)}') - echo $cell_type - - temp=$output_dir6/temp_${cell_type} - mkdir -p $temp - - python $SCOMATIC/scripts/SitesPerCell/SitesPerCell.py --bam $bam \ - --infile $output_dir4/${sample}.calling.step1.tsv \ - --ref $REF \ - --out_folder $output_dir6 --tmp_dir $temp --nprocs 1 - echo -done -``` - -#### Computing the genotype for each cell at the variant sites -``` -META=$SCOMATIC/example_data/Example.cell_barcode_annotations.tsv -sample=Example -REF=$SCOMATIC/example_data/chr10.fa -STEP4_2_pass=${output_dir4}/${sample}.calling.step2.pass.tsv - -output_dir7=$output_dir/SingleCellAlleles -mkdir -p $output_dir7 - -for bam in $(ls -d $output_dir1/*bam);do - cell_type=$(basename $bam | awk -F'.' '{print $(NF-1)}') - - temp=$output_dir7/temp_${cell_type} - mkdir -p $temp - - python $SCOMATIC/scripts/SingleCellGenotype/SingleCellGenotype.py --bam $bam \ - --infile ${STEP4_2_pass} \ - --nprocs 1 \ - --meta $META \ - --outfile ${output_dir7}/${cell_type}.single_cell_genotype.tsv \ - --tmp_dir $temp \ - --ref $REF - - rm -rf $temp -done -``` - -#### Computing the trinucleotide context background - -``` -sample=Example -output_dir8=$output_dir/TrinucleotideContext -output_dir4=$output_dir/Step4_VariantCalling # Already defined in previous steps -mkdir -p $output_dir8 - -echo ${output_dir4}/${sample}.calling.step1.tsv > ${output_dir8}/step1_files.txt - -python $SCOMATIC/scripts/TrinucleotideBackground/TrinucleotideContextBackground.py \ - --in_tsv ${output_dir8}/step1_files.txt \ - --out_file ${output_dir8}/TrinucleotideBackground.txt -``` diff --git a/tools/SComatic/docs/betabinomialestimation.md b/tools/SComatic/docs/betabinomialestimation.md deleted file mode 100644 index f34d768..0000000 --- a/tools/SComatic/docs/betabinomialestimation.md +++ /dev/null @@ -1,37 +0,0 @@ -## Estimating parameters for the beta-binomial distribution - -To estimate the Beta-binomial distribution parameters in new data, the user needs to provide (--in_tsv) a tsv (or txt) file listing the full path to all desired sample-cell-type (ideally cancer-free samples) obtained by BaseCellCounter.py. The file should look like next: - -``` -/path/to/sample1.epithelial.tsv -/path/to/sample1.T_cells.step1.tsv -/path/to/sample1.myeloid.step1.tsv -/path/to/sample2.epithelial.tsv -/path/to/sample2.T_cells.step1.tsv -/path/to/sample2.myeloid.step1.tsv -/path/to/sample3.epithelial.tsv -/path/to/sample3.T_cells.step1.tsv -/path/to/sample3.myeloid.step1.tsv -``` - -The python script takes these parameters: - - -``` -python scripts/BetaBinEstimation/BetaBinEstimation.py --help -usage: BetaBinEstimation.py [-h] --in_tsv IN_TSV --outfile OUTFILE - [--n_sites N_SITES] [--seed SEED] - -Script to estimate the Beta-binomial distribution parameters (alpha and beta) -to be used afterwards in the BaseCellCalling.step1.py - -optional arguments: - -h, --help show this help message and exit - --in_tsv IN_TSV File listing the tsv files to be used for the beta- - binomial fitting (obtained with BaseCellCounter.py - script) - --outfile OUTFILE Report with the estimated Beta-binomial parameters - --n_sites N_SITES Approximate number of sites to be used for fitting the - Beta-binomial distribution [Default: 500000] - --seed SEED Random seed for computation [Default: 1992] -``` diff --git a/tools/SComatic/docs/faqs.md b/tools/SComatic/docs/faqs.md deleted file mode 100644 index ff96540..0000000 --- a/tools/SComatic/docs/faqs.md +++ /dev/null @@ -1,137 +0,0 @@ -# FAQs - Frequently asked questions - -## 1. Are the SComatic parameters for scATAC-seq data the same as for scRNA-seq data? -No, they are different. ATAC-seq data is a DNA-based approach. Therefore, there are differences in the way of processing the bam files. These are the parameters that you should change in comparison to the scRNA-seq workflow: -- **Step 1**: remove the _--max_nM_ and _--max_NH_ parameters, set the mapping quality filter to _--min_MQ_ 30 . -- **Step 2**: set the mapping quality filter to _--min_mq_ 30 -- **Step 4.2**: Remove the _--editing_ parameter, and _--pon_ altered to point at the scATACseq PoN provided in this GitHub repo (or custom one) - -## 2. How can we perform the variant annotation with the SComatic output? -In our manuscript, we have annotated all our variants using [annovar](https://annovar.openbioinformatics.org/en/latest/). With a couple of command lines, we can adapt our variant calling output for this annotation tool. Using our example data, we should run these lines: - -**1. Preparing SComatic output for annovar** - -``` -ANNOVAR=/path/to/annovar -hummandb=/path/to/annovar_humandb_hg38 - -grep -v '#' Example.calling.step2.pass.tsv | tr '\t' '-' | awk -F'-' -v OFS='\t' '{print $1,$2,$3,$4,$5,$0}' > sample.variants.avinput -``` - -**2. Annotate variants using the annovar-ready variant file** - -``` -perl $ANNOVAR/table_annovar.pl sample.variants.avinput \ - $hummandb -buildver hg38 \ - -out sample.variants.annovar \ - -remove -protocol refGene,cytoBand,exac03,avsnp147,dbnsfp30a,gnomad_genome -operation g,r,f,f,f,f \ - -nastring . -csvout -polish --otherinfo -``` - -This step will generate a comma-separated file (.csv) called _sample.variants.annovar.hg38_multianno.csv_ , which can be easily opened and processed in your more desired software (R, excel...). The output should look like this: - -``` -Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,cytoBand,ExAC_ALL,ExAC_AFR,ExAC_AMR,ExAC_EAS,ExAC_FIN,ExAC_NFE,ExAC_OTH,ExAC_SAS,avsnp147,SIFT_score,SIFT_pred,Polyphen2_HDIV_score,Polyphen2_HDIV_pred,Polyphen2_HVAR_score,Polyphen2_HVAR_pred,LRT_score,LRT_pred,MutationTaster_score,MutationTaster_pred,MutationAssessor_score,MutationAssessor_pred,FATHMM_score,FATHMM_pred,PROVEAN_score,PROVEAN_pred,VEST3_score,CADD_raw,CADD_phred,DANN_score,fathmm-MKL_coding_score,fathmm-MKL_coding_pred,MetaSVM_score,MetaSVM_pred,MetaLR_score,MetaLR_pred,integrated_fitCons_score,integrated_confidence_value,GERP++_RS,phyloP7way_vertebrate,phyloP20way_mammalian,phastCons7way_vertebrate,phastCons20way_mammalian,SiPhy_29way_logOdds,gnomAD_genome_ALL,gnomAD_genome_AFR,gnomAD_genome_AMR,gnomAD_genome_ASJ,gnomAD_genome_EAS,gnomAD_genome_FIN,gnomAD_genome_NFE,gnomAD_genome_OTH,Otherinfo -chr10,29559501,29559501,A,T,"intronic","SVIL",.,.,.,"10p11.23",.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,"chr10-29559501-29559501-A-T-PASS-Epithelial_cells-ATTCA-TTGAT-1-12-5-4-2-0.3333-0.4-0.0-0.0001-2-2-0;38;1-0;16;1-.-PASS-DP|NC|CC|BC|BQ|BCf|BCr-NA-12|5|3:0:2:0:0:0|8:0:4:0:0:0|306:0:164:0:0:0|0:0:0:0:0:0|8:0:4:0:0:0-30|13|13:0:0:0:0:0|30:0:0:0:0:0|1189:0:0:0:0:0|0:0:0:0:0:0|30:0:0:0:0:0" -chr10,73413109,73413109,G,A,"intronic","ANXA7",.,.,.,"10q22.2",.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,"chr10-73413109-73413109-G-A-PASS-Epithelial_cells-TGGAG-CATGG-1-12-8-4-3-0.3333-0.375-0.0-0.0-2-2-0;15;1-0;11;1-.-PASS-DP|NC|CC|BC|BQ|BCf|BCr-NA-12|8|3:0:0:5:0:0|4:0:0:8:0:0|160:0:0:302:0:0|0:0:0:0:0:0|4:0:0:8:0:0-7|6|0:0:0:6:0:0|0:0:0:7:0:0|0:0:0:270:0:0|0:0:0:0:0:0|0:0:0:7:0:0" -chr10,89413978,89413978,G,A,"upstream","IFIT5","dist=590",.,.,"10q23.31",.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,"chr10-89413978-89413978-G-A-PASS-Epithelial_cells-GTATG-GTGTT-1-18-11-7-4-0.3889-0.3636-0.0-0.0-2-2-0;20;1-0;12;1-.-PASS-DP|NC|CC|BC|BQ|BCf|BCr-NA-18|11|4:0:0:7:0:0|7:0:0:11:0:0|270:0:0:417:0:0|0:0:0:0:0:0|7:0:0:11:0:0-9|5|0:0:0:5:0:0|0:0:0:9:0:0|0:0:0:339:0:0|0:0:0:0:0:0|0:0:0:9:0:0" -chr10,97332670,97332670,G,A,"UTR3","FRAT2","NM_012083:c.*1201C>T",.,.,"10q24.1",.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,"chr10-97332670-97332670-G-A-PASS-Epithelial_cells-TTGGG-GTGGC-1-12-11-6-6-0.5-0.5455-0.0-0.0-2-2-0;12;1-0;11;1-.-PASS-DP|NC|CC|BC|BQ|BCf|BCr-6|6|0:0:0:6:0:0|0:0:0:6:0:0|0:0:0:237:0:0|0:0:0:0:0:0|0:0:0:6:0:0-12|11|6:0:0:5:0:0|6:0:0:6:0:0|229:0:0:225:0:0|0:0:0:0:0:0|6:0:0:6:0:0-NA" -chr10,126926205,126926205,G,A,"intronic","DOCK1",.,.,.,"10q26.2",.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,"chr10-126926205-126926205-G-A-PASS-Epithelial_cells-GATCC-GTGAC-1-35-17-15-6-0.4286-0.3529-0.0-0.0-2-2-0;35;1-0;18;1-.-PASS-DP|NC|CC|BC|BQ|BCf|BCr-NA-35|17|6:0:0:12:0:0|15:0:0:20:0:0|603:0:0:786:0:0|15:0:0:20:0:0|0:0:0:0:0:0-15|7|0:0:0:7:0:0|0:0:0:15:0:0|0:0:0:590:0:0|0:0:0:15:0:0|0:0:0:0:0:0" -``` - -As you will notice, the first few columns of the file have the annotation information provided by _annovar_ (Gene, region, impact, Gnomad allele frequencies...). Importantly, the final column of the csv file (_Otherinfo_) shows all the info found in the original _Example.calling.step2.pass.tsv_, but separated by the symbol "-" . The column names of each one of these _Otherinfo_ items are the same as the ones found in the _Example.calling.step2.pass.tsv_ : - -``` -> grep '^#CHROM' Example.calling.step2.pass.tsv - -#CHROM Start End REF ALT FILTER Cell_types Up_context Down_context N_ALT Dp Nc Bc Cc VAF CCF BCp CCp Cell_types_min_BC Cell_types_min_CC Rest_BC Rest_CC Fisher_p Cell_type_Filter INFO Myeloids Epithelial_cells Stromal_cells - -``` - -Of course, depending on the columns you want to keep for the annotation process, you might slightly change the command line in the _step 1_ described above, as well as the parameters provided in the annovar computation. The parameters provided here are the ones used in our manuscript. - -## 3. Can SComatic work with other types of PoN files? -Yes, SComatic can deal with PoNs obtained by other tools. - -The *--pon* parameter permits working with different types of formats/files depending on the availability and quantity of non-neoplastic samples. These are the main options: - -* Using the [PoNs](/PoNs/) provided in this repository, which were computed using the data described in the main manuscript of SComatic. -* Using your custom PoNs, which can be built using [SComatic modules](/docs/pon.md). -* TSV file listing the mutations detected by running SComatic (output of *Detection of somatic mutations: Step 4.2*) on scRNA-seq data from e.g., matched non-neoplastic cells. -* Custom PoN in VCF format (unzipped) generated by running a variant caller, such as *GATK-HaplotypeCaller*, on DNA sequencing (WES or WGS) data, such as a matched normal sample or a set of unmatched germline samples. -* Using a PoN in VCF format (unzipped) generated by other tools like [*GATK*](https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON-) - -## 4. Can we use the calls from other callers to genotype unique cells using SComatic? -Yes, SComatic allows us to use calls obtained by other callers (in vcf format) to check the presence of each mutation at single-cell resolution. However, a minor edit is required to transform the vcf format into the SComatic format. Here is the required command line to be run in the vcf: - -``` -# Prepare SComatic input from an external vcf for the script SingleCellGenotype.py -VCF=/path/to/sample.vcf -grep -v '^#' $VCF | awk -F'\t' -v OFS='\t' '{print $1,$2,$2,$4,$5,".","Cell_type",".",".",".",".",".",".","."}' > Mutations.other_caller.tsv - -SCOMATIC=/path/to/SComatic -cell_type_bam=/path/to/cell_type.bam -META=/path/to/cell_type_annotations.tsv -REF=/path/to/reference_genome.fasta -python $SCOMATIC/scripts/SingleCellGenotype/SingleCellGenotype.py --bam $cell_type_bam \ - --infile Mutations.other_caller.tsv \ - --nprocs 16 \ - --meta $META \ - --outfile Mutations.other_caller.single_cell_genotype.tsv \ - --tmp_dir temp \ - --ref $REF -``` - -## 5. How do different cell type labels (e.g. different levels of granularity) affect the SComatic performance? -SComatic can be run using different levels of granularity in terms of cell type annotations. - -As illustrated in the schematic below, in the case of non-neoplastic samples the relatedness between cell types from a development perspective determines which types of mutations can be detected. Using very granular cell type annotations that consider e.g. two cell types from the same differentiation hierarchy as different cell types, will remove any mutation present in the progenitor common to both cell types (assuming that sufficient sampling of all cell types in the data set analysed is achieved). As a result, only mutations acquired after clonal diversification will be detected (marked in green in the schematic). In contrast, mutations acquired in progenitor/stem cells and during early development will be discounted based on the fact that they will be present in all descendant cells (marked in red in the schematic). It follows from the preceding that mutations acquired very early during development will likely be considered germline based on their presence in multiple cell types when calling mutations using diverse tissue types from the same individual. By contrast, using broader cell type annotations (e.g., epithelial, immune, etc) permits the detection of mutations accumulated over longer periods of time in e.g., adult stem cells up to the point of lineage diversification. Overall, determining the granularity of the cell type annotations depends on the biological question of interest. We note that SComatic can be run using cell type annotations of variable granularity easily, which should enhance the applicability of our algorithm to diverse research areas. - -
- -
- - -For the analysis of cancer samples, we note that the effect of cell type granularity is less relevant as compared to the analysis of non-neoplastic samples, in that somatic mutations are only expected to be detected in the cancer cells. Thus, unless cell state annotations are used for the cancer cells, mutations present in cancer cells would be readily detected when running SComatic if one cell type annotation is used for all cancer cells and non-neoplastic cell types are included to remove germline polymorphisms. - -## 6. What does it happen if CellRanger does not properly trim all non-genomic sequences (adapters) from the reads? -This might be an issue and could increase the number of false positive calls in the final call set. If this is the case, you can use the parameters _--n_trim NUMER_OF_BASES_TO_IGNORE_ in _Step1_ (_SplitBamCellTypes.py_) to ignore the first and last bases of each read. In addition, when running the [_CellRanger count_](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/tutorial_ct) tool, provide the _–-chemistry_ parameter as precisely as possible. It will help to remove these non-desired regions. - -## 7. How to interpret the SingleCellGenotype.py output? - -This script provides base count and allele information for all cells with reads covering each variant site. By default, all cells (mutated and non-mutated) will be present in the output file (see `--alt_flag` parameter to change it). However, it is important to understand the meaning of each one of the columns of this output file to properly interpret these results. Here a short description: - -| Column | Description | -| --- | --- | -| #CHROM | Chromosome carrying the mutation | -| Start | Start genomic coordinate | -| End | End genomic coordinate | -| REF | Reference allele | -| ALT_expected | Alternative allele as described in the input file (`--infile`) | -| Cell_type_expected | Cell types harbouring the mutation as described in the input file (`--infile`) | -| Num_cells_expected | Number of expected cells carrying the mutation as described in the input file (`--infile`) | -| CB | Unique cell barcode analysed | -| Cell_type_observed | Cell type attributed to the analysed _CB_ according to the input metadata file (`--meta`) | -| Base_observed | Allele observed in this _CB_ | -| Num_reads | Number of reads carrying the _Base_observed_ | - -
- -Let's understand this table with an example. Looking at our [SComatic example data](https://github.com/cortes-ciriano-lab/SComatic/blob/main/docs/SComaticExample.md), we will focus on the variant site _chr10-29559501_ and the _SComatic/example_data/results/SingleCellAlleles/Epithelial_cells.single_cell_genotype.tsv_ file generated. - -``` -#CHROM Start End REF ALT_expected Cell_type_expected Num_cells_expected CB Cell_type_observed Base_observed Num_reads -chr10 29559501 29559501 A T Epithelial_cells 2 AGTCTTTGTGCATCTA Epithelial_cells A 5 -chr10 29559501 29559501 A T Epithelial_cells 2 CCCTCCTAGGCTAGGT Epithelial_cells A 1 -chr10 29559501 29559501 A T Epithelial_cells 2 GGGTCTGTCTTGAGGT Epithelial_cells T 2 -chr10 29559501 29559501 A T Epithelial_cells 2 GTCCTCAAGGCTCATT Epithelial_cells T 2 -chr10 29559501 29559501 A T Epithelial_cells 2 GAGTCCGAGGGTGTTG Epithelial_cells A 2 -``` - -The columns `ALT_expected`, `Cell_type_expected` and `Num_cells_expected` correspond to the values observed in the `--infile Example.calling.step2.pass.tsv`, so they represent the calls at cell type resolution. - -In contrast, the columns `CB`, `Cell_type_observed`, `Base_observed` and `Num_reads` correspond to the allele observations at unique cell resolution when interrogating the bam files. - -Each CB can be presented in the output file in as many rows as different alleles are found per cell, although in most cases, we only observed one allele per cell (so one row per unique CB). In order to find the alleles harbouring the called mutation, we have to look for those rows (unique CBs) where `ALT_expeced == Base_observed` and `Cell_type_expected == Cell_type_observed`. In general terms, CBs not accomplishing these conditions can be understood as noise or non-mutated cells. - -Although this script is designed to be run with the SComatic calls, it can also be run with an [external vcf file](https://github.com/cortes-ciriano-lab/SComatic/edit/main/docs/faqs.md#4-can-we-use-the-calls-from-other-callers-to-genotype-unique-cells-using-scomatic). As in most of the vcf-based cases we do not know the `Cell_type_expected`, we cannot check the `Cell_type_expected == Cell_type_observed` condition. However, we still can check the `ALT_expeced == Base_observed` condition. diff --git a/tools/SComatic/docs/pon.md b/tools/SComatic/docs/pon.md deleted file mode 100644 index b50fa3d..0000000 --- a/tools/SComatic/docs/pon.md +++ /dev/null @@ -1,32 +0,0 @@ -## Panel of normals (PoN) -To build your Panel of normals (PoN), the user needs to provide (--in_tsv) a tsv (or txt) file listing the full path to all the files/samples to be included and obtained by BaseCellCalling.step1.py. The file should look like next: - -``` -/path/to/sample1.basecalling.step1.tsv -/path/to/sample2.basecalling.step1.tsv -/path/to/sample3.basecalling.step1.tsv -/path/to/sample4.basecalling.step1.tsv -/path/to/sample5.basecalling.step1.tsv -``` - -The python script takes these parameters: - -``` -python scripts/PoN/PoN.py --help -usage: PoN.py [-h] --in_tsv IN_TSV --out_file OUT_FILE - [--min_samples MIN_SAMPLES] [--rm_prefix {Yes,No}] - -Script to build a SComatic Panel Of Normals (PoNs) - -optional arguments: - -h, --help show this help message and exit - --in_tsv IN_TSV File with tsv files to be used for final PoN - construction (ideally files obtained in - BaseCellCalling.step1.py) - --out_file OUT_FILE PoN output file name - --min_samples MIN_SAMPLES - Minimum number of significant samples to consider a - site in the PoN. [Default: 2] - --rm_prefix {Yes,No} Remove chr prefix from input files (Yes) or no (No) - [Default: Yes] -``` diff --git a/tools/SComatic/scripts/GermlineGenotyper/GermlineGenotyper.py b/tools/SComatic/scripts/GermlineGenotyper/GermlineGenotyper.py deleted file mode 100755 index 5612344..0000000 --- a/tools/SComatic/scripts/GermlineGenotyper/GermlineGenotyper.py +++ /dev/null @@ -1,330 +0,0 @@ -import pandas as pd -import pybedtools -import subprocess -import math -import argparse -import timeit -import os - -#---------- -# Script to extract germline information from SComatic output -#---------- - -''' -List of functions -''' -# Function to build a dictionary with the sites to check (SNPs) -def build_dict(snp, window): - DICT_germ = {} - with open(snp, 'r') as f: - for line in f: - if not line.startswith('#'): - # Coordinates and variant info - line = line.rstrip('\n') - ID=line - CHROM,POS,REF,ALT = line.split(':') - POS = int(POS) - - WIND = math.floor(POS / float(window)) - - if not CHROM in DICT_germ.keys(): - DICT_germ[CHROM] = {} - DICT_germ[CHROM][WIND] = {} - DICT_germ[CHROM][WIND][POS] = [REF,ALT] - else: - if not WIND in DICT_germ[CHROM].keys(): - DICT_germ[CHROM][WIND] = {} - DICT_germ[CHROM][WIND][POS] = [REF,ALT] - else: - DICT_germ[CHROM][WIND][POS] = [REF,ALT] - return (DICT_germ) - - -# Function to perform the germline variant calling -def GermlineCalling(out2,out3,min_reads,min_cells,min_cov,min_total_cells,min_cell_types_exp, DICT_germ,window): - outfile3= open(out3,'w') - - # Write header - HEADER = ['MUT_ID','CHROM', 'POS', 'REF', 'ALT', 'GT', 'Germline_filter', 'Cell_types', 'Cell_types_min_BC', 'Cell_types_min_CC', 'Read_depth', 'Read_count_ALT', 'VAF', 'Cell_depth', 'Cell_count_ALT', 'CCF', 'SComatic_filter'] - HEADER = "\t".join(HEADER) - outfile3.write(HEADER + '\n') - - # Run for all variants - with open(out2, 'r') as f: - for line in f: - if line.startswith('#'): - pass - #outfile.write(line) - else: - line = line.rstrip('\n') - elements = line.split('\t') - - # Important variables to consider - CHROM = elements[0] - POS = int(elements[2]) - REF = elements[3] - ALT = elements[4] - FILTER = elements[5] - Cell_types = elements[6] - N_ALT = elements[9] - Dp = elements[10] - Nc = elements[11] - Bc = elements[12] - Bc2 = Bc.replace('|',',') - Cc = elements[13] - Cc2 = Cc.replace('|',',') - VAF = elements[14] - CCF = elements[15] - Cell_types_min_BC = elements[18] - Cell_types_min_CC = elements[19] - Rest_BC = elements[20] - Rest_CC = elements[21] - - # Sum rest of bases - Rest_BCs = [float(x) for x in Rest_BC.split(';')][:-1] - Rest_ACs = int(Rest_BCs[0]) - Rest_BCs = sum(Rest_BCs) - Rest_CCs = [float(x) for x in Rest_CC.split(';')][:-1] - Rest_ACCs = int(Rest_CCs[0]) - Rest_CCs = sum(Rest_CCs) - - # Unique alternative alleles - ALTs = set(ALT.split(',')) - ALTs = ",".join(ALTs) - - # Compute window and extract the expected - WIND = math.floor(POS / float(window)) - try: - REF_snp,ALT_snp = DICT_germ[CHROM][WIND][POS] - except: - continue - MUT_ID = f"{CHROM}:{POS}:{REF_snp}:{ALT_snp}" - - # Check if there is existing alternative allele - # Filter column - New_filters = [] - if (ALT == "."): - Dps = Rest_BCs - Ncs = Rest_CCs - Bcs = Rest_ACs - Ccs = Rest_ACCs - VAFall = round(Bcs/Dps,2) - CCFall = round(Ccs/Ncs,2) - GT = "0/0" - - # Common filters - if Dps < min_cov: - New_filters.append('Low_cov') - # Total coverage (cells) - if Ncs < min_total_cells: - New_filters.append('Low_total_cells') - # Check the number of cell types carrying it - if int(Cell_types_min_BC) < min_cell_types_exp and int(Cell_types_min_CC) < min_cell_types_exp: - New_filters.append('Low_cell_type_exp') - else: - # Sum of coverage for the cell types supporting the variant - Dps = sum([int(x) for x in Dp.split(',')]) - Dps = Dps + Rest_BCs - # Sum of cells coveraged for the cell types supporting the variant - Ncs = sum([int(x) for x in Nc.split(',')]) - Ncs = Ncs + Rest_CCs - # Sum of reads with the mutations within the cell types with the variant - Bcs = sum([int(x) for x in Bc2.split(',')]) - # Sum of cells with the mutations within the cell types with the variant - Ccs = sum([int(x) for x in Cc2.split(',')]) - # N cell types - N_cell_types_mutated = len(ALT.split(',')) - # All vafs - VAFall = round(float(Bcs) / Dps,2) - # All CCF - CCFall = round(float(Ccs) / Ncs,2) - - - # Check few filters - # Total coverage (reads) - if Dps < min_cov: - New_filters.append('Low_cov') - # Total coverage (cells) - if Ncs < min_total_cells: - New_filters.append('Low_total_cells') - # Check the number of cell types carrying it - if int(Cell_types_min_BC) < min_cell_types_exp and int(Cell_types_min_CC) < min_cell_types_exp: - New_filters.append('Low_cell_type_exp') - # Min Bc and Cc >= 3 - if Bcs < min_reads: - New_filters.append('Low_read_support') - if Ccs < min_cells: - New_filters.append('Low_cell_support') - # Total VAF >= 0.1 - if (VAFall < 0.1 and CCFall < 0.1): - New_filters.append('Low_VAF_CCF') - # Check if ALT is the expected one - if ALT_snp not in ALTs: - New_filters.append('ALT_not_expected') - if int(N_ALT) > 1 or "|" in ALT: - New_filters.append('Multi_allelic') - - # Genotype - if (VAFall >= 0.1 and VAFall < 0.9): - GT = "0/1" - elif (VAFall >= 0.9): - GT = "1/1" - else: - GT = "0/0" - - # Merge filters in a single string - New_filters = ",".join(New_filters) - if (New_filters == '' and GT != "0/0"): - New_filters = "PASS" - elif (New_filters == '' and GT == "0/0"): - New_filters = "." - - # Final column - INFO = [MUT_ID,CHROM, str(POS), REF, ALTs, GT, New_filters, Cell_types, str(Cell_types_min_BC), str(Cell_types_min_CC), str(Dps), str(Bcs), str(VAFall), str(Ncs), str(Ccs), str(CCFall), FILTER] - INFO = "\t".join(INFO) - outfile3.write(INFO + '\n') - - outfile3.close() - - -def GenotypeEthnicity(GT,Germline_filter): - dict_gt = {"0/0": 0, "0/1": 0.5, "1/1": 1} - if Germline_filter == "PASS" or Germline_filter == ".": - GT_new = dict_gt[GT] - else: - GT_new = "NA" - - return GT_new - -def EthnicityFormat(out3,snp,out4): - - # Mandatory rows - SNP = pd.read_csv(snp, header = None) - SNP.columns = ['MUT_ID'] - - # Open germline calls file - OUT3 = pd.read_csv(out3, sep = '\t') - OUT3 = OUT3[['MUT_ID','GT','Germline_filter']] - - # Merge results - result = pd.merge(SNP, OUT3, on="MUT_ID", how = "left", sort = False) - - # Encode genotype for the ethnicity computation - result['GT_new'] = result.apply(lambda row : GenotypeEthnicity(row['GT'], - row['Germline_filter']), axis = 1) - - # Get the column of interest - result = result[['GT_new']] - - # Save the results - result.to_csv(out4, sep = '\t', index = False, header = False) - -''' -Arguments required for the computation of TelFusDetector -''' - -def initialize_parser(): - parser = argparse.ArgumentParser(description='Script to call germline variants in a list of polymorphic sites') - parser.add_argument('--snp', type=str, help='File with SNPs to check. Format: chr:pos:ref:alt', required = True) - parser.add_argument('--tsv', type=str, help='TSV file obtained by the BaseCellCalling/BaseCellCalling.step1.py script', required = True) - parser.add_argument('--prefix', default = '.', help='Prefix for the out files. Full path and prefix recommended', required = True) - parser.add_argument('--min_reads', type=int, default = 3, help='Minimum number of reads supporting the alternative allele [Default = 3]', required = False) - parser.add_argument('--min_cells', type=int, default = 3, help='Minimum number of cells harbouring the alternative allele [Default = 3]', required = False) - parser.add_argument('--min_cells_types_exp', type=int, default = 1, help='Minimum number of cell types with enough expression to get a calls [Default = 1]', required = False) - parser.add_argument('--min_cov', type=int, default = 5, help='Minimum number of reads covering the variant site. [Default = 5 ]', required = False) - parser.add_argument('--min_total_cells', type=int, default = 5, help='Minimum number of cells with at least one read covering the variant site. [Default = 5]', required = False) - parser.add_argument('--ethnicity', choices = ["Yes","No"], default = False, help='Specify if ethnicity format file should be created. [Default: No]', required = False) - return (parser) - - -''' -Main computation -''' -def main(): - - #------------ - # Get arguments - #------------ - - parser = initialize_parser() - args = parser.parse_args() - - snp = args.snp - tsv = args.tsv - prefix = args.prefix - min_reads = args.min_reads - min_cells = args.min_cells - min_cov = args.min_cov - min_total_cells = args.min_total_cells - min_cell_types_exp = args.min_cells_types_exp - - - #------------ - # 1. Preparing and processing input data - #------------ - - print ("1. Preparing and processing input data\n") - - # 0. Transform SNP file to BED and dictionary file - out1 = f"{prefix}.snp.bed" - - SNP = pd.read_csv(snp, sep = ':', header = None) - SNP.columns = ['CHROM','POS','REF','ALT'] - SNP['START'] = SNP['POS'] #- 1 - - BED = SNP[["CHROM", "START","POS"]] - BED.to_csv(out1, sep = '\t', index = False, header = False) - - # 1. Create a dictionary with the sites to check - window = 50000 - DICT_germ = build_dict(snp,window) - - # 2. Intersect SComatic file with desired SNPs - out2 = f"{prefix}.intersected.SComatic.tsv" - - # Command to run - command = f"bedtools intersect -header -u -a {tsv} -b {out1} > {out2}" - - # Submit linux command - try: - subprocess.run(command, shell=True) - except subprocess.CalledProcessError as error: - print(error) - - #------------ - # 2. Running germline variant calling - #------------ - - print ("2. Running germline variant calling\n") - - # Output file - out3 = f"{prefix}.GermlineCalls.SComatic.tsv" - GermlineCalling(out2,out3,min_reads,min_cells,min_cov,min_total_cells,min_cell_types_exp,DICT_germ,window) - - #------------ - # 3. Prepare the output for ethnicity - #------------ - - if (args.ethnicity == "Yes"): - print ("3. Preparing calls for the ethnicity prediction tool\n") - - # Output file - out4 = f"{prefix}.GermlineCalls.SComatic.ethnicity.tsv" - EthnicityFormat(out3,snp,out4) - - #------------ - # Removing temp files - #------------ - os.remove(out1) - os.remove(out2) - -#--------------- -# Running the script -#--------------- - -if __name__ == '__main__': - start = timeit.default_timer() - main() - stop = timeit.default_timer() - Seconds = round(stop - start) - print(f"Computation time: {Seconds} seconds\n") diff --git a/tools/SComatic/scripts/GetCallableSites/GetAllCallableSites.py b/tools/SComatic/scripts/GetCallableSites/GetAllCallableSites.py deleted file mode 100644 index d898663..0000000 --- a/tools/SComatic/scripts/GetCallableSites/GetAllCallableSites.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/python - -import numpy as np -import timeit -import os -import math -import pysam -import argparse -from scipy.stats import betabinom -import scipy.stats as stats -import pandas as pd - -def reinit_template(diff_cell_types,min_reads,min_cells): - # Creates empty dicctionary for coverage counts - TEMPLATE = {min_reads:0, min_cells:0, 5: 0, 10: 0, 20: 0,30: 0} - INFORMATIVE_POSITIONS_TEMPLATE = {x:{'NC':TEMPLATE.copy(),'DP':TEMPLATE.copy()} for x in diff_cell_types} - return(INFORMATIVE_POSITIONS_TEMPLATE, TEMPLATE) - -def callable_sites(file,min_cell_types,MAX): - Alleles = ["A","C","T","G","I","D","N","O"] - VARIANT_POSITIONS = {} - CHR_INFORMATIVE_POSITIONS = {} - - cur_chr = 'z' - - counter = 0 - with open(file, 'r') as f: - for line in f: - if line.startswith('##'): - pass - - else: - if line.startswith('#CHROM') and counter == 0: - line = line.rstrip('\n') - - # Create dictionary with cell types and indexes for each one - elements = line.split('\t') - cell_types_idx = {x : elements[x] for x in range(len(elements)) if x > 24} # First 4 columns are common across all lines ['#CHROM', 'Start','End', 'REF', 'INFO']. Up to the 25th, they are lines that are created during the first step of the variant calling - - diff_cell_types = list(cell_types_idx.values()) - - for cell_type in diff_cell_types: - CHR_INFORMATIVE_POSITIONS[cell_type] = {} - - else: - counter = counter + 1 - line = line.rstrip('\n') - - # Create dictionary with cell types and indexes for each one - elements = line.split('\t') - - # Chromosome - CHROM = str(elements[0]) - if (CHROM != cur_chr): - for cell_type in CHR_INFORMATIVE_POSITIONS.keys(): # Create templates (setting values to 0 for the counting) - CHR_INFORMATIVE_POSITIONS[cell_type][CHROM] = {} - CHR_INFORMATIVE_POSITIONS[cell_type][CHROM] = {x:{'DP':0,'NC':0} for x in range(0,MAX+1)} - cur_chr = CHROM - - # Position - POS = int(elements[1]) - - # Reference allele - REF = elements[3] - - # Number of cell types with enough coverage - Cell_types_min_DP = int(elements[18]) - # Number of cell types with enough coverage - Cell_types_min_NC = int(elements[19]) - - - # Filter by the minimum number of cell types with enough coverage and number of cells - if (Cell_types_min_DP >= min_cell_types and Cell_types_min_NC >= min_cell_types): - - ### Site variant calling level info - FILTER = [] - - # Computation for each cell type - for cell_type_i in cell_types_idx.keys(): - cell_type = cell_types_idx[cell_type_i] - INFO_i = elements[cell_type_i] - - - if not INFO_i.startswith('NA'): - DP,NC,CC,BC,BQ,BCf,BCr = INFO_i.split('|') - - # Set a maximum to be stored in the callable table (dictionary so far) - if (int(DP) > MAX): - DP = MAX - else: - DP = int(DP) - if (int(NC) > MAX): - NC = MAX - else: - NC = int(NC) - - # Add values to dictionary - CHR_INFORMATIVE_POSITIONS[cell_type][CHROM][DP]['DP'] += 1 - CHR_INFORMATIVE_POSITIONS[cell_type][CHROM][NC]['NC'] += 1 - - return(CHR_INFORMATIVE_POSITIONS) - - - -def initialize_parser(): - parser = argparse.ArgumentParser(description='Script to calculate the number of callable sites per cell type') - parser.add_argument('--infile', type=str, help='Tsv file obtained by BaseCellCalling.step1.py to be analysed', required = True) - parser.add_argument('--outfile', type=str, help='Out file prefix', required = True) - parser.add_argument('--max_cov', type=int, default = 150, help='Maximum coverage to record in the callable sites table. Greater values will be collapsed to the provided one. [Default: 150]', required = False) - parser.add_argument('--min_cell_types', type=int, default = 2, help='Minimum number of cell types with enough coverage/cells at a site to be considered as a callable [Default: 2]', required = False) - return (parser) - -def main(): - - # 1. Arguments - parser = initialize_parser() - args = parser.parse_args() - - infile = args.infile - outfile = args.outfile - max_cov = args.max_cov - min_cell_types = args.min_cell_types - - # 1. Variant calling - print ('\n------------------------------') - print ('Getting callable sites') - print ('------------------------------\n') - - # Getting callable sites - start = timeit.default_timer() - informative_positions = callable_sites(infile,min_cell_types,max_cov) - - # Coverage report - outfile_report_all = outfile + '.coverage_cell_count.report.tsv' - outfile_report_all_chrom = outfile + '.coverage_cell_count.per_chromosome.report.tsv' - - - # Coverage per chromosome - out1 = open(outfile_report_all_chrom,'w') - Header1=['Cell_types','CHROM','Cov', 'DP','NC'] - out1.write('\t'.join(Header1)+'\n') - for cell_type in informative_positions.keys(): - for chrom in informative_positions[cell_type].keys(): - for COV in informative_positions[cell_type][chrom]: - DP = informative_positions[cell_type][chrom][COV]['DP'] - NC = informative_positions[cell_type][chrom][COV]['NC'] - LINE = [str(cell_type),str(chrom),str(COV),str(DP),str(NC)] - LINE = '\t'.join(LINE) - out1.write(LINE+'\n') - out1.close() - - - # Coverage all chromosomes collapsed - DF = pd.read_csv(outfile_report_all_chrom, delimiter = '\t') - DFtotal = DF.groupby(['Cell_types','Cov']).agg(DP = ('DP',sum), NC = ('NC',sum)).reset_index() - DFtotal.to_csv(outfile_report_all, index=False, sep = '\t') - - -#------------------------- -# Running scRNA somatic variant calling -#------------------------- -if __name__ == '__main__': - start = timeit.default_timer() - main() - stop = timeit.default_timer() - print ('\nTotal computing time: ' + str(round(stop - start,2)) + ' seconds') diff --git a/tools/SComatic/scripts/HighConfidenceCancerVariants/HighConfidenceCancerVariants.py b/tools/SComatic/scripts/HighConfidenceCancerVariants/HighConfidenceCancerVariants.py deleted file mode 100644 index 5b2fe1b..0000000 --- a/tools/SComatic/scripts/HighConfidenceCancerVariants/HighConfidenceCancerVariants.py +++ /dev/null @@ -1,173 +0,0 @@ -#!/usr/bin/python - -import timeit -import argparse -import pandas as pd - -def variant_calling_step3(file,outfile,filtered_outfile,min_dp,deltaVAF,deltaCCF,cancer,noncancer): - - #--- - # Command to focus only on high confidence cancer variants (HCCV) - #--- - - # Save comment lines - f2 = open(outfile,'w') - f3 = open(filtered_outfile,'w') - with open(file, 'r') as f: - for line in f: - if line.startswith('#'): - # Extract output file column names - if '#CHROM' in line: - line = line.rstrip('\n') - output_column_names = line.split('\t') - else: - f2.write(line) - f3.write(line) - else: - break - f2.write('##INFO=HCCV_FILTER,Description=Filter status of the variant site for cell reannotation (high-confidence cancer variants)\n') - f3.write('##INFO=HCCV_FILTER,Description=Filter status of the variant site for cell reannotation (high-confidence cancer variants)\n') - f2.close() - f3.close() - - input_df = pd.read_csv(file, sep='\t',comment='#',names=output_column_names) - - #Filtering homopolymeres, GnomAD/RNA editing/PON/clustered sites, - # multiallelic sites and sites with unsufficient # celltypes covered - input_df = input_df[~input_df['FILTER'].str.contains('Min|LR|gnomAD|LC|RNA|llel', regex=True)] - - #Filtering short-read PON, except in chrM - input_df = input_df[~((input_df['#CHROM']!='chrM') & (input_df['#CHROM']!='MT') & (input_df['FILTER'].str.contains('SR')))] - - #Filtering loci passing all filters in cancer cells - input_df = input_df[input_df['Cell_type_Filter'].str.contains('PASS')] - - #Delta VAF and CCF filtering - input_df['HCCV_FILTER'] = input_df.apply(lambda x: - HCCV_filtering(x['Cell_types'],x['Dp'],x['VAF'], x['CCF'],min_dp,deltaVAF,deltaCCF,cancer,x[noncancer]), axis=1) - - filtered_df = input_df.copy(deep=True)[input_df['HCCV_FILTER'] == 'PASS'] - - # Keeping only 1 position for clustered positions - filtered_df['INDEX'] = filtered_df['#CHROM'] + ':' + filtered_df['Start'].astype(str) + ':' + filtered_df['ALT'] - idx = filtered_df['INDEX'] - a=[] - for i in idx: - chr,pos,base=i.split(':') - a.append((chr,pos,base)) - - b = sorted(a, key=lambda x: (x[0],x[1])) - - keep,trash=[],[] - region = [0,0] - for (chr,pos,base), (chr2,pos2,base2) in zip(b, b[1:]): - if chr==chr2: - if chr == 'chrM' or chr == 'MT': - keep.append(':'.join([chr,pos,base])) - keep.append(':'.join([chr2,pos2,base2])) - elif abs(int(pos)-int(pos2))<10000: - print(region[1],int(pos)) - if region[1]==int(pos): - region = [int(pos),int(pos2)] - trash.append(':'.join([chr,pos,base])) - trash.append(':'.join([chr2,pos2,base2])) - else: - keep.append(':'.join([chr,pos,base])) - trash.append(':'.join([chr2,pos2,base2])) - region = [int(pos),int(pos2)] - else: - keep.append(':'.join([chr,pos,base])) - else: - keep.append(':'.join([chr,pos,base])) - keep.append(':'.join([chr2,pos2,base2])) - keep = [ i for i in keep if i not in trash] - filtered_df = filtered_df[filtered_df['INDEX'].isin(keep)] - - # Write outputs - input_df.to_csv(outfile, sep='\t', index=False, mode='a') - filtered_df.to_csv(filtered_outfile, sep='\t', index=False, mode='a') - -def HCCV_filtering(CTYPES,DP,VAF,CCF,min_dp,deltaVAFmin,deltaCCFmin, cancer, NonCancerInfo): - ctypes = CTYPES.split(',') - #If only 1 cell type is called - if len(ctypes)==1 and ctypes[0]==cancer: - if float(VAF) >= 0.1 and float(CCF) >= 0.2: - DP2 = NonCancerInfo.split('|')[0] - if int(DP)1: - DP1,DP2=DP.split(',') - #Only look at higher depth loci - if int(DP1)0.2: - return 'HeterozygSite' - # HCCF are variants with high VAF in cancer and low VAF in non-cancer cells - elif deltaCCF < deltaCCFmin: - return 'LowDeltaCCF' - else: - return 'PASS' - #Same as above, but with inversed cell types. - else: - if VAF1>0.2: - return 'HeterozygSite' - elif -deltaCCF < deltaCCFmin: - return 'LowDeltaCCF' - else: - return 'PASS' - else: - return 'NonCancer' - -def initialize_parser(): - parser = argparse.ArgumentParser(description='Script to perform the scRNA somatic variant calling') - parser.add_argument('--infile', type=str, help='Input file with all samples merged in a single tsv', required = True) - parser.add_argument('--outfile', type=str, help='Out file prefix', required = True) - parser.add_argument('--min_dp', type=float, default = 20, help='Minimum depth in both celltypes to call a HCCV', required = True) - parser.add_argument('--deltaVAF', type=float, default = 0.3, help='Delta VAF between cancer and non-cancer cells', required = True) - parser.add_argument('--deltaCCF', type=float, default = 0.3, help='Delta CCF (cancer cell fraction) between cancer and non-cancer cells', required = True) - parser.add_argument('--cancer_ctype', type=str, default = '', help='Name of cancer cell type in meta file', required = False) - parser.add_argument('--noncancer_ctype', type=str, default = '', help='Name of noncancer cell type in meta file', required = False) - return (parser) - -def main(): - - # 1. Arguments - parser = initialize_parser() - args = parser.parse_args() - - infile = args.infile - outfile = args.outfile - min_dp = args.min_dp - deltaVAF = args.deltaVAF - deltaCCF = args.deltaCCF - cancer_ctype = args.cancer_ctype - noncancer_ctype = args.noncancer_ctype - - # 1.2: Step 2: Add distance, editing and PoN filters - print ('\n- High Confidence Cancer Variants calling\n') - outfile3 = outfile + '.HCCV.unfiltered.tsv' - filtered_outfile = outfile + '.HCCV.tsv' - variant_calling_step3(infile,outfile3,filtered_outfile,min_dp,deltaVAF,deltaCCF,cancer_ctype,noncancer_ctype) - -#------------------------- -# Running scRNA somatic variant calling -#------------------------- -if __name__ == '__main__': - start = timeit.default_timer() - main() - stop = timeit.default_timer() - print ('\nTotal computing time: ' + str(round(stop - start,2)) + ' seconds') - - diff --git a/tools/SComatic/scripts/SingleCellGenotype/SingleCellGenotypeNoFusion.py b/tools/SComatic/scripts/SingleCellGenotype/SingleCellGenotypeNoFusion.py deleted file mode 100644 index 68052a3..0000000 --- a/tools/SComatic/scripts/SingleCellGenotype/SingleCellGenotypeNoFusion.py +++ /dev/null @@ -1,451 +0,0 @@ -import pysam -import timeit -import multiprocessing as mp -import argparse -import pandas as pd -import glob -import os -import math -import numpy as np -from natsort import natsorted -from scipy.stats import betabinom - -def collect_result(results): - pass - -def BaseCount(LIST, REF_BASE): - Bases=['A','C','T','G','N'] - - # Dictinary with base counts - NUCLEOTIDES = { 'A': 0, 'C': 0, 'G': 0, 'T': 0, 'I': 0, 'D' : 0, 'N': 0, 'R' : 0, 'Other' : 0} - - # Update dictionary counts - for x in LIST: - if x.upper() in Bases: - NUCLEOTIDES[x.upper()] += 1 - elif '-' in x: - NUCLEOTIDES['D'] += 1 - elif '+' in x: - NUCLEOTIDES['I'] += 1 - else: - NUCLEOTIDES['Other'] += 1 - - # Calculate Alternative count of the most frequent alternative allele - Alternative = ['A','C','T','G','I', 'D'] - - ALTERNATIVE = {k:v for k,v in NUCLEOTIDES.items() if k != REF_BASE.upper() and k in Alternative} - - AC = max(ALTERNATIVE.items(), key=lambda x: x[1])[1] - - if (AC > 0): - listOfKeys = list() - # Iterate over all the items in dictionary to find keys with max value - for key, value in ALTERNATIVE.items(): - if value == AC: - listOfKeys.append(key) - - MAX_ALT = ','.join(sorted(listOfKeys)) - else: - MAX_ALT = '.' - - return ([NUCLEOTIDES,MAX_ALT,AC]) - -#@profile -def EasyReadPileup(LIST, REF_BASE): - Bases = set(['A','C','T','G','N']) - - AC = 0 - - # Create a new list based on pileup reading - NEW_LIST = [] - for x in LIST: - LEN = len(x) - UPPER = x.upper() - if UPPER in Bases: - NEW_LIST.append(UPPER) - # Check for Alt counts - if (UPPER != REF_BASE): - AC = AC + 1 - elif LEN > 1 and x[1] == '-': - D = 'D' - NEW_LIST.append(D) - AC = AC + 1 - elif LEN > 1 and x[1] == '+': - I = 'I' - NEW_LIST.append(I) - AC = AC + 1 - elif x == '*': - NEW_LIST.append('O') - else: - NEW_LIST.append('NA') - - return (NEW_LIST,AC) - -def run_interval(code,var_dict,meta_dict,BAM,FASTA,tmp_dir,BQ,MQ,ALT_FLAG,alpha2,beta2,pval,chrm_conta): - # List of positions - interval = var_dict[code] - - # Create a dictionary with the positions to analyse - CHROM = [] - Target_sites = {} - for group in interval: - group = group.split("\t") - - # Chrom - # Extract as well info from windows - CHROM.append(group[0]) - - # Position - # Consider that pysam uses 0-based coordinates (REAL_POS - 1) - pos_temp = int(group[1]) - pos_temp = pos_temp - 1 - - # Fill sub-dictionary with variant information - REF = group[3] - ALT = group[4].split(',')[0] - CELL_TYPE_EXPECTED = 'OSEF' - NUM_CELLS_WITH_MUTATION_EXPECTED = 0 - - Target_sites[pos_temp] = [REF,ALT,CELL_TYPE_EXPECTED,NUM_CELLS_WITH_MUTATION_EXPECTED] - - # Coordinates to analyse - List_of_target_sites = set(Target_sites.keys()) - CHROM = CHROM[0] - START = min(List_of_target_sites) - 1 - END = max(List_of_target_sites) + 1 - if CHROM == 'chrM': - print(START) - # Temporary out file - ID = '_'.join([str(CHROM), str(min(List_of_target_sites)), str(max(List_of_target_sites))]) - out_temp = tmp_dir + '/' + ID + '.SingleCellCounts.temp' - outfile = open(out_temp,'w') - - # Get pileup read counts from coordinates - bam = pysam.AlignmentFile(BAM) - i = bam.pileup(CHROM, START, END, min_base_quality = BQ, min_mapping_quality = MQ, ignore_overlaps = False, max_depth = 200000) - - # Load reference file. Mandatory to be done inside function to avoid overlap problems during multiprocessing - inFasta = pysam.FastaFile(FASTA) - - #Cell were the barcode is found - CELLS = {pos:{barcode:{'Dp':0,'Alt':0} for barcode in meta_dict.keys()} for pos in List_of_target_sites } - - # Run it for each position in pileup - for p in i: - POS=p.pos - if POS in List_of_target_sites: - # Expected values. Variants to check - Ref_exp,Alt_exp,Cell_type_exp,Num_cells_exp = Target_sites[POS] - - # Get reference base from fasta file - ref_base = inFasta.fetch(CHROM, POS, POS+1) - - # Get pileup info - READS = p.get_query_names() - QUALITIES = p. get_query_qualities() - PILEUP_LIST = p.get_query_sequences(mark_matches=True, add_indels=True) - NEW_PILEUP_LIST,AC = EasyReadPileup(PILEUP_LIST, ref_base) - - # Expected bases - if (ALT_FLAG == 'All'): - Bases = set(['A','C','T','G','I','D','N']) - idx_reads = [x for x in range(0,len(NEW_PILEUP_LIST)) if NEW_PILEUP_LIST[x] in Bases] - else: - idx_reads = [x for x in range(0,len(NEW_PILEUP_LIST)) if NEW_PILEUP_LIST[x] == Alt_exp] - - # Get reads info - reads = p.pileups - - for read_i in idx_reads: - - read_alignment = reads[read_i].alignment - - # Alt observed in the read - Alt_obs = NEW_PILEUP_LIST[read_i] - try: - barcode = read_alignment.opt("CB") - # Fix barcode - barcode = barcode.split("-")[0] - ctype = meta_dict[barcode] - except: - continue - - # Remove low quality reads: Seconday alignments, duplicate reads, supplementary alignments - if read_alignment.is_secondary == False and read_alignment.is_duplicate == False and read_alignment.is_supplementary == False: - #compute depth and #alt reads per cell - if str(Alt_obs) == str(Alt_exp): - CELLS[POS][barcode]['Dp']+=1 - CELLS[POS][barcode]['Alt']+=1 - else: - CELLS[POS][barcode]['Dp']+=1 - #Compute VAF and establish if the locus is mutated in each cell with coverage - for POS in CELLS.keys(): - for bc in CELLS[POS].keys(): - try: - CTYPE = meta_dict[bc] - except: - continue - #Initialize values - DP = CELLS[POS][bc]['Dp'] - ALT = CELLS[POS][bc]['Alt'] - VAF = '.' - BETABIN = '.' - MUTATED = 'NoCoverage' - if DP > 0: - VAF = round(ALT/DP,4) - if ALT > 0: - #Special case for chrM due to contaminants: - if chrm_conta == 'True' and str(CHROM) == 'chrM': - if VAF < 0.3: - MUTATED = 'LowVAFChrM' - else: - MUTATED = 'PASS' - #All non-chrM chromosomes - else: - BETABIN = round(betabinom.sf(ALT-0.001, DP, alpha2, beta2),4) - #test if the betabin p-value is significant - if BETABIN < pval: - MUTATED = 'PASS' - else: - MUTATED = 'BetaBin_problem' - else: - MUTATED = 'NoAltReads' - - if MUTATED == "PASS": - BINARIZED = 1 - elif MUTATED == "NoCoverage": - BINARIZED = 3 - else: - BINARIZED = 0 - - Ref_exp,Alt_exp,Cell_type_exp,Num_cells_exp = Target_sites[POS] - INDEX = str(CHROM) + ':' + str(POS+1) + ':' + Alt_exp.split(',')[0] - Group = [str(CHROM), str(POS+1),str(POS+1),Ref_exp,Alt_exp,str(Cell_type_exp),str(Num_cells_exp),bc,CTYPE,str(DP),str(ALT),str(VAF),str(BETABIN),str(MUTATED),str(BINARIZED),INDEX] - Group = '\t'.join(Group) + '\n' - outfile.write(Group) - - inFasta.close() - bam.close() - outfile.close() - -def meta_to_dict(meta_file,tissue): - metadata = pd.read_csv(meta_file, delimiter = "\t") - metadata['Index'] = [i.split("-")[0] for i in list(metadata['Index'])] - # Clean index column - metadata['Index_clean'] = metadata['Index'].str.replace('-.*$','',regex=True) - - # If tissue provided, append tissue id to cell type to be recognised in downstream analysis - if tissue == None: - metadata['Cell_type_clean'] = metadata['Cell_type'].str.replace(' ','_',regex=True) - else: - tissue = tissue.replace(" ", "_") - metadata['Cell_type_clean'] = metadata['Cell_type'].str.replace(' ','_',regex=True) - metadata['Cell_type_clean'] = str(tissue) + '__' + metadata['Cell_type_clean'].astype(str) - - # Create dicitionary with cell types and cell barcodes - DICT = metadata.set_index('Index_clean')['Cell_type_clean'].to_dict() - ALL_CELL_TYPES = metadata['Cell_type_clean'].unique() - - del metadata - - return(DICT, ALL_CELL_TYPES) - - -def build_dict_variants(variant_file,window): - DICT_variants = {} - with open(variant_file, 'r') as f: - for line in f: - if not line.startswith('#') and not line.startswith('Chr'): - line = line.rstrip('\n') - elements = line.split('\t') - - # Chromosome - CHROM = elements[0] - - # Position - POS = int(elements[1]) - WIND = math.floor(POS / float(window)) - - CODE = CHROM + '_' + str(WIND) - if not CODE in DICT_variants.keys(): - DICT_variants[CODE] = [line] - else: - DICT_variants[CODE].append(line) - return (DICT_variants) - - -def concatenate_sort_temp_files_and_write(out_prefix, tmp_dir): - # Get the file paths - all_files = glob.glob(tmp_dir + '/*.SingleCellCounts.temp') - - # Load as panda files - if (len(all_files) > 0): - - ## Organize files in dictionaries of chromosomes and starts - Dictionary_of_files = {} - for filename in all_files: - basename = os.path.basename(filename) - - # It can consider not standard chromosome nomenglatures (with dots) - coordinates = basename.replace('.SingleCellCounts.temp','') - - CHROM, START, END = coordinates.rsplit("_",2) - - START = int(START) - - if (CHROM not in Dictionary_of_files): - Dictionary_of_files[CHROM] = {} - Dictionary_of_files[CHROM][START] = filename - else: - Dictionary_of_files[CHROM][START] = filename - - - ## Write in the final output file - out = open(out_prefix+'.SingleCellGenotype.tsv','w') - Header=['#CHROM','Start', 'End', 'REF', 'ALT_expected', 'Cell_type_expected','Num_cells_expected','CB','Cell_type_observed','Dp','ALT','VAF','BetaBin','MutationStatus','BinMutationStatus','INDEX'] - out.write('\t'.join(Header)+'\n') - - # Move thrugh filenanes by sorted coordinates - for chrom in sorted(Dictionary_of_files.keys()): - for start in sorted(Dictionary_of_files[chrom].keys()): - filename = Dictionary_of_files[chrom][start] - - with open(filename, 'r') as f: - out.write(f.read()) - - # Remove temp file - os.remove(filename) - out.close() - - - else: - # If temp files not found, print message - print ('No temporary files found') - - -def sort_chr_index(df): - renamed = [i.replace('chrM','chrZ') for i in df.index] - df.index = renamed - df = df.reindex(natsorted(df.index)) - renamed = [i.replace('chrZ','chrM').replace('zzz:','') for i in df.index] - df.index = renamed - return df - - -def pivot_long_dataframe(out_prefix): - #Pivot long SingleCellGenotype file into wide cell-SNV matrixes - long_df = pd.read_csv(out_prefix + '.SingleCellGenotype.tsv', sep = '\t') - - # Cell-SNV matrix with reads depth as values - Dp_df = long_df.pivot(index='INDEX', columns='CB', values='Dp') - Dp_df = sort_chr_index(Dp_df) - Dp_df.to_csv(out_prefix + '.DpMatrix.tsv', sep='\t', index=True) - - # Cell-SNV matrix with raw ALT reads counts as values - Alt_df = long_df.pivot(index='INDEX', columns='CB', values='ALT') - Alt_df = sort_chr_index(Alt_df) - Alt_df.to_csv(out_prefix + '.AltMatrix.tsv', sep='\t', index=True) - - # Cell-SNV matrix with VAFs as values - VAF_df = long_df.pivot(index='INDEX', columns='CB', values='VAF') - VAF_df = sort_chr_index(VAF_df) - VAF_df.to_csv(out_prefix + '.VAFMatrix.tsv', sep='\t', index=True) - - # Cell-SNV matrix with binarized mutation status - # (1: mutated, 0: non-mutated, 3: no coverage) - Bin_df = long_df.pivot(index='INDEX', columns='CB', values='BinMutationStatus') - Bin_df = sort_chr_index(Bin_df) - Bin_df.to_csv(out_prefix + '.BinaryMatrix.tsv', sep='\t', index=True) - -def initialize_parser(): - parser = argparse.ArgumentParser(description='Script to get the alleles observed in each unique cell for the variant sites') - parser.add_argument('--bam', type=str, default=1, help='Tumor bam file to be analysed', required = True) - parser.add_argument('--infile', type=str, default=1, help='Base calling file (obtained by BaseCellCalling.step2.py), ideally only the PASS variants', required = True) - parser.add_argument('--ref', type=str, default=1, help='Reference genome. *fai must be available in the same folder as reference', required = True) - parser.add_argument('--meta', type=str, default=1, help='Metadata with cell barcodes per cell type', required = True) - parser.add_argument('--outfile', default = 'Matrix.tsv', help='Out file', required = False) - parser.add_argument('--alt_flag', default = 'All',choices = ['Alt','All'], help='Flag to search for cells carrying the expected alt variant (Alt) or all cells independent of the alt allele observed (All)', required = False) - parser.add_argument('--nprocs',default = 1, help='Number of processes [Default = 1]',required=False,type = int) - parser.add_argument('--bin', type=int, default=50000, help='Bin size for running the analysis [Default 50000]', required = False) - parser.add_argument('--min_bq', type=int, default = 30, help='Minimum base quality permited for the base counts. Default = 30', required = False) - parser.add_argument('--min_mq', type=int, default = 255, help='Minimum mapping quality required to analyse read. Default = 255', required = False) - parser.add_argument('--tissue', type=str, default=None, help='Tissue of the sample', required = False) - parser.add_argument('--tmp_dir', type=str, default = 'tmpDir', help='Temporary folder for tmp files', required = False) - parser.add_argument('--alpha2', type=float, default = 0.260288007167716, help='Alpha parameter for Beta-binomial distribution of read counts. [Default: 0.260288007167716]', required = False) - parser.add_argument('--beta2', type=float, default = 173.94711910763732, help='Beta parameter for Beta-binomial distribution of read counts. [Default: 173.94711910763732]', required = False) - parser.add_argument('--pvalue', type=float, default = 0.01, help='P-value for the beta-binomial test to be significant', required = False) - parser.add_argument('--chrM_contaminant', type=str, default = 'True', help='Use this option if chrM contaminants are observed in non-cancer cells', required = False) - return (parser) - -def main(): - - # 0. Arguments - parser = initialize_parser() - args = parser.parse_args() - - BAM = args.bam - FASTA = args.ref - CORE = args.nprocs - out_prefix = args.outfile - window = args.bin - tmp_dir = args.tmp_dir - meta_file = args.meta - variant_file = args.infile - tissue = args.tissue - BQ = args.min_bq - MQ = args.min_mq - ALT_FLAG = args.alt_flag - alpha2 = args.alpha2 - beta2 = args.beta2 - pval = args.pvalue - chrM_conta = args.chrM_contaminant - - # Set outfile name - print("Outfile prefix: " , out_prefix , "\n") - - # 1. Check if temp dir is created - if (tmp_dir != '.'): - try: - # Create target Directory - os.mkdir(tmp_dir) - print("Directory " , tmp_dir , " created\n") - except FileExistsError: - print("Directory " , tmp_dir , " already exists\n") - else: - print("Not temp directory specified, using working directory as temp") - - # 2. Create dictionaries with variants and cell barcodes - META_DICT, ALL_CELL_TYPES = meta_to_dict(meta_file,tissue) - DICT_VARIANTS = build_dict_variants(variant_file,window) - - # 3. Code to run in parallel all bins - if (CORE > 1): - pool = mp.Pool(CORE) - - # Step 3.1: Use loop to parallelize - for row in DICT_VARIANTS.keys(): - # This funtion writes in temp files the results - pool.apply_async(run_interval, args=(row,DICT_VARIANTS,META_DICT,BAM,FASTA,tmp_dir,BQ,MQ,ALT_FLAG,alpha2,beta2,pval,chrM_conta), callback=collect_result) - - # Step 3.2: Close Pool and let all the processes complete - pool.close() - pool.join() - else: - for row in DICT_VARIANTS.keys(): - # This funtion writes in temp files the results - collect_result(run_interval(row,DICT_VARIANTS,META_DICT,BAM,FASTA,tmp_dir,BQ,MQ,ALT_FLAG,alpha2,beta2,pval,chrM_conta)) - - # 4. Write "Long" file - concatenate_sort_temp_files_and_write(out_prefix, tmp_dir) - - # 5. Write matrixes ("wide") DP/ALT/VAF/Binarized files - pivot_long_dataframe(out_prefix) - -if __name__ == '__main__': - start = timeit.default_timer() - main() - stop = timeit.default_timer() - print ("Computation time: " + str(round(stop - start)) + ' seconds') - - - diff --git a/tools/SComatic/scripts/SitesPerCell/SitesPerCell.py b/tools/SComatic/scripts/SitesPerCell/SitesPerCell.py deleted file mode 100644 index 3161e66..0000000 --- a/tools/SComatic/scripts/SitesPerCell/SitesPerCell.py +++ /dev/null @@ -1,373 +0,0 @@ -import pysam -import timeit -import pybedtools -import multiprocessing as mp -import argparse -import pandas as pd -import glob -import os -import sys -import time -import subprocess -import shutil -import numpy as np -from collections import Counter - -def collect_result(result): - if (result[1] != ''): - VARIANTS = result[1] - OUT = result[0] - np.save(OUT,VARIANTS) - -def concatenate_sort_temp_files_and_write(out_file, tmp_dir, ID): - # Get the file paths - all_files = glob.glob(tmp_dir + '/*.SitesPerCell.npy') - pattern=tmp_dir + '/*.SitesPerCell.npy' - - final_dict = dict() - for npy in all_files: - a = np.load(npy,allow_pickle=True) - b = a.item() - - # Collapse and sum all cell counts - for cb in b.keys(): - c = b[cb] - final_dict[cb] = final_dict.get(cb, 0) + c - - # Remove file - os.remove(npy) - - # Transform to data frame - df = pd.DataFrame.from_dict(final_dict, orient='index') - df.index.name = 'CB' - df.reset_index(inplace=True) - df.rename({0: 'SitesPerCell'}, axis=1, inplace=True) - df.to_csv(out_file, index=False) - - -def MakeWindows(bed, window): - ## Makewindows in bed file based on the window sizes specified as argument - a = pybedtools.BedTool(bed) - final_bed = a.window_maker(a,w=window) - - return(final_bed) - -def BaseCount(LIST, REF_BASE): - Bases=['A','C','T','G','N'] - - # Dictinary with base counts - NUCLEOTIDES = { 'A': 0, 'C': 0, 'G': 0, 'T': 0, 'I': 0, 'D' : 0, 'N': 0, 'R' : 0, 'Other' : 0} - - # Update dictionary counts - for x in LIST: - if x.upper() in Bases: - NUCLEOTIDES[x.upper()] += 1 - elif '-' in x: - NUCLEOTIDES['D'] += 1 - elif '+' in x: - NUCLEOTIDES['I'] += 1 - else: - NUCLEOTIDES['Other'] += 1 - - # Calculate Alternative count of the most frequent alternative allele - Alternative = ['A','C','T','G','I', 'D'] - - ALTERNATIVE = {k:v for k,v in NUCLEOTIDES.items() if k != REF_BASE.upper() and k in Alternative} - - AC = max(ALTERNATIVE.items(), key=lambda x: x[1])[1] - - if (AC > 0): - listOfKeys = list() - # Iterate over all the items in dictionary to find keys with max value - for key, value in ALTERNATIVE.items(): - if value == AC: - listOfKeys.append(key) - - MAX_ALT = ','.join(sorted(listOfKeys)) - else: - MAX_ALT = '.' - - return ([NUCLEOTIDES,MAX_ALT,AC]) - -def EasyReadPileup(LIST, REF_BASE): - Bases = set(['A','C','T','G','N']) - - AC = 0 - - # Create a new list based on pileup reading - NEW_LIST = [] - for x in LIST: - LEN = len(x) - UPPER = x.upper() - if UPPER in Bases: - NEW_LIST.append(UPPER) - # Check for Alt counts - if (UPPER != REF_BASE): - AC = AC + 1 - elif LEN > 1 and x[1] == '-': - D = 'D' - NEW_LIST.append(D) - AC = AC + 1 - elif LEN > 1 and x[1] == '+': - I = 'I' - NEW_LIST.append(I) - AC = AC + 1 - elif x == '*': - NEW_LIST.append('O') - else: - NEW_LIST.append('NA') - - return (NEW_LIST,AC) - -def run_interval(interval,sites,BAM, FASTA, MIN_COV, MIN_CC, tmp_dir, BQ, MQ): - - # Coordinates to analyse - CHROM,START,END = interval.split("_") - START = int(START)-1 - END = int(END)+1 - - # Get pileup read counts from coordinates - bam = pysam.AlignmentFile(BAM) - i = bam.pileup(CHROM, START, END, min_base_quality = BQ, min_mapping_quality = MQ, ignore_overlaps = False) - - # Load reference file. Mandatory to be done inside function to avoid overlap problems during multiprocessing - inFasta = pysam.FastaFile(FASTA) - - # Run it for each position in pileup - CELLS_all = [] - for p in i: - POS=p.pos - - if POS in sites: - # Get reference base from fasta file - ref_base = inFasta.fetch(CHROM, POS, POS+1) - - # Get coverage - DP = p.get_num_aligned() - - CELLS = [] - - # Run only if coverage is more than minimum (arg parameter) - if (DP >= MIN_COV and ref_base.upper() != 'N'): - - # Get pileup info - READS = p.get_query_names() - QUALITIES = p. get_query_qualities() - PILEUP_LIST = p.get_query_sequences(mark_matches=True, add_indels=True) - NEW_PILEUP_LIST,AC = EasyReadPileup(PILEUP_LIST, ref_base) - - # Get reads info - reads = p.pileups - - # Dictionary counters - BASE_COUNTS = ['A', 'C', 'T','G', 'D', 'I'] - - # Check this base in each read - count = 0 - for read_i in range(0, len(READS)): - - read_alignment = reads[read_i].alignment - - try: - barcode = read_alignment.opt("CB") - except: - continue - - # Fix barcode - barcode = barcode.split("-")[0] - - # Remove low quality reads: Seconday alignments, duplicate reads, supplementary alignments - if read_alignment.is_secondary == False and read_alignment.is_duplicate == False and read_alignment.is_supplementary == False: - - # Get exact base - base = NEW_PILEUP_LIST[read_i] - - if base in BASE_COUNTS: # Usually, sites between exones (intronic sites) are marked as > or < . Therefore, we want to ignore them - - # Append cell barcode to check total number of cells - CELLS.append(barcode) - - count=count+1 - - if (count >= MIN_COV): - CELLS = list(set(CELLS)) - - - # Pysam provides a 0-based coordinate. We must sum 1 to this value - CHROM = str(CHROM) - POS_print = str(POS + 1) - REF = str(ref_base) - - DP = str(count) - NC = len(set(CELLS)) - - if (len(CELLS) >= MIN_CC): - # Save results in list of positions - CELLS_all.extend(CELLS) - - inFasta.close() - bam.close() - - counts = dict() - for i in CELLS_all: - counts[i] = counts.get(i, 0) + 1 - - # Return list of positions - ID = '_'.join([str(CHROM), str(START), str(END)]) - out_temp = tmp_dir + '/' + ID + '.SitesPerCell.npy' - return([out_temp,counts]) - - -def initialize_parser(): - parser = argparse.ArgumentParser(description='Script to calculate the number of callable sites per unique cell') - parser.add_argument('--bam', type=str, default=1, help='Tumor bam file to be analysed', required = True) - parser.add_argument('--ref', type=str, default=1, help='Reference genome. *fai must be available in the same folder as reference', required = True) - parser.add_argument('--infile', type=str, default='', help='Base calling file (obtained by BaseCellCalling.step1.py)', required = False) - parser.add_argument('--min_ct1', type=int, default = 2, help='Minimum number of cell types with enough reads to consider a genomic site. Default = 2', required = False) - parser.add_argument('--min_ct2', type=int, default = 2, help='Minimum number of cell types with enough unique cell counts to consider a genomic site. Default = 2', required = False) - parser.add_argument('--out_folder', default = '.', help='Out folder', required = False) - parser.add_argument('--id', help='Prefix of out file. If provided, please use next format: *.[cell_type] . Example: sample1.t_cell. If not provided, it is taken from bam file', required = False) - parser.add_argument('--nprocs',default = 1, help='Number of processes [Default = 1]',required=False,type = int) - parser.add_argument('--bin', type=int, default=50000, help='Bin size for running the analysis [Default 50000]', required = False) - parser.add_argument('--min_dp', type=int, default = 5, help='Minimum coverage to consider the genomic site. Default = 5', required = False) - parser.add_argument('--min_cc', type=int, default = 5, help='Minimum number of cells required to consider a genomic site. Default = 5', required = False) - parser.add_argument('--min_bq', type=int, default = 20, help='Minimum base quality permited for the base counts. Default = 20', required = False) - parser.add_argument('--min_mq', type=int, default = 255, help='Minimum mapping quality required to analyse read. Default = 255', required = False) - parser.add_argument('--tmp_dir', type=str, default = '.', help='Temporary folder for tmp files', required = False) - - return (parser) - -def main(): - - - # 0. Arguments - parser = initialize_parser() - args = parser.parse_args() - - BAM = args.bam - FASTA = args.ref - CORE = args.nprocs - OUTF = args.out_folder - ID = args.id - BIN = args.bin - in_tsv = args.infile - MIN_CT1 = args.min_ct1 - MIN_CT2 = args.min_ct2 - MIN_COV = args.min_dp - MIN_CC = args.min_cc - MIN_BQ = args.min_bq - MIN_MQ = args.min_mq - tmp_dir = args.tmp_dir - - # 0. If out file is empty, take the name of the bam file - if (ID == None): - ID = os.path.basename(BAM) - ID = ID.replace(".bam","") - - # Set outfile name - out_file = OUTF + '/' + str(ID) + ".SitesPerCell.tsv" - print("Outfile: " , out_file , "\n") - - # 1. Check if temp dir is created - if (tmp_dir != '.'): - try: - # Create target Directory - os.mkdir(tmp_dir) - print("Directory " , tmp_dir , " created\n") - except FileExistsError: - print("Directory " , tmp_dir , " already exists\n") - else: - print("Not temp directory specified, using working directory as temp") - - - - # Temp file and folder - temp = tmp_dir + '/target.bed' - - # 1. Concatenate and create temp bed file - print ('-----------------------------------------------------------') - print ('1. Preparing temp file...') - print ('-----------------------------------------------------------\n') - - command = "grep -v '^#' %s | grep -v 'chrM' | awk -F'\\t' -v OFS='\\t' '{if ($19 >= %s && $20 >= %s) {print $1,$3}}'| sort -T %s -k1,1 -k2,2n > %s" % (in_tsv,MIN_CT1, MIN_CT2,tmp_dir,temp) - - # Submit linux command - try: - subprocess.run(command, shell=True) - except subprocess.CalledProcessError as error: - print(error) - - # Split the in_tsv file in windows - DICT_sites = {} - cur_chr = '' - cur_pos = 0 - count = 0 - with open(temp,'r') as tmp: - for line in tmp: - line = line.rstrip('\n') - CHROM, POS = line.split("\t") - POS = int(POS) - - if (cur_chr == ''): - cur_chr = CHROM - cur_pos = POS - LIST=[] - - DIFF = POS-cur_pos - - if CHROM == cur_chr and DIFF < 10000 and count < 1000: - LIST.append(POS-1) # Necessary to subtract one for pysam coordinates - count = count + 1 - else: - ID=str(cur_chr) + '_' + str(cur_pos) + '_' + str(POS) - DICT_sites[ID] = set(LIST) - - # Re-start counting - cur_chr = CHROM - cur_pos = POS - count=0 - LIST= [POS-1] # Necessary to subtract one for pysam coordinates - - # Save results with last iteration in file - if (count > 0): - ID=str(cur_chr) + '_' + str(cur_pos) + '_' + str(POS) - DICT_sites[ID] = set(LIST) - - - - - # 2. Checking bam file - print ('-----------------------------------------------------------') - print ('2. Checking bam file...') - print ('-----------------------------------------------------------\n') - - if (CORE > 1): - pool = mp.Pool(CORE) - - # Step 3.1: Use loop to parallelize - for row in DICT_sites.keys(): - # This funtion writes in temp files the results - pool.apply_async(run_interval, args=(row, DICT_sites[row], BAM, FASTA, MIN_COV, MIN_CC, tmp_dir, MIN_BQ, MIN_MQ), callback=collect_result) - - # Step 3.2: Close Pool and let all the processes complete - pool.close() - pool.join() - else: - for row in DICT_sites.keys(): - # This funtion writes in temp files the results - collect_result(run_interval(row, DICT_sites[row], BAM, FASTA, MIN_COV, MIN_CC, tmp_dir, MIN_BQ, MIN_MQ)) - - - # 4. Write final file - concatenate_sort_temp_files_and_write(out_file, tmp_dir, ID) - - # Remove temp directory - shutil.rmtree(tmp_dir, ignore_errors=True) - -if __name__ == '__main__': - start = timeit.default_timer() - main() - stop = timeit.default_timer() - print ("Computation time: " + str(round(stop - start)) + ' seconds') - - diff --git a/tools/SComatic/scripts/TrinucleotideBackground/TrinucleotideContextBackground.py b/tools/SComatic/scripts/TrinucleotideBackground/TrinucleotideContextBackground.py deleted file mode 100644 index f468b37..0000000 --- a/tools/SComatic/scripts/TrinucleotideBackground/TrinucleotideContextBackground.py +++ /dev/null @@ -1,124 +0,0 @@ -#!/usr/bin/python - -import timeit -import os -import argparse -import subprocess -import time -import itertools -import pandas as pd -import numpy as np - -def longestRun(s): - if len(s) == 0: return 0 - runs = ''.join('*' if x == y else ' ' for x,y in zip(s,s[1:])) - starStrings = runs.split() - if len(starStrings) == 0: return 1 - return 1 + max(len(stars) for stars in starStrings) - -def trincucleotide_geneartor(l): - yield from itertools.product(*([l] * 3)) - -def initialize_parser(): - parser = argparse.ArgumentParser(description='Script to obtain trincucleotide context background') - parser.add_argument('--in_tsv', type=str, help='File listing the tsv files to be used for the trinucleotide context background computation (files obtained in BaseCellCalling.step1.py)', required = True) - parser.add_argument('--out_file', type=str, help='Output file', required = True) - return (parser) - -def main(): - - # 0. Arguments - parser = initialize_parser() - args = parser.parse_args() - - in_tsv = args.in_tsv - outfile = args.out_file - - # Temp file and folder - temp = outfile + '.temp' - temp_folder=os.path.dirname(outfile) - if (temp_folder == ''): - temp_folder = "." - - # 1. Concatenate and create temp calling file - print ('-----------------------------------------------------------') - print ('1. Preparing temp file...') - print ('-----------------------------------------------------------\n') - - command = "for tsv in $(cat %s);do grep -v '^#\\|LC_' $tsv | awk -F'\\t' -v OFS='\\t' '{if ($19 >= 2 && $20 >= 2) {print $4,$8,$9}}'; done | sort -T %s | uniq -c | sed 's/^[ \\t]*//' | tr ' ' '\\t' > %s" % (in_tsv,temp_folder,temp) - # Submit linux command - try: - subprocess.run(command, shell=True) - except subprocess.CalledProcessError as error: - print(error) - - # 2. Checking trinucleotide context background - print ('-----------------------------------------------------------') - print ('2. Checking trinucleotide context background...') - print ('-----------------------------------------------------------\n') - - # Complementary bases - complemetary_dictionary = {'A':'T','T':'A','G':'C','C':'G'} - - # Expected bases - expected_bases = ['C','T'] - - # Generate all possible background reference trinucleotides - all_trinucleotides = {} - for x in trincucleotide_geneartor('ACTG'): - SEQ=''.join(x) - - if SEQ[1] in expected_bases: - all_trinucleotides[SEQ] = 0 - - # Checking and counting trinucleotides - with open(temp,'r') as t: - for line in t: - line = line.rstrip('\n') - COUNT, REF, UP, DOWN = line.split("\t") - COUNT = int(COUNT) - - print (line) - - try: - if (longestRun(UP) < 4 and longestRun(DOWN) < 4): - UP_base = UP[-1] - DOWN_base = DOWN[0] - - if (REF in 'ACTG' and UP_base in 'ACTG' and DOWN_base in 'ACTG'): - TRINUCLEOTIDE = [UP_base, REF, DOWN_base] - - if REF in expected_bases: - trincucleotide = ''.join(TRINUCLEOTIDE) - else: - TRINUCLEOTIDE2 = [complemetary_dictionary[x] for x in TRINUCLEOTIDE] - trincucleotide = ''.join(TRINUCLEOTIDE2) - - all_trinucleotides[trincucleotide] = all_trinucleotides[trincucleotide] + COUNT - except: - continue - - # Transform to data frame - df = pd.DataFrame.from_dict(all_trinucleotides, orient='index') - df.index.name = 'Trincucleotide' - df.reset_index(inplace=True) - SUM = (np.sum(df[0])) - df['Ratio'] = df[0] / float(SUM) - df.rename({0: 'Count'}, axis=1, inplace=True) - df.to_csv(outfile, index=False) - - # Remove temp file - os.remove(temp) - -# ------------------------- -# Main computation -# ------------------------- -if __name__ == '__main__': - start = timeit.default_timer() - main() - stop = timeit.default_timer() - print ('\nTotal computing time: ' + str(round(stop - start,2)) + ' seconds') - - - - diff --git a/workflow/Snakefile b/workflow/Snakefile new file mode 100644 index 0000000..e82fde5 --- /dev/null +++ b/workflow/Snakefile @@ -0,0 +1,29 @@ +# Define common functions and variables +include: 'rules/common.smk' + +#include: 'rules/PoN.smk' +include: 'rules/CellTypeReannotation.smk' +include: 'rules/SNVCalling.smk' +include: 'rules/FusionCalling.smk' +include: 'rules/CellClustering.smk' +#include: 'rules/InferCopyNumbers.smk' + +# Define results working directory +workdir: config['User']['output_dir'] + +rule LongSom_output: + input: + # Reannotated cell types + expand("CellTypeReannotation/ReannotatedCellTypes/{id}.tsv", id=IDS) if REANNO else [], + # Final SNV set + expand("SNVCalling/BaseCellCalling/{id}.calling.step3.tsv", id=IDS) if SCOMATIC else [], + # Fusions + expand("FusionCalling/Somatic/{id}.Fusions.tsv", id=IDS) if CTATFUSION else [], + # Cell-Variants matrix + expand("CellClustering/SingleCellGenotype/{id}.BinaryMatrix.tsv", id=IDS)if BNPC else [], + # Clonal reconstruction + expand("CellClustering/BnpC_output/{id}/genoCluster_posterior_mean_raw.pdf", id=IDS) if BNPC else [], + # CNVs + expand("InferCNV/InferCNV/{id}/infercnv.17_HMM_predHMMi6.leiden.hmm_mode-subclusters.png", id = IDS) if INFERCNV else [], + default_target: True + localrule: True \ No newline at end of file diff --git a/snakefiles/envs/BnpC.yml b/workflow/envs/BnpC.yaml similarity index 90% rename from snakefiles/envs/BnpC.yml rename to workflow/envs/BnpC.yaml index b2ce37c..559c2cf 100644 --- a/snakefiles/envs/BnpC.yml +++ b/workflow/envs/BnpC.yaml @@ -15,4 +15,4 @@ dependencies: - pip - pip: - pywaffle - - matplotlib-venn \ No newline at end of file + - matplotlib-venn diff --git a/snakefiles/envs/SComatic.yml b/workflow/envs/SComatic.yaml similarity index 100% rename from snakefiles/envs/SComatic.yml rename to workflow/envs/SComatic.yaml diff --git a/snakefiles/InferCopyNumbers.smk b/workflow/rules/CNACalling.smk similarity index 100% rename from snakefiles/InferCopyNumbers.smk rename to workflow/rules/CNACalling.smk diff --git a/workflow/rules/CellClustering.smk b/workflow/rules/CellClustering.smk new file mode 100644 index 0000000..8878404 --- /dev/null +++ b/workflow/rules/CellClustering.smk @@ -0,0 +1,176 @@ +### Rules for establishing the genotype of each cells (i.e. SNVs and fusions per cell) +### Then building a cell-variant matrix and clustering cells based on variants (using BnpC) +if PON: + rule SingleCellGenotype: + input: + tsv="SNVCalling/BaseCellCalling/{id}.calling.step3.tsv", + bam=f"{INPUT}/bam/{{id}}.bam", + barcodes="CellTypeReannotation/ReannotatedCellTypes/{id}.tsv", + bb="PoN/PoN/BetaBinEstimates.txt", + fusions="FusionCalling/Somatic/{id}.Fusions.SingleCellGenotype.tsv" if CTATFUSION else [], + ref=str(workflow.basedir)+config['Reference']['genome'], + output: + tsv="CellClustering/SingleCellGenotype/{id}.SingleCellGenotype.tsv", + dp="CellClustering/SingleCellGenotype/{id}.DpMatrix.tsv", + alt="CellClustering/SingleCellGenotype/{id}.AltMatrix.tsv", + vaf="CellClustering/SingleCellGenotype/{id}.VAFMatrix.tsv", + bin="CellClustering/SingleCellGenotype/{id}.BinaryMatrix.tsv", + tmp=temp(directory("CellClustering/SingleCellGenotype/{id}/")) + params: + script=str(workflow.basedir)+"/scripts/CellClustering/SingleCellGenotype.py", + alt_flag= config['CellClust']['SingleCellGenotype']['alt_flag'], + mapq=config['SNVCalling']['BaseCellCounter']['min_mapping_quality'], + alpha2 = lambda w, input: get_BetaBinEstimates(input.bb, 'alpha2'), + beta2 = lambda w, input: get_BetaBinEstimates(input.bb, 'beta2'), + pval = config['CellClust']['SingleCellGenotype']['pvalue'], + chrm_conta = config['SNVCallling']['BaseCellCalling']['chrM_contaminant'], + conda: + "../envs/SComatic.yaml" + threads: 32 + resources: + mem_mb_per_cpu=1024 + log: + "logs/SingleCellGenotype/{id}.log", + benchmark: + "benchmarks/SingleCellGenotype/{id}.benchmark.txt" + shell: + r""" + python {params.script} \ + --infile {input.tsv} \ + --outfile CellClustering/SingleCellGenotype/{wildcards.id} \ + --bam {input.bam} \ + --meta {input.barcodes} \ + --ref {input.ref} \ + --fusions {input.fusions} \ + --nprocs {threads} \ + --min_mq {params.mapq} \ + --pvalue {params.pval} \ + --alpha2 {params.alpha2} \ + --beta2 {params.beta2} \ + --alt_flag {params.alt_flag} \ + --chrM_contaminant {params.chrm_conta} \ + --tmp_dir {output.tmp} + """ +else: + rule SingleCellGenotype: + input: + tsv="SNVCalling/BaseCellCalling/{id}.calling.step3.tsv", + bam=f"{INPUT}/bam/{{id}}.bam", + barcodes="CellTypeReannotation/ReannotatedCellTypes/{id}.tsv", + fusions="FusionCalling/Somatic/{id}.Fusions.tsv" if CTATFUSION else [], + ref=str(workflow.basedir)+config['Reference']['genome'], + output: + tsv="CellClustering/SingleCellGenotype/{id}.SingleCellGenotype.tsv", + dp="CellClustering/SingleCellGenotype/{id}.DpMatrix.tsv", + alt="CellClustering/SingleCellGenotype/{id}.AltMatrix.tsv", + vaf="CellClustering/SingleCellGenotype/{id}.VAFMatrix.tsv", + bin="CellClustering/SingleCellGenotype/{id}.BinaryMatrix.tsv", + tmp=temp(directory("CellClustering/SingleCellGenotype/{id}/")) + params: + script=str(workflow.basedir)+"/scripts/CellClustering/SingleCellGenotype.py", + alt_flag= config['CellClust']['SingleCellGenotype']['alt_flag'], + mapq=config['SNVCalling']['BaseCellCounter']['min_mapping_quality'], + alpha2=config['SNVCalling']['BaseCellCalling']['alpha2'], + beta2=config['SNVCalling']['BaseCellCalling']['beta2'], + pval=config['CellClust']['SingleCellGenotype']['pvalue'], + chrm_conta=config['SNVCalling']['BaseCellCalling']['chrM_contaminant'], + conda: + "../envs/SComatic.yaml" + threads: 32 + resources: + mem_mb_per_cpu=1024 + log: + "logs/SingleCellGenotype/{id}.log", + benchmark: + "benchmarks/SingleCellGenotype/{id}.benchmark.txt" + shell: + r""" + python {params.script} \ + --infile {input.tsv} \ + --outfile CellClustering/SingleCellGenotype/{wildcards.id} \ + --bam {input.bam} \ + --meta {input.barcodes} \ + --ref {input.ref} \ + --fusions {input.fusions} \ + --nprocs {threads} \ + --min_mq {params.mapq} \ + --pvalue {params.pval} \ + --alpha2 {params.alpha2} \ + --beta2 {params.beta2} \ + --alt_flag {params.alt_flag} \ + --chrM_contaminant {params.chrm_conta} \ + --tmp_dir {output.tmp} + """ + +rule FormatInputBnpC: + input: + bin="CellClustering/SingleCellGenotype/{id}.BinaryMatrix.tsv", + vaf="CellClustering/SingleCellGenotype/{id}.VAFMatrix.tsv", + ctypes="CellTypeReannotation/ReannotatedCellTypes/{id}.tsv", + output: + bin="CellClustering/BnpC_input/{id}.BinaryMatrix.tsv", + vaf="CellClustering/BnpC_input/{id}.VAFMatrix.tsv", + ctypes="CellClustering/BnpC_input/{id}.Barcodes.tsv", + params: + script=str(workflow.basedir)+"/scripts/CellClustering/FormatInputBnpC.py", + min_cells=config['CellClust']['FormatInput']['min_cells_per_mut'], + min_cov=config['CellClust']['FormatInput']['min_pos_cov'] + conda: + "../envs/BnpC.yaml" + log: + "logs/FormatInputBnpC/{id}.log", + benchmark: + "benchmarks/FormatInputBnpC/{id}.benchmark.txt" + shell: + r""" + python {params.script} \ + --bin {input.bin} \ + --vaf {input.vaf} \ + --ctypes {input.ctypes} \ + --min_pos_cov {params.min_cov} \ + --min_cells_per_mut {params.min_cells} \ + --outfile CellClustering/BnpC_input//{wildcards.id} + """ + +rule BnpC_clustering: + input: + bin="CellClustering/BnpC_input/{id}.BinaryMatrix.tsv", + vaf="CellClustering/BnpC_input/{id}.VAFMatrix.tsv", + ctypes="CellClustering/BnpC_input/{id}.Barcodes.tsv", + output: + pdf="CellClustering/BnpC_output/{id}/genoCluster_posterior_mean_raw.pdf" + params: + script=str(workflow.basedir)+"/scripts/CellClustering/run_BnpC.py", + mcmc_steps = config['CellClust']['BnpC']['mcmc_steps'], + estimator = config['CellClust']['BnpC']['estimator'], + dpa = config['CellClust']['BnpC']['dpa'], + cup = config['CellClust']['BnpC']['cup'], + eup = config['CellClust']['BnpC']['eup'], + FP = config['CellClust']['BnpC']['FP'], + FN = config['CellClust']['BnpC']['FN'], + pp= config['CellClust']['BnpC']['pp'], + conda: + "../envs/BnpC.yaml" + threads: 16 + resources: + mem_mb_per_cpu=1024 + log: + "logs/BnpC/{id}.log", + benchmark: + "benchmarks/BnpC/{id}.benchmark.txt" + shell: + r""" + python {params.script} \ + {input.bin} \ + -n {threads} \ + -o CellClustering/BnpC_output/{wildcards.id} \ + -s {params.mcmc_steps} \ + -e {params.estimator} \ + -cup {params.cup} \ + -eup {params.eup} \ + -FP {params.FP} \ + -FN {params.FN} \ + -pp {params.pp} \ + -ap {params.dpa} \ + --ctypes {input.ctypes} + """ diff --git a/workflow/rules/CellTypeReannotation.smk b/workflow/rules/CellTypeReannotation.smk new file mode 100644 index 0000000..a25f19d --- /dev/null +++ b/workflow/rules/CellTypeReannotation.smk @@ -0,0 +1,456 @@ +### Rules for cell typeReannotation +### First detects SNVs and fusions +### Then defines the High Confidence Cancer variants (HCCV) +### Then cells areReannotated based on their SNV/fusion HCCV mutation status + + +### START PREPROCESSING ### +rule RenameCellTypes: + input: + barcodes=f"{INPUT}/barcodes/{{id}}.tsv" + output: + barcodes="Barcodes/{id}.tsv" + params: + script=str(workflow.basedir)+"/scripts/PreProcessing/RenameCellTypes.py", + cancer_cell_type=config['User']['cancer_cell_type'] + conda: + "../envs/SComatic.yaml" + log: + "logs/RenameCellTypes/{id}.log", + benchmark: + "benchmarks/RenameCellTypes/{id}.benchmark.txt" + shell: + r""" + python {params.script} \ + --input {input.barcodes} \ + --output {output.barcodes} \ + --cancer_cell_type {params.cancer_cell_type} + """ + +rule SplitBam_Reanno: + input: + bam=f"{INPUT}/bam/{{id}}.bam", + bai=f"{INPUT}/bam/{{id}}.bam.bai", + barcodes="Barcodes/{id}.tsv" + output: + expand("CellTypeReannotation/SplitBam/{{id}}.{celltype}.bam", celltype=['Cancer','Non-Cancer']) + params: + script=str(workflow.basedir)+"/scripts/PreProcessing/SplitBamCellTypes.py", + mapq=config['Reanno']['BaseCellCounter']['min_mapping_quality'], + conda: + "../envs/SComatic.yaml", + log: + "logs/SplitBam_Reanno/{id}.log", + benchmark: + "benchmarks/SplitBam_Reanno/{id}.benchmark.txt", + shell: + r""" + python {params.script} \ + --bam {input.bam} \ + --meta {input.barcodes} \ + --id {wildcards.id} \ + --outdir CellTypeReannotation/SplitBam \ + --min_MQ {params.mapq} + """ +### END PREPROCESSING ### + +### START FUSION CALLING ### +rule BamToFastq_Reanno: + input: + bam="CellTypeReannotation/SplitBam/{id}.{celltype}.bam" + output: + fastq=temp("CellTypeReannotation/FusionCalling/{id}/{id}.{celltype}.fastq") + params: + script=str(workflow.basedir)+"/scripts/FusionCalling/BamToFastq.py", + conda: + "../envs/SComatic.yaml" + log: + "logs/BamToFastq_Reanno/{id}.{celltype}.log", + benchmark: + "benchmarks/BamToFastq_Reanno/{id}.{celltype}.benchmark.txt" + shell: + r""" + python {params.script} \ + --bam {input.bam} \ + --fastq {output.fastq} + """ + +rule ConcatFastq_Reanno: + input: + expand("CellTypeReannotation/FusionCalling/{{id}}/{{id}}.{celltype}.fastq", celltype=['Cancer','Non-Cancer']) + output: + fastq=temp("CellTypeReannotation/FusionCalling/{id}/{id}.fastq") + conda: + "../envs/SComatic.yaml" + log: + "logs/ConcatFastq_Reanno/{id}.log", + benchmark: + "benchmarks/ConcatFastq_Reanno/{id}.benchmark.txt" + shell: + r""" + cat CellTypeReannotation/FusionCalling/{wildcards.id}/*.fastq >> {output.fastq} + """ + +rule CTATFusion_Reanno: + input: + fastq="CellTypeReannotation/FusionCalling/{id}/{id}.fastq", + output: + tsv="CellTypeReannotation/FusionCalling/{id}/ctat-LR-fusion.fusion_predictions.tsv", + threads: 16 + resources: + mem_mb_per_cpu=4096 + container: + str(workflow.basedir)+"/scripts/FusionCalling/ctat_lr_fusion.v0.13.0.simg" + log: + "logs/CTATFusion_Reanno/{id}.log", + benchmark: + "benchmarks/CTATFusion_Reanno/{id}.benchmark.txt" + shell: + r""" + ctat-LR-fusion \ + -T /output/{input.fastq} \ + --output /output/CellTypeReannotation/FusionCalling/{wildcards.id} \ + --genome_lib_dir /ref \ + --prep_reference \ + --CPU {threads} \ + --vis + """ +### END FUSION CALLING ### + +### START SNV CALLING ### +rule BaseCellCounter_Reanno: + input: + bam="CellTypeReannotation/SplitBam/{id}.{celltype}.bam", + ref=str(workflow.basedir)+config['Reference']['genome'], + output: + tsv="CellTypeReannotation/BaseCellCounter/{id}/{id}.{celltype}.tsv", + tmp=temp(directory("CellTypeReannotation/BaseCellCounter/{id}/temp_{celltype}/")) + params: + script=str(workflow.basedir)+"/scripts/SNVCalling/BaseCellCounter.py", + chrom=config['Reanno']['BaseCellCounter']['chromosomes'], + mapq=config['Reanno']['BaseCellCounter']['min_mapping_quality'], + conda: + "../envs/SComatic.yaml" + threads: 64 + resources: + mem_mb_per_cpu=1024 + log: + "logs/BaseCellCounter_Reanno/{id}.{celltype}.log", + benchmark: + "benchmarks/BaseCellCounter_Reanno/{id}.{celltype}.benchmark.txt" + shell: + r""" + python {params.script} \ + --bam {input.bam} \ + --ref {input.ref} \ + --chrom {params.chrom} \ + --out_folder CellTypeReannotation/BaseCellCounter/{wildcards.id}/ \ + --nprocs {threads} \ + --min_mq {params.mapq} \ + --tmp_dir {output.tmp} + """ + +rule MergeCounts_Reanno: + input: + expand("CellTypeReannotation/BaseCellCounter/{{id}}/{{id}}.{celltype}.tsv", + celltype=['Cancer','Non-Cancer']) + output: + tsv="CellTypeReannotation/MergeCounts/{id}.BaseCellCounts.AllCellTypes.tsv" + params: + script=str(workflow.basedir)+"/scripts/SNVCalling/MergeBaseCellCounts.py", + conda: + "../envs/SComatic.yaml" + log: + "logs/MergeCounts_Reanno/{id}.log", + benchmark: + "benchmarks/MergeCounts_Reanno/{id}.benchmark.txt" + shell: + r""" + python {params.script} \ + --tsv_folder CellTypeReannotation/BaseCellCounter/{wildcards.id}/ \ + --outfile {output.tsv} + """ +if PON: + rule BaseCellCalling_step1_Reanno: + input: + bb="PoN/PoN/BetaBinEstimates.txt", + tsv="CellTypeReannotation/MergeCounts/{id}.BaseCellCounts.AllCellTypes.tsv", + ref= str(workflow.basedir)+config['Reference']['genome'], + output: + tsv="CellTypeReannotation/BaseCellCalling/{id}.calling.step1.tsv" + params: + script=str(workflow.basedir)+"/scripts/SNVCalling/aseCellCalling.step1.py", + min_cell_types = config['Reanno']['BaseCellCalling']['Min_cell_types'], + alpha1 = lambda w, input: get_BetaBinEstimates(input.bb, 'alpha1'), + beta1 = lambda w, input: get_BetaBinEstimates(input.bb, 'beta1'), + alpha2 = lambda w, input: get_BetaBinEstimates(input.bb, 'alpha2'), + beta2 = lambda w, input: get_BetaBinEstimates(input.bb, 'beta2'), + conda: + "../envs/SComatic.yaml" + log: + "logs/BaseCellCalling_step1_Reanno/{id}.log", + benchmark: + "benchmarks/BaseCellCalling_step1_Reanno/{id}.benchmark.txt" + shell: + r""" + python {params.script} \ + --infile {input.tsv} \ + --ref {input.ref} \ + --outfile CellTypeReannotation/BaseCellCalling/{wildcards.id} \ + --min_cell_types {params.min_cell_types} \ + --alpha1 {params.alpha1} \ + --beta1 {params.beta1} \ + --alpha2 {params.alpha2} \ + --beta2 {params.beta2} + """ + +else: + rule BaseCellCalling_step1_Reanno: + input: + tsv="CellTypeReannotation/MergeCounts/{id}.BaseCellCounts.AllCellTypes.tsv", + ref=str(workflow.basedir)+config['Reference']['genome'] + output: + tsv="CellTypeReannotation/BaseCellCalling/{id}.calling.step1.tsv" + params: + script=str(workflow.basedir)+"/scripts/SNVCalling/BaseCellCalling.step1.py", + min_cell_types=config['Reanno']['BaseCellCalling']['Min_cell_types'], + alpha1=config['Reanno']['BaseCellCalling']['alpha1'], + beta1=config['Reanno']['BaseCellCalling']['beta1'], + alpha2=config['Reanno']['BaseCellCalling']['alpha2'], + beta2=config['Reanno']['BaseCellCalling']['beta2'], + conda: + "../envs/SComatic.yaml" + log: + "logs/BaseCellCalling_step1_Reanno/{id}.log", + benchmark: + "benchmarks/BaseCellCalling_step1_Reanno/{id}.benchmark.txt" + shell: + r""" + python {params.script} \ + --infile {input.tsv} \ + --ref {input.ref} \ + --outfile CellTypeReannotation/BaseCellCalling/{wildcards.id} \ + --min_cell_types {params.min_cell_types} \ + --alpha1 {params.alpha1} \ + --beta1 {params.beta1} \ + --alpha2 {params.alpha2} \ + --beta2 {params.beta2} + """ + +rule BaseCellCalling_step2_Reanno: + input: + tsv="CellTypeReannotation/BaseCellCalling/{id}.calling.step1.tsv", + pon_LR="PoN/PoN/PoN_LR.tsv" if PON else [], + output: + tsv="CellTypeReannotation/BaseCellCalling/{id}.calling.step2.tsv" + params: + script=str(workflow.basedir)+"/scripts/SNVCalling/BaseCellCalling.step2.py", + min_cell_types=config['Reanno']['BaseCellCalling']['Min_cell_types'], + min_distance=config['Reanno']['BaseCellCalling']['min_distance'], + gnomAD_db=str(workflow.basedir)+config['Reference']['gnomAD_db'], + pon_SR = str(workflow.basedir)+config['Reference']['PoN_SR'], + RNA_editing = str(workflow.basedir)+config['Reference']['RNA_editing'], + max_gnomAD_VAF = config['Reanno']['BaseCellCalling']['max_gnomAD_VAF'], + conda: + "../envs/SComatic.yaml" + log: + "logs/BaseCellCalling_step2_Reanno/{id}.log", + benchmark: + "benchmarks/BaseCellCalling_step2_Reanno/{id}.benchmark.txt" + shell: + r""" + python {params.script} \ + --infile {input.tsv} \ + --outfile CellTypeReannotation/BaseCellCalling/{wildcards.id} \ + --editing {params.RNA_editing} \ + --pon_SR {params.pon_SR} \ + --pon_LR {input.pon_LR} \ + --gnomAD_db {params.gnomAD_db} \ + --gnomAD_max {params.max_gnomAD_VAF} \ + --min_distance {params.min_distance} + """ +### END SNV CALLING ### + +### START HCCV CALLING ### +rule HighConfidenceCancerVariants: + input: + tsv="CellTypeReannotation/BaseCellCalling/{id}.calling.step2.tsv" + output: + tsv="CellTypeReannotation/HCCV/{id}.HCCV.tsv" + params: + script=str(workflow.basedir)+"/scripts/CellTypeReannotation/HighConfidenceCancerVariants.py", + min_dp=config['Reanno']['HCCV']['min_depth'], + deltaVAF=config['Reanno']['HCCV']['deltaVAF'], + deltaMCF=config['Reanno']['HCCV']['deltaMCF'], + clust_dist = config['Reanno']['HCCV']['clust_dist'], + conda: + "../envs/SComatic.yaml" + threads: 8 + resources: + mem_mb_per_cpu=1024 + log: + "logs/HighConfidenceCancerVariants/{id}.log", + benchmark: + "benchmarks/HighConfidenceCancerVariants/{id}.benchmark.txt" + shell: + r""" + python {params.script} \ + --SNVs {input.tsv} \ + --outfile CellTypeReannotation/HCCV/{wildcards.id} \ + --min_dp {params.min_dp} \ + --deltaVAF {params.deltaVAF} \ + --deltaMCF {params.deltaMCF} \ + --clust_dist {params.clust_dist} + """ + +if PON: + rule HCCVSingleCellGenotype: + input: + tsv="CellTypeReannotation/HCCV/{id}.HCCV.tsv", + bam=f"{INPUT}/bam/{{id}}.bam", + barcodes="Barcodes/{id}.tsv", + bb="PoN/PoN/BetaBinEstimates.txt", + ref=str(workflow.basedir)+config['Reference']['genome'], + output: + tsv="CellTypeReannotation/HCCV/{id}.SNVs.SingleCellGenotype.tsv", + tmp=temp(directory("CellTypeReannotation/HCCV/{id}/")) + params: + script=str(workflow.basedir)+"/scripts/CellTypeReannotation/HCCVSingleCellGenotype.py", + alt_flag= config['Reanno']['HCCV']['alt_flag'], + mapq=config['Reanno']['BaseCellCounter']['min_mapping_quality'], + alpha2 = lambda w, input: get_BetaBinEstimates(input.bb, 'alpha2'), + beta2 = lambda w, input: get_BetaBinEstimates(input.bb, 'beta2'), + pval = config['Reanno']['HCCV']['pvalue'], + chrm_conta = config['Reanno']['HCCV']['chrM_contaminant'] + conda: + "../envs/SComatic.yaml" + threads: 32 + resources: + mem_mb_per_cpu=1024 + log: + "logs/HCCVSingleCellGenotype/{id}.log", + benchmark: + "benchmarks/HCCVSingleCellGenotype/{id}.benchmark.txt" + shell: + r""" + python {params.script} \ + --bam {input.bam} \ + --infile {input.tsv} \ + --ref {input.ref} \ + --outfile {output.tsv} \ + --meta {input.barcodes} \ + --alt_flag {params.alt_flag} \ + --nprocs {threads} \ + --min_mq {params.mapq} \ + --pvalue {params.pval} \ + --alpha2 {params.alpha2} \ + --beta2 {params.beta2} \ + --chrM_contaminant {params.chrm_conta} \ + --tmp_dir {output.tmp} + """ +else: + rule HCCVSingleCellGenotype: + input: + tsv="CellTypeReannotation/HCCV/{id}.HCCV.tsv", + bam=f"{INPUT}/bam/{{id}}.bam", + barcodes="Barcodes/{id}.tsv", + ref=str(workflow.basedir)+config['Reference']['genome'] + output: + tsv="CellTypeReannotation/HCCV/{id}.SNVs.SingleCellGenotype.tsv", + tmp=temp(directory("CellTypeReannotation/HCCV/{id}/")) + params: + script=str(workflow.basedir)+"/scripts/CellTypeReannotation/HCCVSingleCellGenotype.py", + alt_flag= config['Reanno']['HCCV']['alt_flag'], + mapq=config['Reanno']['BaseCellCounter']['min_mapping_quality'], + alpha2 = config['Reanno']['BaseCellCalling']['alpha2'], + beta2 = config['Reanno']['BaseCellCalling']['beta2'], + pval = config['Reanno']['HCCV']['pvalue'], + chrm_conta = config['Reanno']['HCCV']['chrM_contaminant'] + conda: + "../envs/SComatic.yaml" + threads: 32 + resources: + mem_mb_per_cpu=1024 + log: + "logs/HCCVSingleCellGenotype/{id}.log", + benchmark: + "benchmarks/HCCVSingleCellGenotype/{id}.benchmark.txt" + shell: + r""" + python {params.script} \ + --bam {input.bam} \ + --infile {input.tsv} \ + --ref {input.ref} \ + --outfile {output.tsv} \ + --meta {input.barcodes} \ + --alt_flag {params.alt_flag} \ + --nprocs {threads} \ + --min_mq {params.mapq} \ + --pvalue {params.pval} \ + --alpha2 {params.alpha2} \ + --beta2 {params.beta2} \ + --chrM_contaminant {params.chrm_conta} \ + --tmp_dir {output.tmp} + """ + +rule HCCVFusions: + input: + fusions="CellTypeReannotation/FusionCalling/{id}/ctat-LR-fusion.fusion_predictions.tsv", + barcodes="Barcodes/{id}.tsv" + output: + fusions="CellTypeReannotation/HCCV/{id}.Fusions.tsv", + long="CellTypeReannotation/HCCV/{id}.Fusions.SingleCellGenotype.tsv" + params: + script=str(workflow.basedir)+"/scripts/FusionCalling/FusionCalling.py", + deltaMCF=config['FusionCalling']['SomaticFusions']['deltaMCF'], + min_ac_reads=config['FusionCalling']['SomaticFusions']['min_ac_reads'], + min_ac_cells=config['FusionCalling']['SomaticFusions']['min_ac_cells'], + max_MCF_noncancer=config['FusionCalling']['SomaticFusions']['max_MCF_noncancer'], + conda: + "../envs/SComatic.yaml" + log: + "logs/HCCV_Fusion/{id}.log", + benchmark: + "benchmarks/HCCV_Fusion/{id}.benchmark.txt" + shell: + r""" + python {params.script} \ + --fusions {input.fusions} \ + --barcodes {input.barcodes} \ + --min_ac_reads {params.min_ac_reads} \ + --min_ac_cells {params.min_ac_cells} \ + --max_MCF_noncancer {params.max_MCF_noncancer} \ + --deltaMCF {params.deltaMCF} \ + --outdir CellTypeReannotation/HCCV/{wildcards.id} + """ +### END HCCV CALLING ### + +### REANNOTATION ### +rule CellTypeReannotation: + input: + SNVs="CellTypeReannotation/HCCV/{id}.SNVs.SingleCellGenotype.tsv", + fusions="CellTypeReannotation/HCCV/{id}.Fusions.SingleCellGenotype.tsv", + barcodes="Barcodes/{id}.tsv" + output: + barcodes="CellTypeReannotation/ReannotatedCellTypes/{id}.tsv" + params: + script=str(workflow.basedir)+"/scripts/CellTypeReannotation/CellTypeReannotation.py", + min_variants = config['Reanno']['Reannotation']['min_variants'], + min_frac = config['Reanno']['Reannotation']['min_fraction'] + conda: + "../envs/SComatic.yaml" + log: + "logs/CellTypeReannotation/{id}.log", + benchmark: + "benchmarks/CellTypeReannotation/{id}.benchmark.txt" + shell: + r""" + python {params.script} \ + --SNVs {input.SNVs} \ + --fusions {input.fusions} \ + --outfile {output.barcodes} \ + --meta {input.barcodes} \ + --min_variants {params.min_variants} \ + --min_frac {params.min_frac} + """ + diff --git a/workflow/rules/FusionCalling.smk b/workflow/rules/FusionCalling.smk new file mode 100644 index 0000000..f6c89a0 --- /dev/null +++ b/workflow/rules/FusionCalling.smk @@ -0,0 +1,93 @@ +### Rules to call somatic fusions + +rule BamToFastq: + input: + bam="SNVCalling/SplitBam/{id}.{celltype}.bam" + output: + fastq=temp("FusionCalling/{id}/{id}.{celltype}.fastq") + params: + script=str(workflow.basedir)+"/scripts/FusionCalling/BamToFastq.py", + conda: + "../envs/SComatic.yaml" + log: + "logs/BamToFastq/{id}.{celltype}.log", + benchmark: + "benchmarks/BamToFastq/{id}.{celltype}.benchmark.txt" + shell: + r""" + python {params.script} \ + --bam {input.bam} \ + --fastq {output.fastq} + """ + +rule ConcatFastq: + input: + expand("FusionCalling/{{id}}/{{id}}.{celltype}.fastq", celltype=['Cancer','Non-Cancer']) + output: + fastq=temp("FusionCalling/{id}/{id}.fastq") + conda: + "../envs/SComatic.yaml" + log: + "logs/ConcatFastq/{id}.log", + benchmark: + "benchmarks/ConcatFastq/{id}.benchmark.txt" + shell: + r""" + cat FusionCalling/{wildcards.id}/*.fastq >> {output.fastq} + """ + +rule CTATFusion: + input: + fastq="FusionCalling/{id}/{id}.fastq" + output: + tsv="FusionCalling/{id}/ctat-LR-fusion.fusion_predictions.tsv", + threads: 16 + resources: + mem_mb_per_cpu=4096 + container: + str(workflow.basedir)+"/scripts/FusionCalling/ctat_lr_fusion.v0.13.0.simg" + log: + "logs/CTATFusion/{id}.log", + benchmark: + "benchmarks/CTATFusion/{id}.benchmark.txt" + shell: + r""" + ctat-LR-fusion \ + -T /output/{input.fastq} \ + --output /output/FusionCalling/{wildcards.id} \ + --genome_lib_dir /ref \ + --prep_reference \ + --CPU {threads} \ + --vis + """ + +rule SomaticFusions: + input: + fusions="FusionCalling/{id}/ctat-LR-fusion.fusion_predictions.tsv", + barcodes="CellTypeReannotation/ReannotatedCellTypes/{id}.tsv" if REANNO else "Barcodes/{id}.tsv" + output: + fusions="FusionCalling/Somatic/{id}.Fusions.tsv", + long="FusionCalling/Somatic/{id}.Fusions.SingleCellGenotype.tsv" + params: + script=str(workflow.basedir)+"/scripts/FusionCalling/FusionCalling.py", + deltaMCF=config['FusionCalling']['SomaticFusions']['deltaMCF'], + min_ac_reads=config['FusionCalling']['SomaticFusions']['min_ac_reads'], + min_ac_cells=config['FusionCalling']['SomaticFusions']['min_ac_cells'], + max_MCF_noncancer=config['FusionCalling']['SomaticFusions']['max_MCF_noncancer'], + conda: + "../envs/SComatic.yaml" + log: + "logs/SomaticFusions/{id}.log", + benchmark: + "benchmarks/SomaticFusions/{id}.benchmark.txt" + shell: + r""" + python {params.script} \ + --fusions {input.fusions} \ + --barcodes {input.barcodes} \ + --min_ac_reads {params.min_ac_reads} \ + --min_ac_cells {params.min_ac_cells} \ + --max_MCF_noncancer {params.max_MCF_noncancer} \ + --deltaMCF {params.deltaMCF} \ + --outdir FusionCalling/Somatic/{wildcards.id} + """ \ No newline at end of file diff --git a/snakefiles/PoN.smk b/workflow/rules/PoN.smk similarity index 100% rename from snakefiles/PoN.smk rename to workflow/rules/PoN.smk diff --git a/workflow/rules/SNVCalling.smk b/workflow/rules/SNVCalling.smk new file mode 100644 index 0000000..06c218a --- /dev/null +++ b/workflow/rules/SNVCalling.smk @@ -0,0 +1,221 @@ +### Rules for calling somatic SNVs + + +rule SplitBam: + input: + bam=f"{INPUT}/bam/{{id}}.bam", + bai=f"{INPUT}/bam/{{id}}.bam.bai", + barcodes="CellTypeReannotation/ReannotatedCellTypes/{id}.tsv" if REANNO else "Barcodes/{id}.tsv", + output: + expand("SNVCalling/SplitBam/{{id}}.{celltype}.bam", celltype=['Cancer','Non-Cancer']) + params: + script=str(workflow.basedir)+"/scripts/PreProcessing/SplitBamCellTypes.py", + mapq=config['SNVCalling']['BaseCellCounter']['min_mapping_quality'], + conda: + "../envs/SComatic.yaml", + log: + "logs/SplitBam/{id}.log", + benchmark: + "benchmarks/SplitBam/{id}.benchmark.txt", + shell: + r""" + python {params.script} \ + --bam {input.bam} \ + --meta {input.barcodes} \ + --id {wildcards.id} \ + --outdir SNVCalling/SplitBam \ + --min_MQ {params.mapq} + """ + +rule BaseCellCounter: + input: + bam="SNVCalling/SplitBam/{id}.{celltype}.bam", + ref=str(workflow.basedir)+config['Reference']['genome'], + output: + tsv="SNVCalling/BaseCellCounter/{id}/{id}.{celltype}.tsv", + tmp=temp(directory("SNVCalling/BaseCellCounter/{id}/temp_{celltype}/")) + params: + script=str(workflow.basedir)+"/scripts/SNVCalling/BaseCellCounter.py", + chrom=config['SNVCalling']['BaseCellCounter']['chromosomes'], + mapq=config['SNVCalling']['BaseCellCounter']['min_mapping_quality'], + conda: + "../envs/SComatic.yaml" + threads: 64 + resources: + mem_mb_per_cpu=1024 + log: + "logs/BaseCellCounter/{id}.{celltype}.log", + benchmark: + "benchmarks/BaseCellCounter/{id}.{celltype}.benchmark.txt" + shell: + r""" + python {params.script} \ + --bam {input.bam} \ + --ref {input.ref} \ + --chrom {params.chrom} \ + --out_folder SNVCalling/BaseCellCounter/{wildcards.id}/ \ + --nprocs {threads} \ + --min_mq {params.mapq} \ + --tmp_dir {output.tmp} + """ + +rule MergeCounts: + input: + expand("SNVCalling/BaseCellCounter/{{id}}/{{id}}.{celltype}.tsv", + celltype=['Cancer','Non-Cancer']) + output: + tsv="SNVCalling/MergeCounts/{id}.BaseCellCounts.AllCellTypes.tsv" + params: + script=str(workflow.basedir)+"/scripts/SNVCalling/MergeBaseCellCounts.py", + conda: + "../envs/SComatic.yaml" + log: + "logs/MergeCounts/{id}.log", + benchmark: + "benchmarks/MergeCounts/{id}.benchmark.txt" + shell: + r""" + python {params.script} \ + --tsv_folder SNVCalling/BaseCellCounter/{wildcards.id}/ \ + --outfile {output.tsv} + """ + +if PON: + rule BaseCellCalling_step1: + input: + bb="PoN/PoN/BetaBinEstimates.txt", + tsv="SNVCalling/MergeCounts/{id}.BaseCellCounts.AllCellTypes.tsv", + ref=str(workflow.basedir)+config['Reference']['genome'], + output: + tsv="SNVCalling/BaseCellCalling/{id}.calling.step1.tsv" + params: + script=str(workflow.basedir)+"/scripts/SNVCalling/aseCellCalling.step1.py", + min_cell_types = config['SNVCalling']['BaseCellCalling']['Min_cell_types'], + min_ac_reads = config['SNVCalling']['BaseCellCalling']['min_ac_reads'], + min_ac_cells = config['SNVCalling']['BaseCellCalling']['min_ac_cells'], + alpha1 = lambda w, input: get_BetaBinEstimates(input.bb, 'alpha1'), + beta1 = lambda w, input: get_BetaBinEstimates(input.bb, 'beta1'), + alpha2 = lambda w, input: get_BetaBinEstimates(input.bb, 'alpha2'), + beta2 = lambda w, input: get_BetaBinEstimates(input.bb, 'beta2'), + conda: + "../envs/SComatic.yaml" + log: + "logs/BaseCellCalling_step1/{id}.log", + benchmark: + "benchmarks/BaseCellCalling_step1/{id}.benchmark.txt" + shell: + r""" + python {params.script} \ + --infile {input.tsv} \ + --ref {input.ref} \ + --outfile SNVCalling/BaseCellCalling/{wildcards.id} \ + --min_cell_types {params.min_cell_types} \ + --min_ac_reads {params.min_ac_reads} \ + --min_ac_cells {params.min_ac_cells} \ + --alpha1 {params.alpha1} \ + --beta1 {params.beta1} \ + --alpha2 {params.alpha2} \ + --beta2 {params.beta2} + """ + +else: + rule BaseCellCalling_step1: + input: + tsv="SNVCalling/MergeCounts/{id}.BaseCellCounts.AllCellTypes.tsv", + ref=str(workflow.basedir)+config['Reference']['genome'], + output: + tsv="SNVCalling/BaseCellCalling/{id}.calling.step1.tsv" + params: + script=str(workflow.basedir)+"/scripts/SNVCalling/BaseCellCalling.step1.py", + min_cell_types=config['SNVCalling']['BaseCellCalling']['Min_cell_types'], + min_ac_reads = config['SNVCalling']['BaseCellCalling']['min_ac_reads'], + min_ac_cells = config['SNVCalling']['BaseCellCalling']['min_ac_cells'], + alpha1=config['SNVCalling']['BaseCellCalling']['alpha1'], + beta1=config['SNVCalling']['BaseCellCalling']['beta1'], + alpha2=config['SNVCalling']['BaseCellCalling']['alpha2'], + beta2=config['SNVCalling']['BaseCellCalling']['beta2'], + conda: + "../envs/SComatic.yaml" + log: + "logs/BaseCellCalling_step1/{id}.log", + benchmark: + "benchmarks/BaseCellCalling_step1/{id}.benchmark.txt" + shell: + r""" + python {params.script} \ + --infile {input.tsv} \ + --ref {input.ref} \ + --outfile SNVCalling/BaseCellCalling/{wildcards.id} \ + --min_cell_types {params.min_cell_types} \ + --min_ac_reads {params.min_ac_reads} \ + --min_ac_cells {params.min_ac_cells} \ + --alpha1 {params.alpha1} \ + --beta1 {params.beta1} \ + --alpha2 {params.alpha2} \ + --beta2 {params.beta2} + """ + +rule BaseCellCalling_step2: + input: + tsv="SNVCalling/BaseCellCalling/{id}.calling.step1.tsv", + pon_LR="PoN/PoN/PoN_LR.tsv" if PON else [], + pon_SR = str(workflow.basedir)+config['Reference']['PoN_SR'], + RNA_editing=str(workflow.basedir)+config['Reference']['RNA_editing'], + output: + tsv="SNVCalling/BaseCellCalling/{id}.calling.step2.tsv" + params: + script=str(workflow.basedir)+"/scripts/SNVCalling/BaseCellCalling.step2.py", + min_cell_types = config['SNVCalling']['BaseCellCalling']['Min_cell_types'], + min_distance = config['SNVCalling']['BaseCellCalling']['min_distance'], + gnomAD_db=str(workflow.basedir)+config['Reference']['gnomAD_db'], + max_gnomAD_VAF = config['SNVCalling']['BaseCellCalling']['max_gnomAD_VAF'], + conda: + "../envs/SComatic.yaml" + log: + "logs/BaseCellCalling_step2/{id}.log", + benchmark: + "benchmarks/BaseCellCalling_step2/{id}.benchmark.txt" + shell: + r""" + python {params.script} \ + --infile {input.tsv} \ + --outfile SNVCalling/BaseCellCalling/{wildcards.id} \ + --editing {input.RNA_editing} \ + --pon_SR {input.pon_SR} \ + --pon_LR {input.pon_LR} \ + --gnomAD_db {params.gnomAD_db} \ + --gnomAD_max {params.max_gnomAD_VAF} \ + --min_distance {params.min_distance} + """ + +rule BaseCellCalling_step3: + input: + tsv="SNVCalling/BaseCellCalling/{id}.calling.step2.tsv" + output: + tsv="SNVCalling/BaseCellCalling/{id}.calling.step3.tsv" + params: + script=str(workflow.basedir)+"/scripts/SNVCalling/BaseCellCalling.step3.py", + deltaVAF=config['SNVCalling']['BaseCellCalling']['deltaVAF'], + deltaMCF=config['SNVCalling']['BaseCellCalling']['deltaMCF'], + chrm_conta = config['SNVCalling']['BaseCellCalling']['chrM_contaminant'], + min_ac_reads = config['SNVCalling']['BaseCellCalling']['min_ac_reads'], + min_ac_cells = config['SNVCalling']['BaseCellCalling']['min_ac_cells'], + clust_dist = config['SNVCalling']['BaseCellCalling']['clust_dist'], + conda: + "../envs/SComatic.yaml" + log: + "logs/BaseCellCalling_step3/{id}.log", + benchmark: + "benchmarks/BaseCellCalling_step3/{id}.benchmark.txt" + shell: + r""" + python {params.script} \ + --infile {input.tsv} \ + --outfile SNVCalling/BaseCellCalling/{wildcards.id} \ + --chrM_contaminant {params.chrm_conta} \ + --deltaVAF {params.deltaVAF} \ + --deltaMCF {params.deltaMCF} \ + --min_ac_reads {params.min_ac_reads} \ + --min_ac_cells {params.min_ac_cells} \ + --clust_dist {params.clust_dist} + """ diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk new file mode 100644 index 0000000..3ebfca1 --- /dev/null +++ b/workflow/rules/common.smk @@ -0,0 +1,21 @@ +import pandas as pd + +# Run +PON=config['Run']['PoN'] +REANNO=config['Run']['CellTypeReannotation'] +SCOMATIC=config['Run']['SNVCalling'] +CTATFUSION=config['Run']['FusionCalling'] +BNPC=config['Run']['CellClustering'] +INFERCNV=config['Run']['CNACalling'] + +INPUT = config['User']['input_dir'] + +# import sample map and retrieve sample names +samples_table = pd.read_table(config["User"]["sample_map"], header=0) +samples = samples_table.set_index("sample", drop=False) +IDS = samples_table["sample"].tolist() + +def get_BetaBinEstimates(input, value): + df = pd.read_csv(input, sep='\t') + d = df.squeeze().to_dict() + return d[value] \ No newline at end of file diff --git a/tools/InferCNV/gene_ordering_file.txt b/workflow/scripts/CNACalling/gene_ordering_file.txt similarity index 100% rename from tools/InferCNV/gene_ordering_file.txt rename to workflow/scripts/CNACalling/gene_ordering_file.txt diff --git a/tools/InferCNV/scripts/infercnv.R b/workflow/scripts/CNACalling/infercnv.R similarity index 100% rename from tools/InferCNV/scripts/infercnv.R rename to workflow/scripts/CNACalling/infercnv.R diff --git a/tools/InferCNV/scripts/split_by_bc.py b/workflow/scripts/CNACalling/split_by_bc.py similarity index 100% rename from tools/InferCNV/scripts/split_by_bc.py rename to workflow/scripts/CNACalling/split_by_bc.py diff --git a/tools/BnpC/libs/format_input.py b/workflow/scripts/CellClustering/FormatInputBnpC.py similarity index 70% rename from tools/BnpC/libs/format_input.py rename to workflow/scripts/CellClustering/FormatInputBnpC.py index 58a43d7..8571228 100644 --- a/tools/BnpC/libs/format_input.py +++ b/workflow/scripts/CellClustering/FormatInputBnpC.py @@ -5,10 +5,9 @@ import matplotlib.pyplot as plt import numpy as np -def filter_input(bin,vaf,scDNA,ctypes,min_cells_per_mut,min_pos_cov,out_prefix): +def filter_input(bin,vaf,ctypes,min_cells_per_mut,min_pos_cov,out_prefix): bin = pd.read_csv(bin,sep='\t',index_col=0,na_values=[3,'.']) vaf = pd.read_csv(vaf,sep='\t',index_col=0,na_values=[3,'.']) - scDNA = pd.read_csv(scDNA,sep='\t',na_values=['.']) ctypes = pd.read_csv(ctypes,sep='\t') #Save fusions: fusions = [i for i in bin.index if '--' in i] @@ -24,35 +23,20 @@ def filter_input(bin,vaf,scDNA,ctypes,min_cells_per_mut,min_pos_cov,out_prefix): idx = list(bin.index) + fusions cols = bin.columns - # Attributing a cmap color to scDNA VAF - cmap = plt.get_cmap('Reds', 100) - cmap.set_bad('grey') - scDNA['TumorColor'] = scDNA['Clone_Tum_VAF'].apply(lambda x: matplotlib.colors.rgb2hex(cmap(x))) - scDNA['NonTumorColor'] = scDNA['Clone_NonTum_VAF'].apply(lambda x: matplotlib.colors.rgb2hex(cmap(x))) - for fusion in fusions: - scDNA.loc[fusion]=pd.Series(dtype=float) - scDNA.loc[fusion,'INDEX'] = fusion - scDNA.loc[fusion,['TumorColor','NonTumorColor']] = 'blue' - # Filter all input files: vaf = vaf.loc[idx,cols] - if not 'chr' in idx[0]: - scDNA['INDEX'] = [i.split('chr')[1] if '--' not in i else i for i in list(scDNA['INDEX'])] - scDNA = scDNA[scDNA['INDEX'].isin(idx)] ctypes = ctypes[ctypes['Index'].isin(cols)] bin = pd.concat([bin,fusions_save[cols]]) # Write bin.to_csv(out_prefix + '.BinaryMatrix.tsv', sep='\t') vaf.to_csv(out_prefix + '.VAFMatrix.tsv', sep='\t') - scDNA.to_csv(out_prefix + '.scDNACloneGenotype.tsv', sep='\t', index = False) ctypes.to_csv(out_prefix + '.Barcodes.tsv', sep='\t', index = False) def initialize_parser(): parser = argparse.ArgumentParser(description='Script to filter BnpC input matrix') parser.add_argument('--bin', type=str, default=1, help='SComatic binary matrix (obtained by SingleCellGenotype.py)', required = True) parser.add_argument('--vaf', type=str, default=1, help='SComatic VAF matrix (obtained by SingleCellGenotype.py)', required = True) - parser.add_argument('--scDNA', type=str, default=1, help='SComatic scDNA clones VAF (obtained by scDNAClonesGenotyping.py)', required = True) parser.add_argument('--ctypes', type=str, default=1, help='Barcode to celltypes (obtained by CellTypeReannotation.py)', required = True) parser.add_argument('--min_cells_per_mut', type=int, default=5, help='SComatic+CellTypeReannotation base calling file (obtained by BaseCellCalling.step3.py)', required = False) parser.add_argument('--min_pos_cov', type=int, default=3, help='SComatic+CellTypeReannotation base calling file (obtained by BaseCellCalling.step3.py)', required = False) @@ -67,7 +51,6 @@ def main(): bin = args.bin vaf = args.vaf - scDNA = args.scDNA ctypes = args.ctypes min_cells_per_mut = args.min_cells_per_mut min_pos_cov = args.min_pos_cov @@ -77,7 +60,7 @@ def main(): print("Outfile prefix: " , out_prefix , "\n") # 1. Create clinical annotation file - filter_input(bin,vaf,scDNA,ctypes,min_cells_per_mut,min_pos_cov,out_prefix) + filter_input(bin,vaf,ctypes,min_cells_per_mut,min_pos_cov,out_prefix) if __name__ == '__main__': start = timeit.default_timer() diff --git a/tools/BnpC/LICENSE b/workflow/scripts/CellClustering/LICENSE similarity index 100% rename from tools/BnpC/LICENSE rename to workflow/scripts/CellClustering/LICENSE diff --git a/tools/SComatic/scripts/SingleCellGenotype/SingleCellGenotype.py b/workflow/scripts/CellClustering/SingleCellGenotype.py similarity index 98% rename from tools/SComatic/scripts/SingleCellGenotype/SingleCellGenotype.py rename to workflow/scripts/CellClustering/SingleCellGenotype.py index 5ffb81a..23f7f1b 100644 --- a/tools/SComatic/scripts/SingleCellGenotype/SingleCellGenotype.py +++ b/workflow/scripts/CellClustering/SingleCellGenotype.py @@ -326,7 +326,7 @@ def collect_cells_with_fusions(fusion_file): fusions = pd.read_csv(fusion_file, sep='\t') # Create Fusion:barcode index to remove duplicates - fusions['INDEX'] = fusions['FusionName'] + ':' + fusions['barcodes'] + fusions['INDEX'] = fusions['#FusionName'] + ':' + fusions['BC'] # Drop duplicates fusions = fusions.drop_duplicates(subset='INDEX', keep="last") @@ -334,7 +334,7 @@ def collect_cells_with_fusions(fusion_file): # Compute fusions to append them to SingleCellGenotype.tsv lines = [] for i,row in fusions.iterrows(): - l = ['.','.','.','.','.','.','.',row['barcodes'],'.', 1,1,1,'.','.',1,'zzz:'+row['FusionName']] + l = ['.','.','.','.','.','.','.',row['BC'],'.', 1,1,1,'.','.',1,'zzz:'+row['#FusionName']] lines.append(l) return pd.DataFrame(lines) @@ -379,7 +379,7 @@ def pivot_long_dataframe(out_prefix, fusions): Bin_df.to_csv(out_prefix + '.BinaryMatrix.tsv', sep='\t', index=True) def initialize_parser(): - parser = argparse.ArgumentParser(description='Script to get the alleles observed in each unique cell for the variant sites') + parser = argparse.ArgumentParser(description='Script to get the SNV/fusions observed in each unique cell') parser.add_argument('--bam', type=str, default=1, help='Tumor bam file to be analysed', required = True) parser.add_argument('--infile', type=str, default=1, help='Base calling file (obtained by BaseCellCalling.step2.py), ideally only the PASS variants', required = True) parser.add_argument('--ref', type=str, default=1, help='Reference genome. *fai must be available in the same folder as reference', required = True) diff --git a/tools/BnpC/libs/CRP.py b/workflow/scripts/CellClustering/libs/CRP.py similarity index 100% rename from tools/BnpC/libs/CRP.py rename to workflow/scripts/CellClustering/libs/CRP.py diff --git a/tools/BnpC/libs/CRP_learning_errors.py b/workflow/scripts/CellClustering/libs/CRP_learning_errors.py similarity index 100% rename from tools/BnpC/libs/CRP_learning_errors.py rename to workflow/scripts/CellClustering/libs/CRP_learning_errors.py diff --git a/tools/BnpC/libs/MCMC.py b/workflow/scripts/CellClustering/libs/MCMC.py similarity index 100% rename from tools/BnpC/libs/MCMC.py rename to workflow/scripts/CellClustering/libs/MCMC.py diff --git a/tools/BnpC/libs/__init__.py b/workflow/scripts/CellClustering/libs/__init__.py similarity index 100% rename from tools/BnpC/libs/__init__.py rename to workflow/scripts/CellClustering/libs/__init__.py diff --git a/tools/BnpC/libs/dpmmIO.py b/workflow/scripts/CellClustering/libs/dpmmIO.py similarity index 98% rename from tools/BnpC/libs/dpmmIO.py rename to workflow/scripts/CellClustering/libs/dpmmIO.py index 7a3ffa2..5a39c03 100644 --- a/tools/BnpC/libs/dpmmIO.py +++ b/workflow/scripts/CellClustering/libs/dpmmIO.py @@ -276,7 +276,6 @@ def save_similarity(args, inferred, results, out_dir): def save_geno_plots(args, data, data_raw, out_dir, names): ctypes = pd.read_csv(args.ctypes, sep='\t') - scDNA = pd.read_csv(args.scDNA, sep='\t') row_cl = False if args.mut_order: row_cl = list(pd.read_csv(args.mut_order, sep='\t')['INDEX']) @@ -290,11 +289,11 @@ def save_geno_plots(args, data, data_raw, out_dir, names): df_obs = pd.DataFrame(data_raw, index=names[0], columns=names[1]).T pl.plot_raw_data( data_est['genotypes'], df_obs, assignment=data_est['assignment'], - ctypes=ctypes, scDNA=scDNA, out_file=out_file, row_cl=row_cl + ctypes=ctypes, out_file=out_file, row_cl=row_cl ) pl.plot_raw_data( df_obs, df_obs, assignment=data_est['assignment'], - ctypes=ctypes, scDNA=scDNA, out_file=out_file_raw, row_cl=row_cl + ctypes=ctypes, out_file=out_file_raw, row_cl=row_cl ) diff --git a/tools/BnpC/libs/plotting.py b/workflow/scripts/CellClustering/libs/plotting.py similarity index 93% rename from tools/BnpC/libs/plotting.py rename to workflow/scripts/CellClustering/libs/plotting.py index ed4a7f2..fb2a958 100644 --- a/tools/BnpC/libs/plotting.py +++ b/workflow/scripts/CellClustering/libs/plotting.py @@ -64,7 +64,7 @@ def _get_col_order(assignment): def plot_raw_data(data_in, data_raw_in=pd.DataFrame(), out_file=None, assignment=np.array([]), metric='correlation', row_cl=True, - ctypes=pd.Series(), scDNA=pd.Series()): + ctypes=pd.Series()): data = data_in.copy() data_raw = data_raw_in.copy() @@ -94,8 +94,6 @@ def plot_raw_data(data_in, data_raw_in=pd.DataFrame(), out_file=None, cluster_cols = pd.Series(col_dict, name='clusters', index=col_order) ctypes = ctypes.reindex([ctypes.index[i] for i in col_order]) - #ctypes['BnpC_cluster'] = cluster_cols - #ctypes = ctypes.sort_values(by=['BnpC_cluster','Cancer_Color']) data.columns = np.arange(data_in.shape[1]) data = data[col_order] @@ -132,26 +130,12 @@ def plot_raw_data(data_in, data_raw_in=pd.DataFrame(), out_file=None, annot[data_raw.isnull()] = '-' else: annot = False - # Attributing a cmap color to scDNA VAF - scDNA.index = scDNA['INDEX'] - scDNA = scDNA.reindex(data.index) - scDNASupport_Tumor = list(scDNA['TumorColor']) - scDNASupport_NonTumor = list(scDNA['NonTumorColor']) + cmap = plt.get_cmap('Reds', 2) cmap.set_over('green') cmap.set_bad('grey') - # if not 'CNV_Colors' in ctypes.columns: - # ctypes['CNV_Colors']='black' - # if not 'Cell_Reanno_Colors' in ctypes.columns: - # ctypes['Cell_Reanno_Colors']='black' - # if not 'CNV_Colors' in ctypes.columns: - - # if not 'Seurat_color' in ctypes.columns: - # col_colors=[ctypes['CNV_Colors'],ctypes['Cell_Reanno_Colors'],cluster_cols] - # else: - # col_colors=[ctypes['CNV_Colors'],ctypes['Seurat_color'],ctypes['Cell_Reanno_Colors'],cluster_cols] col_colors=[ctypes['Cancer_Color'],cluster_cols] @@ -163,7 +147,6 @@ def plot_raw_data(data_in, data_raw_in=pd.DataFrame(), out_file=None, cmap=cmap, fmt='', linewidths=0, linecolor='lightgray', col_colors=col_colors, - row_colors= [scDNASupport_Tumor,scDNASupport_NonTumor], colors_ratio=(0.3, 0.3), col_cluster=False, row_cluster=False, figsize=(width, height) ) diff --git a/tools/BnpC/libs/utils.py b/workflow/scripts/CellClustering/libs/utils.py similarity index 100% rename from tools/BnpC/libs/utils.py rename to workflow/scripts/CellClustering/libs/utils.py diff --git a/tools/BnpC/run_BnpC.py b/workflow/scripts/CellClustering/run_BnpC.py similarity index 98% rename from tools/BnpC/run_BnpC.py rename to workflow/scripts/CellClustering/run_BnpC.py index af72dfe..be7df58 100644 --- a/tools/BnpC/run_BnpC.py +++ b/workflow/scripts/CellClustering/run_BnpC.py @@ -58,10 +58,6 @@ def check_PSRF_cutoff(val): '--ctypes', type=str, help='Absolute or relative path(s) to input barcode-to-ctype file' ) - parser.add_argument( - '--scDNA', type=str, - help='Absolute or relative path(s) to input scDNA validation file' - ) parser.add_argument( '--mut_order', type=str, nargs='?', const='', default='', help='Absolute or relative path(s) to input mutation order file' diff --git a/tools/SComatic/scripts/CellTypeReannotation/CellTypeReannotation.py b/workflow/scripts/CellTypeReannotation/CellTypeReannotation.py similarity index 90% rename from tools/SComatic/scripts/CellTypeReannotation/CellTypeReannotation.py rename to workflow/scripts/CellTypeReannotation/CellTypeReannotation.py index c40d0e9..6549e50 100644 --- a/tools/SComatic/scripts/CellTypeReannotation/CellTypeReannotation.py +++ b/workflow/scripts/CellTypeReannotation/CellTypeReannotation.py @@ -23,13 +23,13 @@ def collect_cells_with_fusions(fusion_file): fusions = pd.read_csv(fusion_file, sep='\t') # Create Fusion:barcode index to remove duplicates - fusions['INDEX'] = fusions['FusionName'] + ':' + fusions['barcodes'] + fusions['INDEX'] = fusions['#FusionName'] + ':' + fusions['BC'] # Drop duplicates fusions = fusions.drop_duplicates(subset='INDEX', keep="last") # Collect mutated barcodes - return list(fusions['barcodes']) + return list(fusions['BC']) def collect_cancer_cells(cells_with_SNVs,cells_with_fusions,BC_COV,min_variants,min_frac): @@ -42,7 +42,6 @@ def collect_cancer_cells(cells_with_SNVs,cells_with_fusions,BC_COV,min_variants, mean_variants_mut_per_cell = { k : (v/BC_COV[k] if BC_COV[k]>=min_variants else 0) for k,v in variants_per_cell.items()} # Determine cancer cells base on user-defined # of variants and fraction of mutated variants - #cancer_cells = [k for k,v in variants_per_cell.items() if v>=min_variants if mean_variants_mut_per_cell[k]>=min_frac] cancer_cells = [k for k,v in variants_per_cell.items() if mean_variants_mut_per_cell[k]>=min_frac] return cancer_cells @@ -52,15 +51,15 @@ def write_reannotated_cell_types(cancer_cells,BC_COV_min,bc_file,out_file): bcs = bcs[bcs['Index'].isin(BC_COV_min)] # Save automated annotation - bcs['Automated_Cell_type'] = bcs['Cell_type'] + bcs['Before_Reannotation_cell_type'] = bcs['Cell_type'] # Cell reannotation based on HCCVs - bcs['Reannotated_cell_type'] = ['Cancer' if i in cancer_cells else 'NonCancer' for i in bcs['Index']] + bcs['Reannotated_cell_type'] = ['Cancer' if i in cancer_cells else 'Non-Cancer' for i in bcs['Index']] # Copy reannotated celltypes for SplitBamCellTypes.py compatibility bcs['Cell_type'] = bcs['Reannotated_cell_type'] - bcs['Cancer_Color'] = bcs['Reannotated_cell_type'].apply(lambda x: '#94C773' if x=='NonCancer' else '#8F79A1') + #bcs['Cancer_Color'] = bcs['Reannotated_cell_type'].apply(lambda x: '#94C773' if x=='Non-Cancer' else '#8F79A1') # Writing output file bcs.to_csv(out_file, sep='\t', index = False) diff --git a/tools/SComatic/scripts/HighConfidenceCancerVariants/HCCVSingleCellGenotype.py b/workflow/scripts/CellTypeReannotation/HCCVSingleCellGenotype.py similarity index 100% rename from tools/SComatic/scripts/HighConfidenceCancerVariants/HCCVSingleCellGenotype.py rename to workflow/scripts/CellTypeReannotation/HCCVSingleCellGenotype.py diff --git a/workflow/scripts/CellTypeReannotation/HighConfidenceCancerVariants.py b/workflow/scripts/CellTypeReannotation/HighConfidenceCancerVariants.py new file mode 100644 index 0000000..4233580 --- /dev/null +++ b/workflow/scripts/CellTypeReannotation/HighConfidenceCancerVariants.py @@ -0,0 +1,291 @@ +#!/usr/bin/python + +import timeit +import argparse +import pandas as pd +import numpy as np + +def HCCV_SNV(SNVs,outfile,min_dp,deltaVAF,deltaMCF,clust_dist): + + #--- + # Command to focus only on high confidence cancer variants (HCCV) + #--- + + # Save comment lines + f2 = open(outfile,'w') + with open(SNVs, 'r') as f: + for line in f: + if line.startswith('#'): + # Extract output file column names + if '#CHROM' in line: + line = line.rstrip('\n') + output_column_names = line.split('\t') + else: + f2.write(line) + else: + break + f2.write('##INFO=HCCV_FILTER,Description=Filter status of the variant site for cell reannotation (high-confidence cancer variants)\n') + f2.close() + + input_df = pd.read_csv(SNVs, sep='\t',comment='#',names=output_column_names) + + # INDEX CHR:POS:ALTBASE + input_df['INDEX'] = input_df['#CHROM'].astype(str) + ':' + input_df['Start'].astype(str) + ':' + input_df['ALT'].str.split(',', n=1, expand=True)[0] + + # Only keeping sites with alternative reads in cancer + input_df = input_df[input_df['Cell_types']!='Non-Cancer'] + + # Filtering multiallelic sites, keeping only one allele if it's 50x larger than the other + # Otherwise deleting the line (it messes with filters later on) + # This is important as SComatic will often detect low expr secondary alt allele in very high expr. (e.g. in chrM) DP, NC, ALT, FILTER, CTYPES, BC, CC, VAF, MCF, CancerInfo, NonCancerInfo + input_df[['ALT', 'FILTER', 'Cell_types', 'Bc', 'Cc', 'VAF', 'MCF','MultiAllelic_filter']] = input_df.apply(lambda x: MultiAllelic_filtering(x['REF'], x['ALT'], x['FILTER'], x['Cell_types'],x['Dp'], x['Nc'], x['Bc'], x['Cc'], x['VAF'], x['MCF'], x['Cancer'], x['Non-Cancer']), axis=1, result_type="expand") + input_df = input_df[input_df['MultiAllelic_filter']=='KEEP'] + input_df = input_df[output_column_names + ['INDEX']] + + #Filter 1: DP filtering + input_df['DP_FILTER'] = input_df.apply(lambda x: + DP_filtering(x['Cancer'],x['Non-Cancer'],min_dp), axis=1) + input_df = input_df[input_df['DP_FILTER']=='PASS'] + + #Special case for chrM due to contaminants: + + # Save chrM candidate SNVs to apply specific filters + chrm_df = input_df[input_df['#CHROM']=='chrM'].copy() + input_df = input_df[input_df['#CHROM']!='chrM'] + # Filtering homopolymeres, GnomAD/RNA editing/PON/clustered sites, + # multiallelic sites and sites with unsufficient # celltypes covered + chrm_df = chrm_df[~chrm_df['FILTER'].str.contains('Min|LR|gnomAD|LC|RNA', regex=True)] + + # Filter 2: Noise filter: + input_df = input_df[~input_df['FILTER'].str.contains('Noisy_site')] #multi allelic sites are filtered above + + # Filter 3: Homopolymer filter: + input_df = input_df[~input_df['FILTER'].str.contains('LC_Upstream|LC_Downstream', regex = True)] + + # Filter 4: gnomAD filter + input_df = input_df[~input_df['FILTER'].str.contains('gnomAD', regex = True)] + + # Filter 5: RNA-Editing filter + input_df = input_df[~input_df['FILTER'].str.contains('RNA_editing_db', regex = True)] + + # Filter 6: PoN filter + input_df = input_df[~input_df['FILTER'].str.contains('PoN', regex = True)] + + #Adding chrM SNVs detected above: + input_df = pd.concat([input_df,chrm_df]) + + # Filter 7: PoN filterDelta VAF and MCF filtering + input_df['HCCV_FILTER'] = input_df.apply(lambda x: + MCF_filtering(x['Cell_types'],x['VAF'], x['MCF'],deltaVAF,deltaMCF), axis=1) + input_df = input_df[input_df['HCCV_FILTER']=='PASS'] + + # Filter 8: Distance filter + input_df['FILTER'] = tag_clustered_SNVs(input_df, clust_dist) + input_df = input_df[~input_df['FILTER'].str.contains('dist', regex = True)] + + # Write output + input_df.to_csv(outfile, sep='\t', index=False, mode='a') + +def MultiAllelic_filtering(REF, ALT, FILTER, CTYPES, DP, NC, BC, CC, VAF, MCF, CancerInfo, NonCancerInfo): + ref_dict = {'A':0, 'C':1, 'T':2, 'G':3} + i_ref = ref_dict[REF] + if 'Multi-allelic' in FILTER or '|' in ALT: + ctypes = CTYPES.split(',') + if len(ctypes)>1: + if ctypes[0]=='Cancer': + i_Cancer = 0 + i_NonCancer = 1 + elif ctypes[1]=='Cancer': + i_Cancer = 1 + i_NonCancer = 0 + + BCS = CancerInfo.split('|')[3].split(':')[:4] + BCS = [int(i) for i in BCS] + BCS[i_ref] = 0 # setting REF to 0 as we only consider ALT + MAX = max(BCS) + index = np.argmax(BCS) + BCS[index] = 0 # removing max to select next "max" + MAX2 = max(BCS) + if not(MAX2/MAX<0.05): # one alt 20x larger than the other) + return ALT, FILTER, CTYPES, BC, CC, VAF, MCF,'DELETE' + + ALT_Cancer = 'ACTG'[index] + BC_Cancer = int(CancerInfo.split('|')[3].split(':')[index]) + CC_Cancer = int(CancerInfo.split('|')[2].split(':')[index]) + VAF_Cancer = round(BC_Cancer/int(DP.split(',')[i_Cancer]),4) + MCF_Cancer = round(CC_Cancer/int(NC.split(',')[i_Cancer]),4) + + ALT_NonCancer = 'ACTG'[index] + BC_NonCancer = int(NonCancerInfo.split('|')[3].split(':')[index]) + CC_NonCancer = int(NonCancerInfo.split('|')[2].split(':')[index]) + VAF_NonCancer = round(BC_NonCancer/int(DP.split(',')[i_NonCancer]),4) + MCF_NonCancer = round(CC_NonCancer/int(NC.split(',')[i_NonCancer]),4) + + ALT = ','.join([ALT_NonCancer,ALT_Cancer]) + BC = ','.join([str(BC_NonCancer),str(BC_Cancer)]) + CC = ','.join([str(CC_NonCancer),str(CC_Cancer)]) + VAF = ','.join([str(VAF_NonCancer),str(VAF_Cancer)]) + MCF = ','.join([str(MCF_NonCancer),str(MCF_Cancer)]) + + FILTER = FILTER.replace('Multi-allelic,','') + FILTER = FILTER.replace(',Multi-allelic','') + FILTER = FILTER.replace('Multi-allelic','') + + return ALT, FILTER, CTYPES, BC, CC, VAF, MCF,'KEEP' + + elif len(ctypes)==1: + if ctypes[0]=='Cancer': + BCS = CancerInfo.split('|')[3].split(':')[:4] + BCS = [int(i) for i in BCS] + BCS[i_ref] = 0 # setting REF to 0 as we only consider ALT + MAX = max(BCS) + index = np.argmax(BCS) + BCS[index] = 0 # removing max to select next "max" + MAX2 = max(BCS) + if not(MAX2/MAX<0.05): # one alt 20x larger than the other) + return ALT, FILTER, CTYPES, BC, CC, VAF, MCF,'DELETE' + + ALT = 'ACTG'[index] + BC = int(CancerInfo.split('|')[3].split(':')[index]) + CC = int(CancerInfo.split('|')[2].split(':')[index]) + VAF = round(BC/int(DP),4) + MCF= round(CC/int(NC),4) + + FILTER = FILTER.replace('Multi-allelic,','') + FILTER = FILTER.replace(',Multi-allelic','') + FILTER = FILTER.replace('Multi-allelic','') + + return ALT, FILTER, CTYPES, BC, CC, VAF, MCF,'KEEP' + else: + return ALT, FILTER, CTYPES, BC, CC, VAF, MCF,'DELETE' + else: + return ALT, FILTER, CTYPES, BC, CC, VAF, MCF,'KEEP' + +def tag_clustered_SNVs(df, clust_dist): + idx = df['INDEX'] + a=[] + for i in idx: + chr,pos,base=i.split(':') + a.append((chr,pos,base)) + b = sorted(a, key=lambda x: (x[0],x[1])) + + trash=[] + + for (chr,pos,base), (chr2,pos2,base2) in zip(b, b[1:]): + if chr==chr2: + if chr == 'chrM': + continue + if abs(int(pos)-int(pos2))= deltaVAFmin and float(MCF) >= deltaMCFmin: + return 'PASS' + else: + return 'Low VAF/MCF' + elif len(ctypes)>1: + VAFs = VAF.split(',') + MCFs = MCF.split(',') + if ctypes[0]=='Cancer': + VAFCancer = float(VAFs[0]) + VAFNonCancer = float(VAFs[1]) + MCFCancer = float(MCFs[0]) + MCFNonCancer = float(MCFs[1]) + elif ctypes[1]=='Cancer': + VAFCancer = float(VAFs[1]) + VAFNonCancer = float(VAFs[0]) + MCFCancer = float(MCFs[1]) + MCFNonCancer = float(MCFs[0]) + + if VAFNonCancer>0.12: + return 'Heterozygous' + + if VAFCancer<0.05: + return 'NonSig' + + # deltaVAF = VAFCancer-VAFNonCancer + deltaMCF = MCFCancer-MCFNonCancer + + # HCCV are variants with high VAF/MCF in cancer and low VAF/MCF in non-cancer cells + # if deltaVAF < deltaVAFmin: + # return 'LowDeltaVAF' + + if deltaMCF < deltaMCFmin: + return 'LowDeltaMCF' + else: + return 'PASS' + else: + return 'NonCancer' + + +def initialize_parser(): + parser = argparse.ArgumentParser(description='Script to perform High-Confidence Cancer Variants calling') + parser.add_argument('--SNVs', type=str, help='', required = True) + parser.add_argument('--outfile', type=str, help='Out file prefix', required = True) + parser.add_argument('--min_dp', type=float, default = 20, help='Minimum depth in both celltypes to call a HCCV', required = True) + parser.add_argument('--deltaVAF', type=float, default = 0.1, help='Delta VAF between cancer and non-cancer cells', required = True) + parser.add_argument('--deltaMCF', type=float, default = 0.4, help='Delta MCF (cancer cell fraction) between cancer and non-cancer cells', required = True) + parser.add_argument('--clust_dist', type=int, default = 10000, help='Minimum distance required between two consecutive SNVs', required = False) + return (parser) + +def main(): + + # 1. Arguments + parser = initialize_parser() + args = parser.parse_args() + + SNVs = args.SNVs + outfile = args.outfile + min_dp = args.min_dp + deltaVAF = args.deltaVAF + deltaMCF = args.deltaMCF + clust_dist = args.clust_dist + + # 1.2: Step 2: Add distance, editing and PoN filters + print ('\n- High Confidence Cancer Variants calling\n') + outfile = outfile + '.HCCV.tsv' + HCCV_SNV(SNVs,outfile,min_dp,deltaVAF,deltaMCF,clust_dist) + +#------------------------- +# Running scRNA somatic variant calling +#------------------------- +if __name__ == '__main__': + start = timeit.default_timer() + main() + stop = timeit.default_timer() + print ('\nTotal computing time: ' + str(round(stop - start,2)) + ' seconds') + + diff --git a/workflow/scripts/FusionCalling/BamToFastq.py b/workflow/scripts/FusionCalling/BamToFastq.py new file mode 100644 index 0000000..8135189 --- /dev/null +++ b/workflow/scripts/FusionCalling/BamToFastq.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 + +# Adapted from https://github.com/TrinityCTAT/CTAT-LR-fusion/blob/main/util/sc/10x_ubam_to_fastq.py + +import argparse +import pysam +import re + +def bam_to_fastq(bam,fastq): + + with open(fastq, 'w') as f: + samreader = pysam.AlignmentFile(bam, "rb", check_sq=False) + for read in samreader: + + d = read.to_dict() + + read_name = d['name'] + read_seq = d['seq'] + quals = d['qual'] + + # 10x tag descriptions at: + # https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/output/bam#:~:text=Barcoded%20BAM%20Tags,-The%20cellranger%20pipeline%20outputs%20an + + CB = "NA" + if read.has_tag("CB"): + cell_barcode = read.get_tag("CB", "Z")[0] + cell_barcode = re.sub("-1$", "", cell_barcode) + + umi = "NA" + if read.has_tag("UB"): + umi = read.get_tag("UB", "Z")[0] + elif len(read_name.split('.'))>=2: + umi = read_name.split('.')[-2][:-3] + + + + read_name = "^".join([cell_barcode, umi, read_name]) + + f.write("@" + read_name + '\n') + f.write(read_seq + '\n') + f.write("+" + '\n') + f.write(quals + '\n') + +def initialize_parser(): + parser = argparse.ArgumentParser(description='Rename cell types in cancer/non-cancer') + parser.add_argument('--bam', type=str, default=1, help='User input barcode file', required = True) + parser.add_argument('--fastq', type=str, default=1, help='Barcode file with redefined cancer/non-cancer celltypes', required = True) + return (parser) + + +def main(): + # 1. Arguments + parser = initialize_parser() + args = parser.parse_args() + + bam=args.bam + fastq=args.fastq + + # 2. Bam to fastq + bam_to_fastq(bam,fastq) + +#------------- +# Execute code +#------------- + +if __name__ == '__main__': + main() diff --git a/workflow/scripts/FusionCalling/FusionCalling.py b/workflow/scripts/FusionCalling/FusionCalling.py new file mode 100644 index 0000000..22cca26 --- /dev/null +++ b/workflow/scripts/FusionCalling/FusionCalling.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 + +import argparse +import pandas as pd + +def fusion_report(fusions,barcodes,min_ac_reads,min_ac_cells,max_MCF_noncancer,deltaMCF,outdir): + barcodes = pd.read_table(barcodes) + BC_cancer = barcodes[barcodes['Cell_type']=='Cancer']['Index'] + BC_noncancer = barcodes[barcodes['Cell_type']=='Non-Cancer']['Index'] + + df = pd.read_table(fusions) + df = df[df['SpliceType']=='ONLY_REF_SPLICE'] + df['List']=df['LR_accessions'].str.split(',') + + #Rename duplicates (different isoforms of the same gene-gene fusion) + df['#FusionName'] = rename_duplicates(list(df['#FusionName'])) + + # Exploding wide df into long df + df_long = df.explode('List') + df_long[['BC','UMI','ReadName']] = df_long['List'].str.split('^',expand=True) + df_long = df_long[['#FusionName','LeftGene','LeftBreakpoint','RightGene','RightBreakpoint','SpliceType','BC','UMI','ReadName']] + + # num BC per fusion + fusions=list(df['#FusionName']) + + n_bc_cancer={} + n_bc_noncancer={} + n_umi_cancer={} + n_umi_noncancer={} + for fusion in fusions: + df_fus = df_long[df_long['#FusionName']==fusion] + + df_fus_cancer = df_fus[df_fus['BC'].isin(BC_cancer)] + df_fus_noncancer = df_fus[df_fus['BC'].isin(BC_noncancer)] + + n_bc_cancer[fusion] = len(df_fus_cancer['BC'].unique()) + n_bc_noncancer[fusion] = len(df_fus_noncancer['BC'].unique()) + + n_umi_cancer[fusion] = len(df_fus_cancer['UMI'].unique()) + n_umi_noncancer[fusion] = len(df_fus_noncancer['UMI'].unique()) + + df['BC_Cancer'] = df['#FusionName'].map(n_bc_cancer) + df['BC_Non-Cancer'] = df['#FusionName'].map(n_bc_noncancer) + + df['UMI_Cancer'] = df['#FusionName'].map(n_umi_cancer) + df['UMI_Non-Cancer'] = df['#FusionName'].map(n_umi_noncancer) + + # Mutated cells fraction + df['MCF_Cancer'] = df['BC_Cancer']/len(BC_cancer) + df['MCF_Non-Cancer'] = df['BC_Non-Cancer']/len(BC_noncancer) + + df['Filter'] = df.apply(lambda x: filtering(x,min_ac_reads,min_ac_cells,max_MCF_noncancer,deltaMCF), axis=1) + + df = df[['#FusionName','Filter','UMI_Cancer','UMI_Non-Cancer','BC_Cancer','BC_Non-Cancer','MCF_Cancer','MCF_Non-Cancer','LeftGene','LeftLocalBreakpoint','LeftBreakpoint','RightGene','RightLocalBreakpoint','RightBreakpoint','SpliceType']] + + df.to_csv(outdir+'unfiltered.Fusions.tsv',sep='\t',index=False) + + df = df[df['Filter']=='PASS'] + + df.to_csv(outdir+'.Fusions.tsv',sep='\t',index=False) + + df_long = df_long[df_long['#FusionName'].isin(df['#FusionName'])] + df_long = df_long[['#FusionName','LeftGene','LeftBreakpoint','RightGene','RightBreakpoint','SpliceType','BC','UMI','ReadName']] + + df_long.to_csv(outdir+'.Fusions.SingleCellGenotype.tsv',sep='\t',index=False) + + +def filtering(x,min_ac_reads,min_ac_cells,max_MCF_noncancer,deltaMCF): + if x['UMI_Cancer']0: + if x['MCF_Cancer']-x['MCF_Non-Cancer']max_MCF_noncancer: + return 'High_Non-Cancer_MCF' + return 'PASS' + +def rename_duplicates(oldlist): + newlist = [] + for i, v in enumerate(oldlist): + totalcount = oldlist.count(v) + count = oldlist[:i].count(v) + newlist.append(v + str(count + 1) if totalcount > 1 else v) + return newlist + +def initialize_parser(): + parser = argparse.ArgumentParser(description='Report all fusion found in cancer cells and passing filters') + parser.add_argument('--fusions', type=str, default=1, help='Fusions tsv (from ctat-fusion)', required = True) + parser.add_argument('--barcodes', type=str, default=1, help='Barcodes to cell type (re)annotation tsv file', required = True) + parser.add_argument('--min_ac_reads', type=int, default=3, help='Minimum fusion reads in cancer reads', required = True) + parser.add_argument('--min_ac_cells', type=int, default=2, help='Minimum fusion reads in cancer cells', required = True) + parser.add_argument('--max_MCF_noncancer', type=float, default=0.1, help='Maximum mutated cancer non-cancer cell fraction', required = True) + parser.add_argument('--deltaMCF', type=float, default=0.3, help='Difference of mutated cell fraction between cancer and non-cancer cells', required = True) + parser.add_argument('--outdir', type=str, default=1, help='Report tsv', required = True) + return (parser) + +def main(): + # 1. Arguments + parser = initialize_parser() + args = parser.parse_args() + + fusions=args.fusions + barcodes=args.barcodes + min_ac_reads=args.min_ac_reads + min_ac_cells=args.min_ac_cells + max_MCF_noncancer=args.max_MCF_noncancer + deltaMCF=args.deltaMCF + outdir=args.outdir + + # 2. Bam to fasta + fusion_report(fusions,barcodes,min_ac_reads,min_ac_cells,max_MCF_noncancer,deltaMCF,outdir) + + +#------------- +# Execute code +#------------- + +if __name__ == '__main__': + main() diff --git a/tools/SComatic/scripts/BetaBinEstimation/BetaBinEstimation.py b/workflow/scripts/PoN/BetaBinEstimation.py similarity index 100% rename from tools/SComatic/scripts/BetaBinEstimation/BetaBinEstimation.py rename to workflow/scripts/PoN/BetaBinEstimation.py diff --git a/tools/SComatic/scripts/PoN/PoN.py b/workflow/scripts/PoN/PoN.py similarity index 100% rename from tools/SComatic/scripts/PoN/PoN.py rename to workflow/scripts/PoN/PoN.py diff --git a/workflow/scripts/PreProcessing/RenameCellTypes.py b/workflow/scripts/PreProcessing/RenameCellTypes.py new file mode 100644 index 0000000..02d52f7 --- /dev/null +++ b/workflow/scripts/PreProcessing/RenameCellTypes.py @@ -0,0 +1,39 @@ +import pandas as pd +import argparse + +def rename_cell_types(input,output,cancer_cell_type): + barcodes = pd.read_table(input) + barcodes['Input_cell_type'] = barcodes['Cell_type'] + barcodes['Cell_type'] = barcodes['Cell_type'].apply(lambda x: 'Cancer' if x==cancer_cell_type else 'Non-Cancer') + barcodes.to_csv(output,sep='\t',index=False) + + +def initialize_parser(): + parser = argparse.ArgumentParser(description='Rename cell types in cancer/non-cancer') + parser.add_argument('--input', type=str, default=1, help='User input barcode file', required = True) + parser.add_argument('--output', type=str, default=1, help='Barcode file with redefined cancer/non-cancer celltypes', required = True) + parser.add_argument('--cancer_cell_type', type=str, default = 'Sample', help='User defined cancer cell type', required = False) + return (parser) + + +def main(): + # 1. Arguments + parser = initialize_parser() + args = parser.parse_args() + + input=args.input + output=args.output + cancer_cell_type=args.cancer_cell_type + + # 2. Rename cell types + rename_cell_types(input,output,cancer_cell_type) + + +#------------- +# Execute code +#------------- + +if __name__ == '__main__': + main() + + diff --git a/tools/SComatic/scripts/SplitBam/SplitBamCellTypes.py b/workflow/scripts/PreProcessing/SplitBamCellTypes.py similarity index 100% rename from tools/SComatic/scripts/SplitBam/SplitBamCellTypes.py rename to workflow/scripts/PreProcessing/SplitBamCellTypes.py diff --git a/tools/SComatic/scripts/BaseCellCalling/BaseCellCalling.step1.py b/workflow/scripts/SNVCalling/BaseCellCalling.step1.py similarity index 98% rename from tools/SComatic/scripts/BaseCellCalling/BaseCellCalling.step1.py rename to workflow/scripts/SNVCalling/BaseCellCalling.step1.py index baf634c..6a626e3 100644 --- a/tools/SComatic/scripts/BaseCellCalling/BaseCellCalling.step1.py +++ b/workflow/scripts/SNVCalling/BaseCellCalling.step1.py @@ -56,7 +56,7 @@ def variant_calling_step1(file,alpha1,beta1,alpha2,beta2,min_ac_cells,min_ac_rea 'Bc':"##INFO=Bc,Description=Number of reads (base count) supporting the variants in the cell type with the mutation", 'Cc':"##INFO=Cc,Description=Number of distinct cells supporting the variant in the cell type with the mutation", 'VAF':"##INFO=VAF,Description=Variant allele frequency of variant in the cell type with the mutation", - 'CCF':"##INFO=CCF,Description=Cancer cell fraction (fraction of ditinct cells) supporting the alternative allele in the cell type with the mutation", + 'MCF':"##INFO=MCF,Description=Cancer cell fraction (fraction of ditinct cells) supporting the alternative allele in the cell type with the mutation", 'BCp':"##INFO=BCp,Description=Beta-binomial p-value for the variant allele (considering read counts)", 'CCp':"##INFO=CCp,Description=Beta-binomial p-value for the variant allele (considering cell counts)", 'Cell_types_min_BC':"##INFO=Cell_types_min_BC,Description=Number of cell types with a minimum number of reads covering a site", @@ -70,7 +70,7 @@ def variant_calling_step1(file,alpha1,beta1,alpha2,beta2,min_ac_cells,min_ac_rea outfile.write(DICT[info]+'\n') # Add calling columns - INFO = ['ALT', 'FILTER', 'Cell_types','Up_context', 'Down_context','N_ALT', 'Dp', 'Nc', 'Bc', 'Cc', 'VAF', 'CCF', 'BCp', 'CCp', 'Cell_types_min_BC', 'Cell_types_min_CC', "Rest_BC", "Rest_CC",'Fisher_p', 'Cell_type_Filter'] + INFO = ['ALT', 'FILTER', 'Cell_types','Up_context', 'Down_context','N_ALT', 'Dp', 'Nc', 'Bc', 'Cc', 'VAF', 'MCF', 'BCp', 'CCp', 'Cell_types_min_BC', 'Cell_types_min_CC', "Rest_BC", "Rest_CC",'Fisher_p', 'Cell_type_Filter'] CALLING_INFO = "\t".join(INFO) elements.insert(4,CALLING_INFO) outfile.write('\t'.join(elements)+'\n') @@ -131,7 +131,7 @@ def variant_calling_step1(file,alpha1,beta1,alpha2,beta2,min_ac_cells,min_ac_rea # Variant allele frequency (base on read counts) VAF = [] # Cancer cell fraction (VAF base on cell counts) - CCF = [] + MCF = [] # Number of cell types with min_BC Cell_types_min_BC = 0 # Number of cell types with min_CC @@ -246,8 +246,8 @@ def variant_calling_step1(file,alpha1,beta1,alpha2,beta2,min_ac_cells,min_ac_rea vaf = "|".join([str(round(Alt_bc_dict[x]/float(DP),4)) for x in Alt_candidates]) VAF.append(vaf) - ccf = "|".join([str(round(Alt_cc_dict[x]/float(NC),4)) for x in Alt_candidates]) - CCF.append(ccf) + mcf = "|".join([str(round(Alt_cc_dict[x]/float(NC),4)) for x in Alt_candidates]) + MCF.append(mcf) # Save information about not candidate cells b0 = sum([int(Alt_bc_dict[x]) for x in Alt_candidates]) @@ -376,8 +376,8 @@ def variant_calling_step1(file,alpha1,beta1,alpha2,beta2,min_ac_cells,min_ac_rea CCs = ",".join(CCs) # VAF VAF = ",".join(VAF) - # CCF - CCF = ",".join(CCF) + # MCF + MCF = ",".join(MCF) # Beta-bin p-value of the significant alleles (base counts) BCp = ",".join(BCp) # Beta-bin p-value of the significant alleles (cell counts) @@ -395,7 +395,7 @@ def variant_calling_step1(file,alpha1,beta1,alpha2,beta2,min_ac_cells,min_ac_rea Filter = ",".join(Filter) # Create out line with call performed - INFO = [Alts, FILTER, Cell_types, up_context, down_context, str(LEN_Alts), DPs, NCs, BCs, CCs, VAF, CCF, BCp, CCp, Cell_types_min_BC, Cell_types_min_CC, rest_BC, rest_CC, Fisher_p, Filter] + INFO = [Alts, FILTER, Cell_types, up_context, down_context, str(LEN_Alts), DPs, NCs, BCs, CCs, VAF, MCF, BCp, CCp, Cell_types_min_BC, Cell_types_min_CC, rest_BC, rest_CC, Fisher_p, Filter] CALLING_INFO = "\t".join(INFO) elements.insert(4,CALLING_INFO) outfile.write('\t'.join(elements)+'\n') @@ -419,7 +419,7 @@ def variant_calling_step1(file,alpha1,beta1,alpha2,beta2,min_ac_cells,min_ac_rea # Variant allele frequency (base on read counts) VAF = "." # Cancer cell fraction (VAF base on cell counts) - CCF = "." + MCF = "." # Fisher Fisher_p = '.' # Rest filter @@ -461,7 +461,7 @@ def variant_calling_step1(file,alpha1,beta1,alpha2,beta2,min_ac_cells,min_ac_rea Cell_types_min_CC = str(Cell_types_min_CC) # Create out line with call performed - INFO = [Alts, FILTER, Cell_types, up_context, down_context, str(LEN_Alts), DPs, NCs, BCs, CCs, VAF, CCF, BCp, CCp, Cell_types_min_BC, Cell_types_min_BC, rest_BC, rest_CC,Fisher_p, Filter] + INFO = [Alts, FILTER, Cell_types, up_context, down_context, str(LEN_Alts), DPs, NCs, BCs, CCs, VAF, MCF, BCp, CCp, Cell_types_min_BC, Cell_types_min_BC, rest_BC, rest_CC,Fisher_p, Filter] CALLING_INFO = "\t".join(INFO) elements.insert(4,CALLING_INFO) outfile.write('\t'.join(elements)+'\n') diff --git a/tools/SComatic/scripts/BaseCellCalling/BaseCellCalling.step2.py b/workflow/scripts/SNVCalling/BaseCellCalling.step2.py similarity index 98% rename from tools/SComatic/scripts/BaseCellCalling/BaseCellCalling.step2.py rename to workflow/scripts/SNVCalling/BaseCellCalling.step2.py index 3e6fb24..51de229 100644 --- a/tools/SComatic/scripts/BaseCellCalling/BaseCellCalling.step2.py +++ b/workflow/scripts/SNVCalling/BaseCellCalling.step2.py @@ -239,8 +239,8 @@ def initialize_parser(): parser.add_argument('--infile', type=str, help='Input file with all samples merged in a single tsv', required = True) parser.add_argument('--outfile', type=str, help='Out file prefix', required = True) parser.add_argument('--editing', type=str, help='RNA editing file to be used to remove RNA-diting sites', required = False) - parser.add_argument('--pon_SR', type=str, help='Short-read (SR) Panel of normals (PoN) file to be used to remove germline polymorphisms and recurrent artefacts', required = False) - parser.add_argument('--pon_LR', type=str, help='Long-read (LR) Panel of normals (PoN) file to be used to remove germline polymorphisms and recurrent artefacts', required = False) + parser.add_argument('--pon_SR', type=str, help='Short-read (SR) Panel of normals (PoN) file to be used to remove germline polymorphisms and recurrent artefacts', required = True) + parser.add_argument('--pon_LR', type=str, help='Long-read (LR) Panel of normals (PoN) file to be used to remove germline polymorphisms and recurrent artefacts', nargs='?', const='', required = False) parser.add_argument('--min_distance', type=int, default = 5, help='Minimum distance allowed between potential somatic variants [Default: 5]', required = False) parser.add_argument('--gnomAD_db', type=str, help='gnomAD v4 database file', required = False) parser.add_argument('--gnomAD_max', type=float, default = 0.01, help='Maximum gnomAD population VAF [default 0.01]', required = False) diff --git a/tools/SComatic/scripts/BaseCellCalling/BaseCellCalling.step3.py b/workflow/scripts/SNVCalling/BaseCellCalling.step3.py similarity index 57% rename from tools/SComatic/scripts/BaseCellCalling/BaseCellCalling.step3.py rename to workflow/scripts/SNVCalling/BaseCellCalling.step3.py index 273780e..6363220 100644 --- a/tools/SComatic/scripts/BaseCellCalling/BaseCellCalling.step3.py +++ b/workflow/scripts/SNVCalling/BaseCellCalling.step3.py @@ -5,7 +5,7 @@ import pandas as pd import numpy as np -def variant_calling_step3(file,out_prefix,deltaVAF,deltaCCF,cancer,chrM_conta,min_ac_reads,min_ac_cells,clust_dist): +def variant_calling_step3(file,out_prefix,deltaVAF,deltaMCF,chrM_conta,min_ac_reads,min_ac_cells,clust_dist): #--- # Command to focus only on high confidence cancer variants (HCCV) @@ -36,29 +36,28 @@ def variant_calling_step3(file,out_prefix,deltaVAF,deltaCCF,cancer,chrM_conta,mi input_df['INDEX'] = input_df['#CHROM'].astype(str) + ':' + input_df['Start'].astype(str) + ':' + input_df['ALT'].str.split(',', n=1, expand=True)[0] # Only keeping sites with alternative reads in cancer - input_df = input_df[input_df['Cell_types']!='NonCancer'] + input_df = input_df[input_df['Cell_types']!='Non-Cancer'] # Filtering multiallelic sites, keeping only one allele if it's 50x larger than the other # Otherwise deleting the line (it messes with filters later on) # This is important as SComatic will often detect low expr secondary alt allele in very high expr. (e.g. in chrM) - input_df[['ALT', 'FILTER', 'Cell_types', 'Bc', 'Cc', 'VAF', 'CCF','MultiAllelic_filter']] = input_df.apply(lambda x: MultiAllelic_filtering(x['ALT'], x['FILTER'], x['Cell_types'],x['Bc'], x['Cc'], x['VAF'], x['CCF']), axis=1, result_type="expand") + input_df[['ALT', 'FILTER', 'Cell_types', 'Bc', 'Cc', 'VAF', 'MCF','MultiAllelic_filter']] = input_df.apply(lambda x: MultiAllelic_filtering(x['REF'], x['ALT'], x['FILTER'], x['Cell_types'],x['Dp'], x['Nc'], x['Bc'], x['Cc'], x['VAF'], x['MCF'], x['Cancer'], x['Non-Cancer']), axis=1, result_type="expand") input_df = input_df[input_df['MultiAllelic_filter']=='KEEP'] input_df = input_df[output_column_names + ['INDEX']] #Special case for chrM due to contaminants: - if chrM_conta == 'True': - # Save chrM candidate SNVs to apply specific filters - chrm_df = input_df[input_df['#CHROM']=='chrM'].copy() - input_df = input_df[input_df['#CHROM']!='chrM'] - # Filtering homopolymeres, GnomAD/RNA editing/PON/clustered sites, - # multiallelic sites and sites with unsufficient # celltypes covered - chrm_df = chrm_df[~chrm_df['FILTER'].str.contains('Min|LR|gnomAD|LC|RNA', regex=True)] - if len(chrm_df)>0: - # Applying deltaVAF and deltaCCF filters - chrm_df['FILTER'] = chrm_df.apply(lambda x: - chrM_filtering(x['Cell_types'],x['Dp'],x['VAF'], x['CCF'],deltaVAF,deltaCCF,cancer), axis=1) - else: - chrM_conta = 'False' + + # Save chrM candidate SNVs to apply specific filters + chrm_df = input_df[input_df['#CHROM']=='chrM'].copy() + input_df = input_df[input_df['#CHROM']!='chrM'] + # Filtering homopolymeres, GnomAD/RNA editing/PON/clustered sites, + # multiallelic sites and sites with unsufficient # celltypes covered + chrm_df = chrm_df[~chrm_df['FILTER'].str.contains('Min|LR|gnomAD|LC|RNA', regex=True)] + if len(chrm_df)>0: + # Applying deltaVAF and deltaMCF filters + chrm_df['FILTER'] = chrm_df.apply(lambda x: + chrM_filtering(x['Cell_types'],x['Dp'],x['VAF'], x['MCF'],deltaVAF,deltaMCF), axis=1) + # Filter 1: Non-cancer coverage filter input_df = input_df[~input_df['FILTER'].str.contains('Min_cell_types')] @@ -72,20 +71,17 @@ def variant_calling_step3(file,out_prefix,deltaVAF,deltaCCF,cancer,chrM_conta,mi # Filter 3: Betabinomial filter in cancer cells input_df = input_df[~input_df['Cell_type_Filter'].str.contains(',Non-Significant|,Low-Significant', regex = True)] - input_df = input_df[~input_df['Cell_type_Filter'].isin(['Non-Significant','Non-Significant'])] + input_df = input_df[~input_df['Cell_type_Filter'].isin(['Non-Significant','Low-Significant'])] #Adding chrM SNVs detected above: - if chrM_conta == 'True': - unfiltered_df = pd.concat([input_df,chrm_df]) - unfiltered_df.to_csv(out_prefix+ '.calling.step3.unfiltered.tsv', sep='\t', index=False, mode='a') - else: - input_df.to_csv(out_prefix+ '.calling.step3.unfiltered.tsv', sep='\t', index=False, mode='a') + unfiltered_df = pd.concat([input_df,chrm_df]) + unfiltered_df.to_csv(out_prefix+ '.calling.step3.unfiltered.tsv', sep='\t', index=False, mode='a') # Filter 4: Noise filter: input_df = input_df[~input_df['FILTER'].str.contains('Noisy_site')] #multi allelic sites are filtered above # Filter 5: Homopolymer filter: - input_df = input_df[~input_df['FILTER'].str.contains('LC_Upstream|LC_Downstream', regex = True)] #multi allelic sites are filtered above + input_df = input_df[~input_df['FILTER'].str.contains('LC_Upstream|LC_Downstream', regex = True)] # Filter 6:RNA-Editing filter input_df = input_df[~input_df['FILTER'].str.contains('RNA_editing_db', regex = True)] @@ -105,8 +101,7 @@ def variant_calling_step3(file,out_prefix,deltaVAF,deltaCCF,cancer,chrM_conta,mi input_df = input_df[~input_df['FILTER'].str.contains('dist', regex = True)] #Adding chrM SNVs detected above: - if chrM_conta == 'True': - input_df = pd.concat([input_df,chrm_df]) + input_df = pd.concat([input_df,chrm_df]) #Filtering PASS SNVs (only left in chrM) filtered_df = input_df[input_df['FILTER'] =='PASS'] @@ -114,109 +109,125 @@ def variant_calling_step3(file,out_prefix,deltaVAF,deltaCCF,cancer,chrM_conta,mi # Write output filtered_df.to_csv(out_prefix+ '.calling.step3.tsv', sep='\t', index=False, mode='a') -def chrM_filtering(CTYPES,DP,VAF,CCF,deltaVAFmin,deltaCCFmin,cancer): +def chrM_filtering(CTYPES,DP,VAF,MCF,deltaVAFmin,deltaMCFmin): ctypes = CTYPES.split(',') - VAF=VAF - CCF=CCF - DP=DP if len(ctypes)>1: + if ctypes[0]=='Cancer': + i_Cancer = 0 + i_NonCancer = 1 + elif ctypes[1]=='Cancer': + i_Cancer = 1 + i_NonCancer = 0 DP1,DP2=DP.split(',') #Only look at higher depth loci if int(DP1)<100 or int(DP2)<100: return 'LowDepth' else: VAFs = VAF.split(',') - VAF1 = float(VAFs[0]) - VAF2 = float(VAFs[1]) - deltaVAF = VAF1-VAF2 - CCFs = CCF.split(',') - CCF1 = float(CCFs[0]) - CCF2 = float(CCFs[1]) - deltaCCF = CCF1-CCF2 - if ctypes[0]==cancer: - #If Non-cancer VAF is high, it's likely an heterozygous site - if deltaVAF < deltaVAFmin: - return 'LowDeltaVAF' - # HCCF are variants with high VAF in cancer and low VAF in non-cancer cells - elif deltaCCF < deltaCCFmin: - return 'LowDeltaCCF' - else: - return 'PASS' - #Same as above, but with inversed cell types. + MCFs = MCF.split(',') + VAFCancer = float(VAFs[i_Cancer]) + VAFNonCancer = float(VAFs[i_NonCancer]) + deltaVAF = VAFCancer-VAFNonCancer + + MCFCancer = float(MCFs[i_Cancer]) + MCFNonCancer = float(MCFs[i_NonCancer]) + deltaMCF = MCFCancer-MCFNonCancer + + # mtSNVs are variants with high VAF in cancer and low VAF in non-cancer cells + if deltaVAF < deltaVAFmin: + return 'LowDeltaVAF' + + elif deltaMCF < deltaMCFmin: + return 'LowDeltaMCF' else: - if -deltaVAF < deltaVAFmin: - return 'LowDeltaVAF' - elif -deltaCCF < deltaCCFmin: - return 'LowDeltaCCF' - else: - return 'PASS' - else: - if ctypes[0]!=cancer: - return 'NonCancer' - elif int(DP)<50: + return 'PASS' + + elif len(ctypes)==1: + if ctypes[0]!="Cancer": + return 'Non-Cancer' + elif int(DP)<100: return 'LowDepth' - elif float(VAF)<0.1: + elif float(VAF)<0.05: return 'LowVAF' - elif float(CCF)<0.1: - return 'LowCCF' + elif float(MCF)<0.05: + return 'LowMCF' else: return 'PASS' -def MultiAllelic_filtering(ALT, FILTER, CTYPES, BC, CC, VAF, CCF): +def MultiAllelic_filtering(REF, ALT, FILTER, CTYPES, DP, NC, BC, CC, VAF, MCF, CancerInfo, NonCancerInfo): + ref_dict = {'A':0, 'C':1, 'T':2, 'G':3} + i_ref = ref_dict[REF] + if 'Multi-allelic' in FILTER or '|' in ALT: + ctypes = CTYPES.split(',') + if len(ctypes)>1: + if ctypes[0]=='Cancer': + i_Cancer = 0 + i_NonCancer = 1 + elif ctypes[1]=='Cancer': + i_Cancer = 1 + i_NonCancer = 0 + + BCS = CancerInfo.split('|')[3].split(':')[:4] + BCS = [int(i) for i in BCS] + BCS[i_ref] = 0 # setting REF to 0 as we only consider ALT + MAX = max(BCS) + index = np.argmax(BCS) + BCS[index] = 0 # removing max to select next "max" + MAX2 = max(BCS) + if not(MAX2/MAX<0.05): # one alt 20x larger than the other) + return ALT, FILTER, CTYPES, BC, CC, VAF, MCF,'DELETE' + + ALT_Cancer = 'ACTG'[index] + BC_Cancer = int(CancerInfo.split('|')[3].split(':')[index]) + CC_Cancer = int(CancerInfo.split('|')[2].split(':')[index]) + VAF_Cancer = round(BC_Cancer/int(DP.split(',')[i_Cancer]),4) + MCF_Cancer = round(CC_Cancer/int(NC.split(',')[i_Cancer]),4) + + ALT_NonCancer = 'ACTG'[index] + BC_NonCancer = int(NonCancerInfo.split('|')[3].split(':')[index]) + CC_NonCancer = int(NonCancerInfo.split('|')[2].split(':')[index]) + VAF_NonCancer = round(BC_NonCancer/int(DP.split(',')[i_NonCancer]),4) + MCF_NonCancer = round(CC_NonCancer/int(NC.split(',')[i_NonCancer]),4) - if len(ALT.split('|'))>1: - if len(CTYPES.split(','))>1: - if len(BC.split(',')[1].split('|'))>1: - BCS = [int(i) for i in BC.split(',')[1].split('|')] + ALT = ','.join([ALT_NonCancer,ALT_Cancer]) + BC = ','.join([str(BC_NonCancer),str(BC_Cancer)]) + CC = ','.join([str(CC_NonCancer),str(CC_Cancer)]) + VAF = ','.join([str(VAF_NonCancer),str(VAF_Cancer)]) + MCF = ','.join([str(MCF_NonCancer),str(MCF_Cancer)]) + + FILTER = FILTER.replace('Multi-allelic,','') + FILTER = FILTER.replace(',Multi-allelic','') + FILTER = FILTER.replace('Multi-allelic','') + + return ALT, FILTER, CTYPES, BC, CC, VAF, MCF,'KEEP' + + elif len(ctypes)==1: + if ctypes[0]=='Cancer': + BCS = CancerInfo.split('|')[3].split(':')[:4] + BCS = [int(i) for i in BCS] + BCS[i_ref] = 0 # setting REF to 0 as we only consider ALT MAX = max(BCS) index = np.argmax(BCS) BCS[index] = 0 # removing max to select next "max" MAX2 = max(BCS) - if not(MAX2/MAX<0.02): # one alt 50x larger than the other) - return ALT, FILTER, CTYPES, BC, CC, VAF, CCF,'DELETE' - else: - return ALT, FILTER, CTYPES, BC, CC, VAF, CCF,'DELETE' - - ALT_Cancer = ALT.split(',')[1].split('|')[index] - BC_Cancer = BC.split(',')[1].split('|')[index] - CC_Cancer = CC.split(',')[1].split('|')[index] - VAF_Cancer = VAF.split(',')[1].split('|')[index] - CCF_Cancer = CCF.split(',')[1].split('|')[index] - try: - ALT_NonCancer = ALT.split(',')[0].split('|')[index] - BC_NonCancer = BC.split(',')[0].split('|')[index] - CC_NonCancer = CC.split(',')[0].split('|')[index] - VAF_NonCancer = VAF.split(',')[0].split('|')[index] - CCF_NonCancer = CCF.split(',')[0].split('|')[index] - except: - ALT_NonCancer = ALT.split(',')[0].split('|')[0] - BC_NonCancer = BC.split(',')[0].split('|')[0] - CC_NonCancer = CC.split(',')[0].split('|')[0] - VAF_NonCancer = VAF.split(',')[0].split('|')[0] - CCF_NonCancer = CCF.split(',')[0].split('|')[0] - ALT = ','.join([ALT_NonCancer,ALT_Cancer]) - BC = ','.join([BC_NonCancer,BC_Cancer]) - CC = ','.join([CC_NonCancer,CC_Cancer]) - VAF = ','.join([VAF_NonCancer,VAF_Cancer]) - CCF = ','.join([CCF_NonCancer,CCF_Cancer]) - return ALT, FILTER, CTYPES, BC, CC, VAF, CCF,'KEEP' - else: - BCS = [int(i) for i in BC.split('|')] - if min(BCS)/max(BCS)<0.02: # one alt 50x larger than the other) - index = np.argmax(BCS) + if not(MAX2/MAX<0.05): # one alt 20x larger than the other) + return ALT, FILTER, CTYPES, BC, CC, VAF, MCF,'DELETE' + + ALT = 'ACTG'[index] + BC = int(CancerInfo.split('|')[3].split(':')[index]) + CC = int(CancerInfo.split('|')[2].split(':')[index]) + VAF = round(BC/int(DP),4) + MCF= round(CC/int(NC),4) + + FILTER = FILTER.replace('Multi-allelic,','') + FILTER = FILTER.replace(',Multi-allelic','') + FILTER = FILTER.replace('Multi-allelic','') + + return ALT, FILTER, CTYPES, BC, CC, VAF, MCF,'KEEP' else: - return ALT, FILTER, CTYPES, BC, CC, VAF, CCF,'DELETE' - - ALT = ALT.split('|')[index] - BC = BC.split('|')[index] - CC = CC.split('|')[index] - VAF = VAF.split('|')[index] - CCF = CCF.split('|')[index] - return ALT, FILTER, CTYPES, BC, CC, VAF, CCF,'KEEP' + return ALT, FILTER, CTYPES, BC, CC, VAF, MCF,'DELETE' else: - if 'Multi-allelic' in FILTER: #Dissonant alleles - return ALT, FILTER, CTYPES, BC, CC, VAF, CCF,'DELETE' - return ALT, FILTER, CTYPES, BC, CC, VAF, CCF,'KEEP' + return ALT, FILTER, CTYPES, BC, CC, VAF, MCF,'KEEP' def tag_clustered_SNVs(df, clust_dist): df2 = df[df['FILTER']=='PASS'] @@ -258,8 +269,7 @@ def initialize_parser(): parser.add_argument('--infile', type=str, help='Input file with all samples merged in a single tsv', required = True) parser.add_argument('--outfile', type=str, help='Out file prefix', required = True) parser.add_argument('--deltaVAF', type=float, default = 0.3, help='Delta VAF between cancer and non-cancer cells', required = True) - parser.add_argument('--deltaCCF', type=float, default = 0.3, help='Delta CCF (cancer cell fraction) between cancer and non-cancer cells', required = True) - parser.add_argument('--cancer_ctype', type=str, default = '', help='Name of cancer cell type in meta file', required = False) + parser.add_argument('--deltaMCF', type=float, default = 0.3, help='Delta MCF (cancer cell fraction) between cancer and non-cancer cells', required = True) parser.add_argument('--chrM_contaminant', type=str, default = 'True', help='Use this option if chrM contaminants are observed in non-cancer cells', required = False) parser.add_argument('--min_ac_reads', type=int, default = 2, help='Minimum ALT reads', required = False) parser.add_argument('--min_ac_cells', type=int, default = 3, help='Minimum mutated cells', required = False) @@ -275,8 +285,7 @@ def main(): infile = args.infile out_prefix = args.outfile deltaVAF = args.deltaVAF - deltaCCF = args.deltaCCF - cancer_ctype = args.cancer_ctype + deltaMCF = args.deltaMCF chrM_conta = args.chrM_contaminant min_ac_reads = args.min_ac_reads min_ac_cells = args.min_ac_cells @@ -284,7 +293,7 @@ def main(): # 1.1: Step 3: Filter only PASS mutations, not within clust_dist of each other, and apply specific chrM filters print ('\n- Variant calling step 3\n') - variant_calling_step3(infile,out_prefix,deltaVAF,deltaCCF,cancer_ctype,chrM_conta,min_ac_reads,min_ac_cells,clust_dist) + variant_calling_step3(infile,out_prefix,deltaVAF,deltaMCF,chrM_conta,min_ac_reads,min_ac_cells,clust_dist) #------------------------- # Running scRNA somatic variant calling diff --git a/tools/SComatic/scripts/BaseCellCounter/BaseCellCounter.py b/workflow/scripts/SNVCalling/BaseCellCounter.py similarity index 100% rename from tools/SComatic/scripts/BaseCellCounter/BaseCellCounter.py rename to workflow/scripts/SNVCalling/BaseCellCounter.py diff --git a/tools/SComatic/scripts/MergeCounts/MergeBaseCellCounts.py b/workflow/scripts/SNVCalling/MergeBaseCellCounts.py similarity index 98% rename from tools/SComatic/scripts/MergeCounts/MergeBaseCellCounts.py rename to workflow/scripts/SNVCalling/MergeBaseCellCounts.py index d387875..6b7acbb 100644 --- a/tools/SComatic/scripts/MergeCounts/MergeBaseCellCounts.py +++ b/workflow/scripts/SNVCalling/MergeBaseCellCounts.py @@ -196,10 +196,10 @@ def merge_cell_types_files(infiles,outfile): else: #print lines if should_we_go(lines): - cur_chr = update_chr(chrs) #routine to update the current chr, should check that's the same for all - cur_pos = 0 + cur_chr = update_chr(chrs) #routine to update the current chr, should check that's the same for all + cur_pos = 0 - pass + pass outfile.close()