From 338bf229439823e9c1772f82ccfde32e0011edad Mon Sep 17 00:00:00 2001 From: Uwe Schmitt Date: Tue, 26 Oct 2021 16:06:12 +0200 Subject: [PATCH 01/33] [FEATURE] nested attribute access for config dictionary in vpipe --- workflow/rules/common.smk | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 29956ee2a..0bdf1a7f9 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -245,15 +245,19 @@ def process_config(config): section[entry] = value.format(VPIPE_BASEDIR=VPIPE_BASEDIR) """ - The following hack supports `.` access to dictionary entries, e.g. - config.input instead of config["input"] so we don't have to change all - existing rules. - This works only on the upper-level of config and not for the nested - dictionaries. + The following hack supports `.` access to recursive dictionary entries, + e.g. config.input instead of config["input"] so we don't have to change + all existing rules. """ - config = UserDict(config) - config.__dict__.update(config) - return config + def wrap(dd): + if isinstance(dd, dict): + dd = UserDict(dd) + for k, v in dd.items(): + dd[k] = wrap(v) + dd.__dict__.update(dd) + return dd + + return wrap(config) config = process_config(config) From 31673770da7ecb68373c49f65650ca841410c094 Mon Sep 17 00:00:00 2001 From: Uwe Schmitt Date: Tue, 26 Oct 2021 16:09:27 +0200 Subject: [PATCH 02/33] first attempt to write summary.json --- workflow/Snakefile | 2 ++ workflow/envs/report_sequences.yaml | 4 +++ workflow/rules/common.smk | 4 +++ workflow/rules/consensus.smk | 2 ++ workflow/rules/publish.smk | 12 +++++++++ workflow/schemas/config_schema.json | 10 ++++++++ workflow/scripts/report_sequences.py | 37 ++++++++++++++++++++++++++++ 7 files changed, 71 insertions(+) create mode 100644 workflow/envs/report_sequences.yaml create mode 100644 workflow/rules/publish.smk create mode 100644 workflow/scripts/report_sequences.py diff --git a/workflow/Snakefile b/workflow/Snakefile index f44861368..2655cd1b0 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -19,6 +19,7 @@ functions = srcdir("scripts/functions.sh") rule all: input: all_files, + "summary.json" rule alltrimmed: @@ -37,6 +38,7 @@ include: "rules/align.smk" include: "rules/consensus.smk" include: "rules/mafs.smk" include: "rules/stats.smk" +include: "rules/publish.smk" if config.output["snv"] or config.output["local"]: diff --git a/workflow/envs/report_sequences.yaml b/workflow/envs/report_sequences.yaml new file mode 100644 index 000000000..f389154ef --- /dev/null +++ b/workflow/envs/report_sequences.yaml @@ -0,0 +1,4 @@ +channels: + - conda-forge +dependencies: + - biopython diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 0bdf1a7f9..758035382 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -6,9 +6,12 @@ __email__ = "v-pipe@bsse.ethz.ch" import csv import os +import pathlib import re import typing +from datetime import datetime, timezone + # for legacy import configparser import json # NOTE jsonschema is only part of the full snakemake, not snakemake-minimal @@ -374,6 +377,7 @@ for p in patient_list: references.append(os.path.join(sdir, "references/ref_")) consensus.append(os.path.join(sdir, "references/ref_ambig.fasta")) + consensus.append(os.path.join(sdir, "references/ref_ambig_dels.fasta")) consensus.append(os.path.join(sdir, "references/ref_majority.fasta")) consensus.append(os.path.join(sdir, "references/consensus.bcftools.fasta")) diff --git 
a/workflow/rules/consensus.smk b/workflow/rules/consensus.smk index bdacd0c6e..c22e264e7 100644 --- a/workflow/rules/consensus.smk +++ b/workflow/rules/consensus.smk @@ -132,6 +132,8 @@ rule consensus_sequences: """ + + # QA checks performed on consensus_sequences # - do pairwise alignement rule consseq_QA: diff --git a/workflow/rules/publish.smk b/workflow/rules/publish.smk new file mode 100644 index 000000000..da55005c3 --- /dev/null +++ b/workflow/rules/publish.smk @@ -0,0 +1,12 @@ +rule write_summary_json: + input: + lambda wildcard: [f for f in all_files if f.endswith("ref_ambig_dels.fasta")] + output: + "summary.json" + conda: + config.report_sequences["conda"] + shell: + # needed {CONDA_PREFIX} on my dev setup on Mac + pyenv: + """ + ${{CONDA_PREFIX}}/bin/python {VPIPE_BASEDIR}/scripts/report_sequences.py {input} + """ diff --git a/workflow/schemas/config_schema.json b/workflow/schemas/config_schema.json index 95bfb5da0..3862db80c 100644 --- a/workflow/schemas/config_schema.json +++ b/workflow/schemas/config_schema.json @@ -1228,6 +1228,16 @@ }, "default": {}, "type": "object" + }, + "report_sequences": { + "properties": { + "conda": { + "type": "string", + "default": "{VPIPE_BASEDIR}/envs/report_sequences.yaml" + } + }, + "default": {}, + "type": "object" } } } diff --git a/workflow/scripts/report_sequences.py b/workflow/scripts/report_sequences.py new file mode 100644 index 000000000..76c0b7f99 --- /dev/null +++ b/workflow/scripts/report_sequences.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python + +import hashlib +import pathlib +import json +import sys +from datetime import datetime, timezone + +from Bio import SeqIO +from Bio.SeqUtils.CheckSum import seguid, crc64 + +LOCAL_TIMEZONE = datetime.now(timezone.utc).astimezone().tzinfo + +results = [] + +for f in sys.argv[1:]: + f = pathlib.Path(f) + _, sample, batch, *_ = f.parts + + timestamp = datetime.fromtimestamp(f.stat().st_ctime, tz=LOCAL_TIMEZONE) + sequences = [] + for record in SeqIO.parse(f, "fasta"): + sequence = record.seq + sequences.append( + dict( + header=record.id, + seguid=seguid(sequence), + crc64=crc64(sequence), + sequence=str(sequence), + ) + ) + results.append( + dict(sample=sample, batch=batch, created=str(timestamp), sequences=sequences) + ) + +with open("summary.json", "w") as fh: + json.dump(results, fh, indent=4) From b7e04d34e265482ad8ff6235565ea8f5b39f90cb Mon Sep 17 00:00:00 2001 From: Uwe Schmitt Date: Tue, 26 Oct 2021 18:09:45 +0200 Subject: [PATCH 03/33] ran smakefmt --- workflow/Snakefile | 2 +- workflow/rules/common.smk | 1 + workflow/rules/consensus.smk | 2 -- workflow/rules/publish.smk | 4 ++-- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 2655cd1b0..b76faa4ca 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -19,7 +19,7 @@ functions = srcdir("scripts/functions.sh") rule all: input: all_files, - "summary.json" + "summary.json", rule alltrimmed: diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 758035382..39b1e67c3 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -252,6 +252,7 @@ def process_config(config): e.g. config.input instead of config["input"] so we don't have to change all existing rules. 
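    (With this wrapper, nested entries are also reachable by attribute, e.g. config.output.snv as well as config.output["snv"].)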
""" + def wrap(dd): if isinstance(dd, dict): dd = UserDict(dd) diff --git a/workflow/rules/consensus.smk b/workflow/rules/consensus.smk index c22e264e7..bdacd0c6e 100644 --- a/workflow/rules/consensus.smk +++ b/workflow/rules/consensus.smk @@ -132,8 +132,6 @@ rule consensus_sequences: """ - - # QA checks performed on consensus_sequences # - do pairwise alignement rule consseq_QA: diff --git a/workflow/rules/publish.smk b/workflow/rules/publish.smk index da55005c3..2d3d31c83 100644 --- a/workflow/rules/publish.smk +++ b/workflow/rules/publish.smk @@ -1,8 +1,8 @@ rule write_summary_json: input: - lambda wildcard: [f for f in all_files if f.endswith("ref_ambig_dels.fasta")] + lambda wildcard: [f for f in all_files if f.endswith("ref_ambig_dels.fasta")], output: - "summary.json" + "summary.json", conda: config.report_sequences["conda"] shell: From d12c34f3db94584d7d779b83bfce4e0fe0e8513e Mon Sep 17 00:00:00 2001 From: Uwe Schmitt Date: Wed, 10 Nov 2021 17:35:26 +0100 Subject: [PATCH 04/33] first attempt --- workflow/Snakefile | 2 +- workflow/envs/dehuman.yaml | 7 ++ workflow/rules/common.smk | 9 +- workflow/rules/consensus.smk | 1 + workflow/rules/publish.smk | 151 +++++++++++++++++++++++++- workflow/schemas/config_schema.json | 24 ++++ workflow/scripts/remove_reads_list.pl | 29 +++++ workflow/scripts/report_sequences.py | 32 +++++- 8 files changed, 247 insertions(+), 8 deletions(-) create mode 100644 workflow/envs/dehuman.yaml create mode 100755 workflow/scripts/remove_reads_list.pl diff --git a/workflow/Snakefile b/workflow/Snakefile index b76faa4ca..3977af50c 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -19,7 +19,7 @@ functions = srcdir("scripts/functions.sh") rule all: input: all_files, - "summary.json", + "summary.zip", rule alltrimmed: diff --git a/workflow/envs/dehuman.yaml b/workflow/envs/dehuman.yaml new file mode 100644 index 000000000..67e1e87a0 --- /dev/null +++ b/workflow/envs/dehuman.yaml @@ -0,0 +1,7 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bwa=0.7.17 + - samtools>1.11 diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 39b1e67c3..07631fb5b 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -365,6 +365,8 @@ visualizations = [] diversity_measures = [] datasets = [] IDs = [] +dehumanized_raw_reads = [] + for p in patient_list: # WARNING the following makes sure to gracefully handle trailing slashes in the user-provided paths in datadir sdir = os.path.join(config.output["datadir"], p.patient_id, p.date) @@ -436,8 +438,12 @@ for p in patient_list: visualizations.append(os.path.join(sdir, "visualization/snv_calling.html")) visualizations.append(os.path.join(sdir, "visualization/alignment.html")) + if config.output["dehumanized_raw_reads"]: + dehumanized_raw_reads.append(os.path.join(sdir, "raw_data", "dehuman.cram")) + # merge lists containing expected output - all_files = alignments + consensus + results + visualizations + all_files = (alignments + consensus + results + visualizations + + dehumanized_raw_reads) # diversity measures if not config.output["snv"] and config.output["diversity"]: @@ -528,6 +534,7 @@ def rebase_datadir(base, dataset): def construct_input_fastq(wildcards): + indir = os.path.join( rebase_datadir(config.input["datadir"], wildcards.dataset), "raw_data" ) diff --git a/workflow/rules/consensus.smk b/workflow/rules/consensus.smk index bdacd0c6e..bc23d70e4 100644 --- a/workflow/rules/consensus.smk +++ b/workflow/rules/consensus.smk @@ -101,6 +101,7 @@ rule 
consensus_sequences: REF=reference_file, output: REF_amb="{dataset}/references/ref_ambig.fasta", + REF_amb_dels="{dataset}/references/ref_ambig_dels.fasta", REF_majority="{dataset}/references/ref_majority.fasta", REF_majority_dels="{dataset}/references/ref_majority_dels.fasta", params: diff --git a/workflow/rules/publish.smk b/workflow/rules/publish.smk index 2d3d31c83..be42d58ea 100644 --- a/workflow/rules/publish.smk +++ b/workflow/rules/publish.smk @@ -1,12 +1,157 @@ rule write_summary_json: input: - lambda wildcard: [f for f in all_files if f.endswith("ref_ambig_dels.fasta")], + consensus = lambda wildcard: [f for f in all_files if f.endswith("ref_ambig_dels.fasta")], + dehumanized = lambda wildcard: [f for f in all_files if f.endswith("dehuman.cram")], + #consensus=expand("{dataset}/references/ref_ambig_dels.fasta", dataset=datasets), + #raw_reads=expand("{dataset}/raw_data/dehuman.cram", dataset=datasets), output: - "summary.json", + "summary.zip", conda: config.report_sequences["conda"] shell: # needed {CONDA_PREFIX} on my dev setup on Mac + pyenv: """ - ${{CONDA_PREFIX}}/bin/python {VPIPE_BASEDIR}/scripts/report_sequences.py {input} + set -x + ${{CONDA_PREFIX}}/bin/python {VPIPE_BASEDIR}/scripts/report_sequences.py {input.consensus} + """ + + + +rule dehuman: + input: + global_ref=reference_file, + # R1=expand("{dataset}/preprocessed_data/R1.fastq.gz", dataset=datasets), + # R2=expand("{dataset}/preprocessed_data/R2.fastq.gz", dataset=datasets), + R1="{dataset}/preprocessed_data/R1.fastq.gz", + R2="{dataset}/preprocessed_data/R2.fastq.gz", + output: + final_cram="{dataset}/raw_data/dehuman.cram", + threads: config.dehuman["threads"] + params: + BWA=config.applications["bwa"], + SAMTOOLS=config.applications["samtools"], + HUMAN_GENOME=config.dehuman["ref_human"], + remove_reads=cachepath( + "../scripts/remove_reads_list.pl", executable=True, localsource=True + ), + ref_aln="{dataset}/ref_aln.sam", + filter="{dataset}/dehuman.filter", + h38_aln="{dataset}/h38_aln.sam", + h38_aln_cram="{dataset}/h38_aln.cram", + stats="{dataset}/raw_data/dehuman.stats", + # set to 1 to trigger matches with human genome (used for testing): + F=2, + reject_1="{dataset}/reject_R1.fastq.gz", + reject_2="{dataset}/reject_R2.fastq.gz", + filtered_1="{dataset}/raw_data/filtered_1.fasta.gz", + filtered_2="{dataset}/raw_data/filtered_2.fasta.gz", + cram_sam="{dataset}/raw_data/cram.sam", + conda: + config.dehuman["conda"] + shell: + """ + + echo Filter out SARS-CoV-2 reads ------------------------------------- + echo + + {params.BWA} mem -t {threads} \ + -o {params.ref_aln} \ + {input.global_ref} {input.R1} {input.R2} + + echo + echo Keep reject ----------------------------------------------------- + + {params.SAMTOOLS} bam2fq -@ {threads} \ + -F 2 \ + -1 {params.reject_1} \ + -2 {params.reject_2} \ + {params.ref_aln} + + echo + echo Checking rejects against Homo Sapiens --------------------------- + + {params.BWA} mem -t {threads} \ + -o {params.h38_aln}\ + {params.HUMAN_GENOME} {params.reject_1} {params.reject_2} + + echo + echo Count aligned reads --------------------------------------------- + + count=$({params.SAMTOOLS} view -@ {threads} -c -f {params.F} {params.h38_aln}) + + if (( count > 0 )); then + echo ----------------------------------------------------------------- + echo "Needs special care: $count potential human reads found" + echo ----------------------------------------------------------------- + echo + echo Removing identified human reads from raw reads ------------------ + echo + + # get 
list + {params.SAMTOOLS} view -@ {threads} \ + -f {params.F} \ + {params.h38_aln} \ + | cut -f 1 > {params.filter} + + zcat < {wildcards.dataset}/raw_data/*_R1.fastq.gz \ + | {params.remove_reads} {params.filter} \ + | gzip \ + > {params.filtered_1} & + + zcat < {wildcards.dataset}/raw_data/*_R2.fastq.gz \ + | {params.remove_reads} {params.filter} \ + | gzip \ + > {params.filtered_2} & + + wait + + # keep the rejects for further analysis + echo + echo Keeping Human-aligned SARS-CoV-2 rejects ------------------------- + + # (we compress reference-less, because the reference size is larger + # than the contaminant reads) + FMT=cram,no_ref,use_bzip2,use_lzma,level=9,seqs_per_slice=1000000 + {params.SAMTOOLS} sort -@ {threads} \ + -M \ + --output-fmt ${{FMT}} \ + -o {params.h38_aln_cram} \ + {params.h38_aln} + + echo + echo Compressing human-depleted raw reads ----------------------------- + + {params.SAMTOOLS} index -@ {threads} {params.h38_aln_cram} + {params.SAMTOOLS} stats -@ {threads} {params.h38_aln_cram} > {params.stats} + + else + echo + echo No potential human reads found ----------------------------------- + echo Copy raw reads file + echo + cp {wildcards.dataset}/raw_data/*_R1.fastq.gz {params.filtered_1} & + cp {wildcards.dataset}/raw_data/*_R2.fastq.gz {params.filtered_2} & + wait + fi + + echo + echo Compress filtered sequences -------------------------------------- + + {params.BWA} mem -t {threads} \ + -C \ + -o {params.cram_sam} \ + {input.global_ref} {params.filtered_1} {params.filtered_2} + + REGEXP=\'s{{(?<=\\t)([[:digit:]]:[[:upper:]]:[[:digit:]]:[ATCGN]+(\+[ATCGN]+))$}}{{BC:Z:\\1}}\' + FMT=cram,embed_ref,use_bzip2,use_lzma,level=9,seqs_per_slice=1000000 + + perl -p -e ${{REGEXP}} {params.cram_sam} \ + | {params.SAMTOOLS} sort -@ {threads} \ + -M \ + --reference {input.global_ref} \ + --output-fmt ${{FMT}} \ + -o {output.final_cram} + echo + echo DONE ------------------------------------------------------------- + echo """ diff --git a/workflow/schemas/config_schema.json b/workflow/schemas/config_schema.json index 3862db80c..b95ea9cc6 100644 --- a/workflow/schemas/config_schema.json +++ b/workflow/schemas/config_schema.json @@ -174,6 +174,12 @@ "default": false, "description": "This option turns on the computation of diversity measures in each sample.", "examples": [true] + }, + "dehumanized_raw_reads": { + "type": "boolean", + "default": false, + "description": "This option turns on dehumanization of the raw reads", + "examples": [true] } }, "default": {}, @@ -1238,6 +1244,24 @@ }, "default": {}, "type": "object" + }, + "dehuman": { + "properties": { + "conda": { + "type": "string", + "default": "{VPIPE_BASEDIR}/envs/dehuman.yaml" + }, + "threads": { + "type": "integer", + "default": 16 + }, + "ref_human": { + "type": "string", + "default": "references/human.fa" + } + }, + "default": {}, + "type": "object" } } } diff --git a/workflow/scripts/remove_reads_list.pl b/workflow/scripts/remove_reads_list.pl new file mode 100755 index 000000000..fa7c71b6e --- /dev/null +++ b/workflow/scripts/remove_reads_list.pl @@ -0,0 +1,29 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + +# reads sequence ids from file from first command line argument +# then iterates over stdin and removes sequences for the given ids +# the sequences from stdin are organized in groups of three lines, the first line +# contains the sequence id + +my %h; # hash with 'headers' to filter-out as keys + +{ + my $listfile = shift @ARGV; + open my $fh, '<', $listfile + or die "cannot open ${listfile}"; + + # 
build a hash out of all (unique) seq ID + %h = map { chomp; ('@' . $_) => 1 } <$fh>; + + close $fh; +} +print STDERR "filtering out: ", scalar keys %h, " seq id\n"; + +while (<>) { # loop through headers... + my @s = map { scalar <> } 1 .. 3; # ...fetch the remaining 3 lines of the read: bp sequence, separator, phred quality scores + print $_, @s + unless(exists $h{(split)[0]}); # consider header's first part ('@' + seq ID), ignore the second half ("comments", e.g.: Illumina indexes) +} diff --git a/workflow/scripts/report_sequences.py index 76c0b7f99..f4675208d 100644 --- a/workflow/scripts/report_sequences.py +++ b/workflow/scripts/report_sequences.py @@ -1,17 +1,21 @@ #!/usr/bin/env python import hashlib -import pathlib import json +import pathlib +import shutil import sys +import tempfile +import zipfile from datetime import datetime, timezone from Bio import SeqIO -from Bio.SeqUtils.CheckSum import seguid, crc64 +from Bio.SeqUtils.CheckSum import crc64, seguid LOCAL_TIMEZONE = datetime.now(timezone.utc).astimezone().tzinfo results = [] +raw_data_files = [] for f in sys.argv[1:]: f = pathlib.Path(f) @@ -19,6 +23,14 @@ timestamp = datetime.fromtimestamp(f.stat().st_ctime, tz=LOCAL_TIMEZONE) sequences = [] + + dehumanized_raw_reads = f.parent.parent / "raw_data" / "dehuman.cram" + if dehumanized_raw_reads.exists(): + raw_data_files.append(dehumanized_raw_reads) + raw_data = str(dehumanized_raw_reads) + else: + raw_data = None + for record in SeqIO.parse(f, "fasta"): sequence = record.seq sequences.append( @@ -27,11 +39,25 @@ seguid=seguid(sequence), crc64=crc64(sequence), sequence=str(sequence), + raw_data=raw_data, ) ) results.append( dict(sample=sample, batch=batch, created=str(timestamp), sequences=sequences) ) -with open("summary.json", "w") as fh: + +folder = pathlib.Path(tempfile.mkdtemp()) + +with open(folder / "summary.json", "w") as fh: json.dump(results, fh, indent=4) + +for p in raw_data_files: + target = folder / p + target.parent.mkdir(parents=True) + print(p, target) + shutil.copy(p, target) + +with zipfile.ZipFile("summary.zip", "w", zipfile.ZIP_DEFLATED) as zip_file: + for entry in folder.rglob("*"): + zip_file.write(entry, entry.relative_to(folder)) From dc4d4f4d23859c9733a3563c57f7c32e28c16cfc Mon Sep 17 00:00:00 2001 From: Uwe Schmitt Date: Wed, 10 Nov 2021 17:50:14 +0100 Subject: [PATCH 05/33] create index of human genome if needed --- workflow/rules/publish.smk | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/workflow/rules/publish.smk index be42d58ea..db21330df 100644 --- a/workflow/rules/publish.smk +++ b/workflow/rules/publish.smk @@ -15,8 +15,6 @@ rule write_summary_json: ${{CONDA_PREFIX}}/bin/python {VPIPE_BASEDIR}/scripts/report_sequences.py {input.consensus} """ - - rule dehuman: input: global_ref=reference_file, @@ -50,6 +48,8 @@ rule dehuman: config.dehuman["conda"] shell: """ + # create index if not exists: + test -f {params.HUMAN_GENOME}.bwt || {params.BWA} index {params.HUMAN_GENOME} echo Filter out SARS-CoV-2 reads ------------------------------------- echo @@ -70,6 +70,7 @@ rule dehuman: echo echo Checking rejects against Homo Sapiens --------------------------- + {params.BWA} mem -t {threads} \ -o {params.h38_aln}\ {params.HUMAN_GENOME} {params.reject_1} {params.reject_2} From b51c1872da7939ec95d44310268dd0da6de22496 Mon Sep 17 00:00:00 2001 From: Uwe Schmitt Date: Tue, 14 Dec 2021 13:47:02 +0100 Subject: [PATCH 06/33] minor cleanup --- workflow/rules/common.smk | 1 +
workflow/rules/publish.smk | 24 ++++++++++++++++-------- workflow/scripts/report_sequences.py | 9 +++++++-- 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 07631fb5b..1bc1a9c83 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -382,6 +382,7 @@ for p in patient_list: consensus.append(os.path.join(sdir, "references/ref_ambig.fasta")) consensus.append(os.path.join(sdir, "references/ref_ambig_dels.fasta")) consensus.append(os.path.join(sdir, "references/ref_majority.fasta")) + consensus.append(os.path.join(sdir, "references/ref_majority_dels.fasta")) consensus.append(os.path.join(sdir, "references/consensus.bcftools.fasta")) diff --git a/workflow/rules/publish.smk b/workflow/rules/publish.smk index db21330df..965562657 100644 --- a/workflow/rules/publish.smk +++ b/workflow/rules/publish.smk @@ -1,9 +1,8 @@ rule write_summary_json: input: - consensus = lambda wildcard: [f for f in all_files if f.endswith("ref_ambig_dels.fasta")], + consensus = lambda wildcard: [f for f in all_files if f.endswith("ref_majority_dels.fasta")], dehumanized = lambda wildcard: [f for f in all_files if f.endswith("dehuman.cram")], - #consensus=expand("{dataset}/references/ref_ambig_dels.fasta", dataset=datasets), - #raw_reads=expand("{dataset}/raw_data/dehuman.cram", dataset=datasets), + all_files = all_files output: "summary.zip", conda: @@ -11,7 +10,6 @@ rule write_summary_json: shell: # needed {CONDA_PREFIX} on my dev setup on Mac + pyenv: """ - set -x ${{CONDA_PREFIX}}/bin/python {VPIPE_BASEDIR}/scripts/report_sequences.py {input.consensus} """ @@ -29,7 +27,7 @@ rule dehuman: BWA=config.applications["bwa"], SAMTOOLS=config.applications["samtools"], HUMAN_GENOME=config.dehuman["ref_human"], - remove_reads=cachepath( + remove_reads_script=cachepath( "../scripts/remove_reads_list.pl", executable=True, localsource=True ), ref_aln="{dataset}/ref_aln.sam", @@ -60,6 +58,7 @@ rule dehuman: echo echo Keep reject ----------------------------------------------------- + echo {params.SAMTOOLS} bam2fq -@ {threads} \ -F 2 \ @@ -77,12 +76,14 @@ rule dehuman: echo echo Count aligned reads --------------------------------------------- + echo count=$({params.SAMTOOLS} view -@ {threads} -c -f {params.F} {params.h38_aln}) if (( count > 0 )); then + echo echo ----------------------------------------------------------------- - echo "Needs special care: $count potential human reads found" + echo "Needs special care: ${{count}} potential human reads found" echo ----------------------------------------------------------------- echo echo Removing identified human reads from raw reads ------------------ @@ -94,21 +95,26 @@ rule dehuman: {params.h38_aln} \ | cut -f 1 > {params.filter} + # using zcat FILENAME.gz causes issues on Mac, see + # https://serverfault.com/questions/570024/ + # redirection fixes this: zcat < {wildcards.dataset}/raw_data/*_R1.fastq.gz \ - | {params.remove_reads} {params.filter} \ + | {params.remove_reads_script} {params.filter} \ | gzip \ > {params.filtered_1} & zcat < {wildcards.dataset}/raw_data/*_R2.fastq.gz \ - | {params.remove_reads} {params.filter} \ + | {params.remove_reads_script} {params.filter} \ | gzip \ > {params.filtered_2} & wait # keep the rejects for further analysis + echo echo Keeping Human-aligned SARS-CoV-2 rejects ------------------------- + echo # (we compress reference-less, because the reference size is larger # than the contaminant reads) @@ -121,6 +127,7 @@ rule dehuman: echo echo Compressing 
human-depleted raw reads ----------------------------- + echo {params.SAMTOOLS} index -@ {threads} {params.h38_aln_cram} {params.SAMTOOLS} stats -@ {threads} {params.h38_aln_cram} > {params.stats} @@ -137,6 +144,7 @@ rule dehuman: echo echo Compress filtered sequences -------------------------------------- + echo {params.BWA} mem -t {threads} \ -C \ diff --git a/workflow/scripts/report_sequences.py b/workflow/scripts/report_sequences.py index f4675208d..f8d9f975e 100644 --- a/workflow/scripts/report_sequences.py +++ b/workflow/scripts/report_sequences.py @@ -39,11 +39,16 @@ seguid=seguid(sequence), crc64=crc64(sequence), sequence=str(sequence), - raw_data=raw_data, ) ) results.append( - dict(sample=sample, batch=batch, created=str(timestamp), sequences=sequences) + dict( + sample=sample, + batch=batch, + created=str(timestamp), + sequences=sequences, + raw_data=raw_data, + ) ) From 8184d2f4ccd8c52f2bf6fb18331e4bfc2e5dcb11 Mon Sep 17 00:00:00 2001 From: Uwe Schmitt Date: Tue, 26 Oct 2021 18:09:45 +0200 Subject: [PATCH 07/33] ran smakefmt --- workflow/Snakefile | 1 - 1 file changed, 1 deletion(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 3977af50c..52bc8e2f0 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -21,7 +21,6 @@ rule all: all_files, "summary.zip", - rule alltrimmed: input: trimmed_files, From ccdf01eb5365d1090a33ce879f889c7bd5fc14c1 Mon Sep 17 00:00:00 2001 From: Uwe Schmitt Date: Mon, 17 Jan 2022 21:21:56 +0100 Subject: [PATCH 08/33] dehuman: use incoming raw data files + tempdirs for intermediate files --- workflow/rules/publish.smk | 89 ++++++++++++++++++++++---------------- 1 file changed, 52 insertions(+), 37 deletions(-) diff --git a/workflow/rules/publish.smk b/workflow/rules/publish.smk index 965562657..2a02dac0d 100644 --- a/workflow/rules/publish.smk +++ b/workflow/rules/publish.smk @@ -1,3 +1,7 @@ +from functools import partial +from glob import glob + + rule write_summary_json: input: consensus = lambda wildcard: [f for f in all_files if f.endswith("ref_majority_dels.fasta")], @@ -13,15 +17,34 @@ rule write_summary_json: ${{CONDA_PREFIX}}/bin/python {VPIPE_BASEDIR}/scripts/report_sequences.py {input.consensus} """ + +def raw_data_file(wildcards, pair): + for p in os.listdir('{dataset}/raw_data'.format(dataset=wildcards.dataset)): + if re.search(r".*R{pair}\.(fastq\.gz|fastq|fq|fq\.gz)$".format(pair=pair), p): + return os.path.join(wildcards.dataset, "raw_data", p) + + +def temp_with_prefix(p): + return os.path.join(config.general["temp_prefix"], p) + + + rule dehuman: input: global_ref=reference_file, - # R1=expand("{dataset}/preprocessed_data/R1.fastq.gz", dataset=datasets), - # R2=expand("{dataset}/preprocessed_data/R2.fastq.gz", dataset=datasets), - R1="{dataset}/preprocessed_data/R1.fastq.gz", - R2="{dataset}/preprocessed_data/R2.fastq.gz", + R1=partial(raw_data_file, pair=1), + R2=partial(raw_data_file, pair=2), output: final_cram="{dataset}/raw_data/dehuman.cram", + ref_aln=temp_with_prefix("{dataset}/ref_aln.sam"), + filter=temp_with_prefix("{dataset}/dehuman.filter"), + filtered_1=temp_with_prefix("{dataset}/raw_data/filtered_1.fasta.gz"), + filtered_2=temp_with_prefix("{dataset}/raw_data/filtered_2.fasta.gz"), + reject_1=temp_with_prefix("{dataset}/reject_R1.fastq.gz"), + reject_2=temp_with_prefix("{dataset}/reject_R2.fastq.gz"), + h38_aln=temp_with_prefix("{dataset}/h38_aln.sam"), + h38_aln_cram=temp_with_prefix("{dataset}/h38_aln.cram"), + cram_sam=temp_with_prefix("{dataset}/raw_data/cram.sam"), threads: config.dehuman["threads"] 
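    # Note (illustration, not part of the original patch): R1/R2 above are input
    # functions built with functools.partial, so Snakemake calls e.g.
    #   partial(raw_data_file, pair=1)(wildcards)  ==  raw_data_file(wildcards, pair=1)
    # which resolves to the first file in {dataset}/raw_data whose name ends in
    # R1.fastq/fq(.gz) (see the raw_data_file helper defined above).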
params: BWA=config.applications["bwa"], @@ -30,18 +53,9 @@ rule dehuman: remove_reads_script=cachepath( "../scripts/remove_reads_list.pl", executable=True, localsource=True ), - ref_aln="{dataset}/ref_aln.sam", - filter="{dataset}/dehuman.filter", - h38_aln="{dataset}/h38_aln.sam", - h38_aln_cram="{dataset}/h38_aln.cram", stats="{dataset}/raw_data/dehuman.stats", # set to 1 to trigger matches with human genome (used for testing): - F=2, - reject_1="{dataset}/reject_R1.fastq.gz", - reject_2="{dataset}/reject_R2.fastq.gz", - filtered_1="{dataset}/raw_data/filtered_1.fasta.gz", - filtered_2="{dataset}/raw_data/filtered_2.fasta.gz", - cram_sam="{dataset}/raw_data/cram.sam", + F=2 conda: config.dehuman["conda"] shell: @@ -53,7 +67,7 @@ rule dehuman: echo {params.BWA} mem -t {threads} \ - -o {params.ref_aln} \ + -o {output.ref_aln} \ {input.global_ref} {input.R1} {input.R2} echo @@ -62,23 +76,22 @@ rule dehuman: {params.SAMTOOLS} bam2fq -@ {threads} \ -F 2 \ - -1 {params.reject_1} \ - -2 {params.reject_2} \ - {params.ref_aln} + -1 {output.reject_1} \ + -2 {output.reject_2} \ + {output.ref_aln} echo echo Checking rejects against Homo Sapiens --------------------------- - {params.BWA} mem -t {threads} \ - -o {params.h38_aln}\ - {params.HUMAN_GENOME} {params.reject_1} {params.reject_2} + -o {output.h38_aln}\ + {params.HUMAN_GENOME} {output.reject_1} {output.reject_2} echo echo Count aligned reads --------------------------------------------- echo - count=$({params.SAMTOOLS} view -@ {threads} -c -f {params.F} {params.h38_aln}) + count=$({params.SAMTOOLS} view -@ {threads} -c -f {params.F} {output.h38_aln}) if (( count > 0 )); then echo @@ -92,21 +105,21 @@ rule dehuman: # get list {params.SAMTOOLS} view -@ {threads} \ -f {params.F} \ - {params.h38_aln} \ - | cut -f 1 > {params.filter} + {output.h38_aln} \ + | cut -f 1 > {output.filter} # using zcat FILENAME.gz causes issues on Mac, see # https://serverfault.com/questions/570024/ # redirection fixes this: zcat < {wildcards.dataset}/raw_data/*_R1.fastq.gz \ - | {params.remove_reads_script} {params.filter} \ + | {params.remove_reads_script} {output.filter} \ | gzip \ - > {params.filtered_1} & + > {output.filtered_1} & zcat < {wildcards.dataset}/raw_data/*_R2.fastq.gz \ - | {params.remove_reads_script} {params.filter} \ + | {params.remove_reads_script} {output.filter} \ | gzip \ - > {params.filtered_2} & + > {output.filtered_2} & wait @@ -122,24 +135,26 @@ rule dehuman: {params.SAMTOOLS} sort -@ {threads} \ -M \ --output-fmt ${{FMT}} \ - -o {params.h38_aln_cram} \ - {params.h38_aln} + -o {output.h38_aln_cram} \ + {output.h38_aln} echo echo Compressing human-depleted raw reads ----------------------------- echo - {params.SAMTOOLS} index -@ {threads} {params.h38_aln_cram} - {params.SAMTOOLS} stats -@ {threads} {params.h38_aln_cram} > {params.stats} + {params.SAMTOOLS} index -@ {threads} {output.h38_aln_cram} + {params.SAMTOOLS} stats -@ {threads} {output.h38_aln_cram} > {params.stats} else echo echo No potential human reads found ----------------------------------- echo Copy raw reads file echo - cp {wildcards.dataset}/raw_data/*_R1.fastq.gz {params.filtered_1} & - cp {wildcards.dataset}/raw_data/*_R2.fastq.gz {params.filtered_2} & + cp {wildcards.dataset}/raw_data/*_R1.fastq.gz {output.filtered_1} & + cp {wildcards.dataset}/raw_data/*_R2.fastq.gz {output.filtered_2} & wait + touch {output.filter} + touch {output.h38_aln_cram} fi echo @@ -148,13 +163,13 @@ rule dehuman: {params.BWA} mem -t {threads} \ -C \ - -o {params.cram_sam} \ - {input.global_ref} 
{params.filtered_1} {params.filtered_2} + -o {output.cram_sam} \ + {input.global_ref} {output.filtered_1} {output.filtered_2} REGEXP=\'s{{(?<=\\t)([[:digit:]]:[[:upper:]]:[[:digit:]]:[ATCGN]+(\+[ATCGN]+))$}}{{BC:Z:\\1}}\' FMT=cram,embed_ref,use_bzip2,use_lzma,level=9,seqs_per_slice=1000000 - perl -p -e ${{REGEXP}} {params.cram_sam} \ + perl -p -e ${{REGEXP}} {output.cram_sam} \ | {params.SAMTOOLS} sort -@ {threads} \ -M \ --reference {input.global_ref} \ From ea9fb1b6f55b0558bf92acb2bc5da14085d8bdc2 Mon Sep 17 00:00:00 2001 From: Uwe Schmitt Date: Mon, 17 Jan 2022 21:25:57 +0100 Subject: [PATCH 09/33] moved dehuman task to separate file --- workflow/Snakefile | 1 + workflow/rules/dehuman.smk | 165 ++++++++++++++++++++++++++++++++++++ workflow/rules/publish.smk | 166 ------------------------------------- 3 files changed, 166 insertions(+), 166 deletions(-) create mode 100644 workflow/rules/dehuman.smk diff --git a/workflow/Snakefile b/workflow/Snakefile index 52bc8e2f0..402016f0d 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -38,6 +38,7 @@ include: "rules/consensus.smk" include: "rules/mafs.smk" include: "rules/stats.smk" include: "rules/publish.smk" +include: "rules/dehuman.smk" if config.output["snv"] or config.output["local"]: diff --git a/workflow/rules/dehuman.smk b/workflow/rules/dehuman.smk new file mode 100644 index 000000000..34281dd1d --- /dev/null +++ b/workflow/rules/dehuman.smk @@ -0,0 +1,165 @@ +from functools import partial +from glob import glob + + +def raw_data_file(wildcards, pair): + for p in os.listdir('{dataset}/raw_data'.format(dataset=wildcards.dataset)): + if re.search(r".*R{pair}\.(fastq\.gz|fastq|fq|fq\.gz)$".format(pair=pair), p): + return os.path.join(wildcards.dataset, "raw_data", p) + + +def temp_with_prefix(p): + return os.path.join(config.general["temp_prefix"], p) + + + +rule dehuman: + input: + global_ref=reference_file, + R1=partial(raw_data_file, pair=1), + R2=partial(raw_data_file, pair=2), + output: + final_cram="{dataset}/raw_data/dehuman.cram", + ref_aln=temp_with_prefix("{dataset}/ref_aln.sam"), + filter=temp_with_prefix("{dataset}/dehuman.filter"), + filtered_1=temp_with_prefix("{dataset}/raw_data/filtered_1.fasta.gz"), + filtered_2=temp_with_prefix("{dataset}/raw_data/filtered_2.fasta.gz"), + reject_1=temp_with_prefix("{dataset}/reject_R1.fastq.gz"), + reject_2=temp_with_prefix("{dataset}/reject_R2.fastq.gz"), + h38_aln=temp_with_prefix("{dataset}/h38_aln.sam"), + h38_aln_cram=temp_with_prefix("{dataset}/h38_aln.cram"), + cram_sam=temp_with_prefix("{dataset}/raw_data/cram.sam"), + threads: config.dehuman["threads"] + params: + BWA=config.applications["bwa"], + SAMTOOLS=config.applications["samtools"], + HUMAN_GENOME=config.dehuman["ref_human"], + remove_reads_script=cachepath( + "../scripts/remove_reads_list.pl", executable=True, localsource=True + ), + stats="{dataset}/raw_data/dehuman.stats", + # set to 1 to trigger matches with human genome (used for testing): + F=2 + conda: + config.dehuman["conda"] + shell: + """ + # create index if not exists: + test -f {params.HUMAN_GENOME}.bwt || {params.BWA} index {params.HUMAN_GENOME} + + echo Filter out SARS-CoV-2 reads ------------------------------------- + echo + + {params.BWA} mem -t {threads} \ + -o {output.ref_aln} \ + {input.global_ref} {input.R1} {input.R2} + + echo + echo Keep reject ----------------------------------------------------- + echo + + {params.SAMTOOLS} bam2fq -@ {threads} \ + -F 2 \ + -1 {output.reject_1} \ + -2 {output.reject_2} \ + {output.ref_aln} + + echo + echo 
Checking rejects against Homo Sapiens --------------------------- + + {params.BWA} mem -t {threads} \ + -o {output.h38_aln}\ + {params.HUMAN_GENOME} {output.reject_1} {output.reject_2} + + echo + echo Count aligned reads --------------------------------------------- + echo + + count=$({params.SAMTOOLS} view -@ {threads} -c -f {params.F} {output.h38_aln}) + + if (( count > 0 )); then + echo + echo ----------------------------------------------------------------- + echo "Needs special care: ${{count}} potential human reads found" + echo ----------------------------------------------------------------- + echo + echo Removing identified human reads from raw reads ------------------ + echo + + # get list + {params.SAMTOOLS} view -@ {threads} \ + -f {params.F} \ + {output.h38_aln} \ + | cut -f 1 > {output.filter} + + # using zcat FILENAME.gz causes issues on Mac, see + # https://serverfault.com/questions/570024/ + # redirection fixes this: + zcat < {wildcards.dataset}/raw_data/*_R1.fastq.gz \ + | {params.remove_reads_script} {output.filter} \ + | gzip \ + > {output.filtered_1} & + + zcat < {wildcards.dataset}/raw_data/*_R2.fastq.gz \ + | {params.remove_reads_script} {output.filter} \ + | gzip \ + > {output.filtered_2} & + + wait + + # keep the rejects for further analysis + + echo + echo Keeping Human-aligned SARS-CoV-2 rejects ------------------------- + echo + + # (we compress reference-less, because the reference size is larger + # than the contaminant reads) + FMT=cram,no_ref,use_bzip2,use_lzma,level=9,seqs_per_slice=1000000 + {params.SAMTOOLS} sort -@ {threads} \ + -M \ + --output-fmt ${{FMT}} \ + -o {output.h38_aln_cram} \ + {output.h38_aln} + + echo + echo Compressing human-depleted raw reads ----------------------------- + echo + + {params.SAMTOOLS} index -@ {threads} {output.h38_aln_cram} + {params.SAMTOOLS} stats -@ {threads} {output.h38_aln_cram} > {params.stats} + + else + echo + echo No potential human reads found ----------------------------------- + echo Copy raw reads file + echo + cp {wildcards.dataset}/raw_data/*_R1.fastq.gz {output.filtered_1} & + cp {wildcards.dataset}/raw_data/*_R2.fastq.gz {output.filtered_2} & + wait + touch {output.filter} + touch {output.h38_aln_cram} + fi + + echo + echo Compress filtered sequences -------------------------------------- + echo + + {params.BWA} mem -t {threads} \ + -C \ + -o {output.cram_sam} \ + {input.global_ref} {output.filtered_1} {output.filtered_2} + + REGEXP=\'s{{(?<=\\t)([[:digit:]]:[[:upper:]]:[[:digit:]]:[ATCGN]+(\+[ATCGN]+))$}}{{BC:Z:\\1}}\' + FMT=cram,embed_ref,use_bzip2,use_lzma,level=9,seqs_per_slice=1000000 + + perl -p -e ${{REGEXP}} {output.cram_sam} \ + | {params.SAMTOOLS} sort -@ {threads} \ + -M \ + --reference {input.global_ref} \ + --output-fmt ${{FMT}} \ + -o {output.final_cram} + echo + echo DONE ------------------------------------------------------------- + echo + """ diff --git a/workflow/rules/publish.smk b/workflow/rules/publish.smk index 2a02dac0d..ba8129c17 100644 --- a/workflow/rules/publish.smk +++ b/workflow/rules/publish.smk @@ -1,6 +1,3 @@ -from functools import partial -from glob import glob - rule write_summary_json: input: @@ -16,166 +13,3 @@ rule write_summary_json: """ ${{CONDA_PREFIX}}/bin/python {VPIPE_BASEDIR}/scripts/report_sequences.py {input.consensus} """ - - -def raw_data_file(wildcards, pair): - for p in os.listdir('{dataset}/raw_data'.format(dataset=wildcards.dataset)): - if re.search(r".*R{pair}\.(fastq\.gz|fastq|fq|fq\.gz)$".format(pair=pair), p): - return 
os.path.join(wildcards.dataset, "raw_data", p) - - -def temp_with_prefix(p): - return os.path.join(config.general["temp_prefix"], p) - - - -rule dehuman: - input: - global_ref=reference_file, - R1=partial(raw_data_file, pair=1), - R2=partial(raw_data_file, pair=2), - output: - final_cram="{dataset}/raw_data/dehuman.cram", - ref_aln=temp_with_prefix("{dataset}/ref_aln.sam"), - filter=temp_with_prefix("{dataset}/dehuman.filter"), - filtered_1=temp_with_prefix("{dataset}/raw_data/filtered_1.fasta.gz"), - filtered_2=temp_with_prefix("{dataset}/raw_data/filtered_2.fasta.gz"), - reject_1=temp_with_prefix("{dataset}/reject_R1.fastq.gz"), - reject_2=temp_with_prefix("{dataset}/reject_R2.fastq.gz"), - h38_aln=temp_with_prefix("{dataset}/h38_aln.sam"), - h38_aln_cram=temp_with_prefix("{dataset}/h38_aln.cram"), - cram_sam=temp_with_prefix("{dataset}/raw_data/cram.sam"), - threads: config.dehuman["threads"] - params: - BWA=config.applications["bwa"], - SAMTOOLS=config.applications["samtools"], - HUMAN_GENOME=config.dehuman["ref_human"], - remove_reads_script=cachepath( - "../scripts/remove_reads_list.pl", executable=True, localsource=True - ), - stats="{dataset}/raw_data/dehuman.stats", - # set to 1 to trigger matches with human genome (used for testing): - F=2 - conda: - config.dehuman["conda"] - shell: - """ - # create index if not exists: - test -f {params.HUMAN_GENOME}.bwt || {params.BWA} index {params.HUMAN_GENOME} - - echo Filter out SARS-CoV-2 reads ------------------------------------- - echo - - {params.BWA} mem -t {threads} \ - -o {output.ref_aln} \ - {input.global_ref} {input.R1} {input.R2} - - echo - echo Keep reject ----------------------------------------------------- - echo - - {params.SAMTOOLS} bam2fq -@ {threads} \ - -F 2 \ - -1 {output.reject_1} \ - -2 {output.reject_2} \ - {output.ref_aln} - - echo - echo Checking rejects against Homo Sapiens --------------------------- - - {params.BWA} mem -t {threads} \ - -o {output.h38_aln}\ - {params.HUMAN_GENOME} {output.reject_1} {output.reject_2} - - echo - echo Count aligned reads --------------------------------------------- - echo - - count=$({params.SAMTOOLS} view -@ {threads} -c -f {params.F} {output.h38_aln}) - - if (( count > 0 )); then - echo - echo ----------------------------------------------------------------- - echo "Needs special care: ${{count}} potential human reads found" - echo ----------------------------------------------------------------- - echo - echo Removing identified human reads from raw reads ------------------ - echo - - # get list - {params.SAMTOOLS} view -@ {threads} \ - -f {params.F} \ - {output.h38_aln} \ - | cut -f 1 > {output.filter} - - # using zcat FILENAME.gz causes issues on Mac, see - # https://serverfault.com/questions/570024/ - # redirection fixes this: - zcat < {wildcards.dataset}/raw_data/*_R1.fastq.gz \ - | {params.remove_reads_script} {output.filter} \ - | gzip \ - > {output.filtered_1} & - - zcat < {wildcards.dataset}/raw_data/*_R2.fastq.gz \ - | {params.remove_reads_script} {output.filter} \ - | gzip \ - > {output.filtered_2} & - - wait - - # keep the rejects for further analysis - - echo - echo Keeping Human-aligned SARS-CoV-2 rejects ------------------------- - echo - - # (we compress reference-less, because the reference size is larger - # than the contaminant reads) - FMT=cram,no_ref,use_bzip2,use_lzma,level=9,seqs_per_slice=1000000 - {params.SAMTOOLS} sort -@ {threads} \ - -M \ - --output-fmt ${{FMT}} \ - -o {output.h38_aln_cram} \ - {output.h38_aln} - - echo - echo Compressing 
human-depleted raw reads ----------------------------- - echo - - {params.SAMTOOLS} index -@ {threads} {output.h38_aln_cram} - {params.SAMTOOLS} stats -@ {threads} {output.h38_aln_cram} > {params.stats} - - else - echo - echo No potential human reads found ----------------------------------- - echo Copy raw reads file - echo - cp {wildcards.dataset}/raw_data/*_R1.fastq.gz {output.filtered_1} & - cp {wildcards.dataset}/raw_data/*_R2.fastq.gz {output.filtered_2} & - wait - touch {output.filter} - touch {output.h38_aln_cram} - fi - - echo - echo Compress filtered sequences -------------------------------------- - echo - - {params.BWA} mem -t {threads} \ - -C \ - -o {output.cram_sam} \ - {input.global_ref} {output.filtered_1} {output.filtered_2} - - REGEXP=\'s{{(?<=\\t)([[:digit:]]:[[:upper:]]:[[:digit:]]:[ATCGN]+(\+[ATCGN]+))$}}{{BC:Z:\\1}}\' - FMT=cram,embed_ref,use_bzip2,use_lzma,level=9,seqs_per_slice=1000000 - - perl -p -e ${{REGEXP}} {output.cram_sam} \ - | {params.SAMTOOLS} sort -@ {threads} \ - -M \ - --reference {input.global_ref} \ - --output-fmt ${{FMT}} \ - -o {output.final_cram} - echo - echo DONE ------------------------------------------------------------- - echo - """ From 8c02c1a410bdfdcfd3426566599d51e6bf173444 Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Fri, 21 Jan 2022 15:49:55 +0100 Subject: [PATCH 10/33] Code reuse: Temp prefix function in common --- workflow/rules/align.smk | 61 +++++----------------------- workflow/rules/common.smk | 11 +++++ workflow/rules/dehuman.smk | 14 ++----- workflow/rules/quality_assurance.smk | 2 + 4 files changed, 26 insertions(+), 62 deletions(-) diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk index 3242ff88c..cc3ffe98d 100644 --- a/workflow/rules/align.smk +++ b/workflow/rules/align.smk @@ -267,12 +267,7 @@ if config.general["aligner"] == "ngshmmalign": input: "{dataset}/preprocessed_data/{file}.fastq.gz", output: - temp( - os.path.join( - config.general["temp_prefix"], - "{dataset}/preprocessed_data/{file}.fastq", - ) - ), + temp_with_prefix("{dataset}/preprocessed_data/{file}.fastq"), params: GUNZIP=config.applications["gunzip"], log: @@ -295,16 +290,8 @@ if config.general["aligner"] == "ngshmmalign": initial_ref="{dataset}/references/initial_consensus.fasta", FASTQ=input_align, output: - good_aln=temp( - os.path.join( - config.general["temp_prefix"], "{dataset}/alignments/full_aln.sam" - ) - ), - reject_aln=temp( - os.path.join( - config.general["temp_prefix"], "{dataset}/alignments/rejects.sam" - ) - ), + good_aln=temp_with_prefix("{dataset}/alignments/full_aln.sam"), + reject_aln=temp_with_prefix("{dataset}/alignments/rejects.sam"), REF_ambig="{dataset}/references/ref_ambig.fasta", REF_majority="{dataset}/references/ref_majority.fasta", params: @@ -426,7 +413,7 @@ if config.general["aligner"] == "ngshmmalign": rule sam2bam: input: - os.path.join(config.general["temp_prefix"], "{file}.sam"), + temp_prefix("{file}.sam"), output: # TODO support cram here BAM="{file}.bam", @@ -502,16 +489,8 @@ if config.general["aligner"] == "bwa": REF=reference_file, INDEX="{}.bwt".format(reference_file), output: - REF=temp( - os.path.join( - config.general["temp_prefix"], "{dataset}/alignments/REF_aln.sam" - ) - ), - TMP_SAM=temp( - os.path.join( - config.general["temp_prefix"], "{dataset}/alignments/tmp_aln.sam" - ) - ), + REF=temp_with_prefix("{dataset}/alignments/REF_aln.sam"), + TMP_SAM=temp_with_prefix("{dataset}/alignments/tmp_aln.sam"), params: EXTRA=config.bwa_align["extra"], FILTER="-f 2" if config.input["paired"] 
else "-F 4", @@ -588,18 +567,8 @@ elif config.general["aligner"] == "bowtie": INDEX5="{}.rev.1.bt2".format(reference_file), INDEX6="{}.rev.2.bt2".format(reference_file), output: - REF=temp( - os.path.join( - config.general["temp_prefix"], - "{dataset}/alignments/REF_aln.sam", - ) - ), - TMP_SAM=temp( - os.path.join( - config.general["temp_prefix"], - "{dataset}/alignments/tmp_aln.sam", - ) - ), + REF=temp_with_prefix("{dataset}/alignments/REF_aln.sam"), + TMP_SAM=temp_with_prefix("{dataset}/alignments/tmp_aln.sam"), params: PHRED=config.bowtie_align["phred"], PRESET=config.bowtie_align["preset"], @@ -643,18 +612,8 @@ elif config.general["aligner"] == "bowtie": INDEX5="{}.rev.1.bt2".format(reference_file), INDEX6="{}.rev.2.bt2".format(reference_file), output: - REF=temp( - os.path.join( - config.general["temp_prefix"], - "{dataset}/alignments/REF_aln.sam", - ) - ), - TMP_SAM=temp( - os.path.join( - config.general["temp_prefix"], - "{dataset}/alignments/tmp_aln.sam", - ) - ), + REF=temp_with_prefix("{dataset}/alignments/REF_aln.sam"), + TMP_SAM=temp_with_prefix("{dataset}/alignments/tmp_aln.sam"), params: PHRED=config.bowtie_align["phred"], PRESET=config.bowtie_align["preset"], diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 1bc1a9c83..7f696951c 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -533,7 +533,12 @@ def get_maxins(wildcards): def rebase_datadir(base, dataset): return os.path.join(base, *os.path.normpath(dataset).split(os.path.sep)[-2:]) +def raw_data_file(wildcards, pair): + for p in os.listdir('{dataset}/raw_data'.format(dataset=wildcards.dataset)): + if re.search(r".*R{pair}\.(fastq\.gz|fastq|fq|fq\.gz)$".format(pair=pair), p): + return os.path.join(wildcards.dataset, "raw_data", p) +# TODO replace with raw_data_file def construct_input_fastq(wildcards): indir = os.path.join( @@ -581,3 +586,9 @@ def construct_input_fastq(wildcards): ) return list_output + +def temp_prefix(p): + return temp(os.path.join(config.general["temp_prefix"], p)) + +def temp_with_prefix(p): + return temp(temp_prefix(p)) diff --git a/workflow/rules/dehuman.smk b/workflow/rules/dehuman.smk index 34281dd1d..07cee9bb9 100644 --- a/workflow/rules/dehuman.smk +++ b/workflow/rules/dehuman.smk @@ -2,16 +2,8 @@ from functools import partial from glob import glob -def raw_data_file(wildcards, pair): - for p in os.listdir('{dataset}/raw_data'.format(dataset=wildcards.dataset)): - if re.search(r".*R{pair}\.(fastq\.gz|fastq|fq|fq\.gz)$".format(pair=pair), p): - return os.path.join(wildcards.dataset, "raw_data", p) - - -def temp_with_prefix(p): - return os.path.join(config.general["temp_prefix"], p) - - +# TODO split into multiple sub-rules +# TODO move cram compression into align.smk rule dehuman: input: @@ -137,7 +129,7 @@ rule dehuman: cp {wildcards.dataset}/raw_data/*_R1.fastq.gz {output.filtered_1} & cp {wildcards.dataset}/raw_data/*_R2.fastq.gz {output.filtered_2} & wait - touch {output.filter} + touch {output.filter} touch {output.h38_aln_cram} fi diff --git a/workflow/rules/quality_assurance.smk b/workflow/rules/quality_assurance.smk index 125419eca..3c28bead8 100644 --- a/workflow/rules/quality_assurance.smk +++ b/workflow/rules/quality_assurance.smk @@ -32,6 +32,7 @@ rule gunzip: rule extract: input: construct_input_fastq, + # TODO replace with raw_data_file output: temp( os.path.join( @@ -54,6 +55,7 @@ rule extract: """ cat {input} | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > {output} 2> >(tee {log.errfile} >&2) """ + # TODO replace with better 
dedicated software # 2. clipping From 511e97f220d59e9493f7898790f65e63b8481b98 Mon Sep 17 00:00:00 2001 From: Uwe Schmitt Date: Fri, 28 Jan 2022 14:53:38 +0100 Subject: [PATCH 11/33] new strategy: prepare upload folder --- workflow/Snakefile | 3 +-- workflow/rules/common.smk | 8 +++---- workflow/rules/dehuman.smk | 1 - workflow/rules/publish.smk | 44 ++++++++++++++++++++++++++++++-------- 4 files changed, 39 insertions(+), 17 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 402016f0d..b54637e2c 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -18,8 +18,7 @@ functions = srcdir("scripts/functions.sh") # DUMMY RULES rule all: input: - all_files, - "summary.zip", + all_files rule alltrimmed: input: diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 7f696951c..e43243ff4 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -365,7 +365,7 @@ visualizations = [] diversity_measures = [] datasets = [] IDs = [] -dehumanized_raw_reads = [] +upload_markers = [] for p in patient_list: # WARNING the following makes sure to gracefully handle trailing slashes in the user-provided paths in datadir @@ -439,12 +439,10 @@ for p in patient_list: visualizations.append(os.path.join(sdir, "visualization/snv_calling.html")) visualizations.append(os.path.join(sdir, "visualization/alignment.html")) - if config.output["dehumanized_raw_reads"]: - dehumanized_raw_reads.append(os.path.join(sdir, "raw_data", "dehuman.cram")) + upload_markers.append(os.path.join(sdir, "upload_prepared.touch")) # merge lists containing expected output - all_files = (alignments + consensus + results + visualizations - + dehumanized_raw_reads) + all_files = (alignments + consensus + results + visualizations + upload_markers) # diversity measures if not config.output["snv"] and config.output["diversity"]: diff --git a/workflow/rules/dehuman.smk b/workflow/rules/dehuman.smk index 07cee9bb9..e151839d9 100644 --- a/workflow/rules/dehuman.smk +++ b/workflow/rules/dehuman.smk @@ -1,5 +1,4 @@ from functools import partial -from glob import glob # TODO split into multiple sub-rules diff --git a/workflow/rules/publish.smk b/workflow/rules/publish.smk index ba8129c17..14d45cd63 100644 --- a/workflow/rules/publish.smk +++ b/workflow/rules/publish.smk @@ -1,15 +1,41 @@ +localrules: + prepare_upload -rule write_summary_json: + +rule prepare_upload: input: - consensus = lambda wildcard: [f for f in all_files if f.endswith("ref_majority_dels.fasta")], - dehumanized = lambda wildcard: [f for f in all_files if f.endswith("dehuman.cram")], - all_files = all_files + R1=partial(raw_data_file, pair=1), + R2=partial(raw_data_file, pair=2), + consensus_aligned="{dataset}/references/ref_majority_dels.fasta", + consensus_unaligned="{dataset}/references/consensus_ambig.bcftools.fasta", output: - "summary.zip", - conda: - config.report_sequences["conda"] + upload_prepared_touch="{dataset}/upload_prepared.touch" + threads: 1 + shell: - # needed {CONDA_PREFIX} on my dev setup on Mac + pyenv: """ - ${{CONDA_PREFIX}}/bin/python {VPIPE_BASEDIR}/scripts/report_sequences.py {input.consensus} + to_upload=({input}) + + # depending on config dehuman might not exist, this is why we + # did not add it to the input fields: + to_upload+=({wildcards.dataset}/raw_data/dehuman.cram) + + for p in ${{to_upload[@]}}; do + test -e $p || continue + fixed_p=$(realpath --relative-to {wildcards.dataset}/uploads $p) + ( set -x; ln -f -s $fixed_p {wildcards.dataset}/uploads ) + done + + mkdir -p uploads + + 
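        # The block below publishes this sample's uploads/ folder under a
        # collision-free name, <sample_id>__<sha1 of random bytes>, as a
        # relative symlink pointing back to {dataset}/uploads.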
sample_id=$(basename $(dirname {wildcards.dataset})) + fixed_uploads=$(realpath --relative-to uploads {wildcards.dataset}/uploads) + + # make unique symbolic link: + random=$(dd if=/dev/urandom bs=30 count=1 2>/dev/null | sha1sum -b | cut -d" " -f 1) + unique_id=${{sample_id}}__${{random}} + + ( set -x; ln -s $fixed_uploads uploads/$unique_id ) + + touch {output.upload_prepared_touch} """ From 664dc860c003973292d54dba911180aea521cfae Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Thu, 17 Feb 2022 19:42:35 +0100 Subject: [PATCH 12/33] Splitting dehuman into sub-rules --- workflow/envs/dehuman.yaml | 2 + workflow/rules/align.smk | 52 +++--- workflow/rules/common.smk | 12 +- workflow/rules/dehuman.smk | 259 +++++++++++++++++++--------- workflow/schemas/config_schema.json | 27 ++- 5 files changed, 242 insertions(+), 110 deletions(-) diff --git a/workflow/envs/dehuman.yaml b/workflow/envs/dehuman.yaml index 67e1e87a0..5d9ef3120 100644 --- a/workflow/envs/dehuman.yaml +++ b/workflow/envs/dehuman.yaml @@ -5,3 +5,5 @@ channels: dependencies: - bwa=0.7.17 - samtools>1.11 + - xxhash + - coreutils # [not linux] diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk index cc3ffe98d..61414618a 100644 --- a/workflow/rules/align.smk +++ b/workflow/rules/align.smk @@ -443,6 +443,32 @@ rule sam2bam: """ +rule ref_bwa_index: + input: + reference_file, + output: + "{}.bwt".format(reference_file), + params: + BWA=config.applications["bwa"], + log: + outfile="references/bwa_index.out.log", + errfile="references/bwa_index.err.log", + conda: + config.ref_bwa_index["conda"] + benchmark: + "references/ref_bwa_index.benchmark" + group: + "align" + resources: + disk_mb=1250, + mem_mb=config.ref_bwa_index["mem"], + time_min=config.ref_bwa_index["time"], + threads: 1 + shell: + """ + {params.BWA} index {input} 2> >(tee {log.errfile} >&2) + """ + if config.general["aligner"] == "bwa": def input_align_gz(wildcards): @@ -456,32 +482,6 @@ if config.general["aligner"] == "bwa": ) return list_output - rule ref_bwa_index: - input: - reference_file, - output: - "{}.bwt".format(reference_file), - params: - BWA=config.applications["bwa"], - log: - outfile="references/bwa_index.out.log", - errfile="references/bwa_index.err.log", - conda: - config.ref_bwa_index["conda"] - benchmark: - "references/ref_bwa_index.benchmark" - group: - "align" - resources: - disk_mb=1250, - mem_mb=config.ref_bwa_index["mem"], - time_min=config.ref_bwa_index["time"], - threads: 1 - shell: - """ - {params.BWA} index {input} 2> >(tee {log.errfile} >&2) - """ - rule bwa_align: # all indexing files: .amb .ann .bwt .fai .pac .sa input: diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index e43243ff4..7bbc2988d 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -365,6 +365,7 @@ visualizations = [] diversity_measures = [] datasets = [] IDs = [] +dehumanized_raw_reads = [] upload_markers = [] for p in patient_list: @@ -439,10 +440,15 @@ for p in patient_list: visualizations.append(os.path.join(sdir, "visualization/snv_calling.html")) visualizations.append(os.path.join(sdir, "visualization/alignment.html")) - upload_markers.append(os.path.join(sdir, "upload_prepared.touch")) + if config.output["dehumanized_raw_reads"]: + dehumanized_raw_reads.append(os.path.join(sdir, "raw_uploads", "dehuman.cram")) + + if config.output["upload"]: + upload_markers.append(os.path.join(sdir, "upload_prepared.touch")) # merge lists containing expected output - all_files = (alignments + consensus + results + 
visualizations + upload_markers) + all_files = (alignments + consensus + results + visualizations + + dehumanized_raw_reads + upload_markers) # diversity measures if not config.output["snv"] and config.output["diversity"]: @@ -586,7 +592,7 @@ def construct_input_fastq(wildcards): return list_output def temp_prefix(p): - return temp(os.path.join(config.general["temp_prefix"], p)) + return os.path.join(config.general["temp_prefix"], p) def temp_with_prefix(p): return temp(temp_prefix(p)) diff --git a/workflow/rules/dehuman.smk b/workflow/rules/dehuman.smk index e151839d9..6926b08ee 100644 --- a/workflow/rules/dehuman.smk +++ b/workflow/rules/dehuman.smk @@ -1,146 +1,246 @@ from functools import partial -# TODO split into multiple sub-rules -# TODO move cram compression into align.smk - -rule dehuman: +rule dh_redo_alignreject: + # this rule re-does an alignment to get the reject (useful if aligner's (temp) rejects have been deleted by now) + # TODO add support for rejected reads in align.smk (e.g. ngshmmalign already does /alignments/rejects.sam) input: global_ref=reference_file, - R1=partial(raw_data_file, pair=1), - R2=partial(raw_data_file, pair=2), + ref_index="{}.bwt".format(reference_file), + fastq=input_align_gz, output: - final_cram="{dataset}/raw_data/dehuman.cram", - ref_aln=temp_with_prefix("{dataset}/ref_aln.sam"), - filter=temp_with_prefix("{dataset}/dehuman.filter"), - filtered_1=temp_with_prefix("{dataset}/raw_data/filtered_1.fasta.gz"), - filtered_2=temp_with_prefix("{dataset}/raw_data/filtered_2.fasta.gz"), - reject_1=temp_with_prefix("{dataset}/reject_R1.fastq.gz"), - reject_2=temp_with_prefix("{dataset}/reject_R2.fastq.gz"), - h38_aln=temp_with_prefix("{dataset}/h38_aln.sam"), - h38_aln_cram=temp_with_prefix("{dataset}/h38_aln.cram"), - cram_sam=temp_with_prefix("{dataset}/raw_data/cram.sam"), - threads: config.dehuman["threads"] + tmp_aln=temp_with_prefix("{dataset}/alignments/tmp_aln.sam"), + # TODO add option to switch between "redo rejects" and "rely on aligner's output". 
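        # Rejects are recovered below with `samtools bam2fq -F 2`, i.e. reads whose
        # SAM FLAG bit 0x2 ("read mapped in proper pair") is unset; dh_filter then
        # counts/extracts host hits with the complementary `-f 2`. As a rough sketch:
        #
        #     def is_proper_pair(sam_flag: int) -> bool:
        #         return bool(sam_flag & 0x2)
        #
        # -F 2 keeps records where this is False (the rejects), -f 2 keeps the True ones.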
+ reject_1=temp_with_prefix("{dataset}/alignments/reject_R1.fastq.gz"), + reject_2=temp_with_prefix("{dataset}/alignments/reject_R2.fastq.gz"), params: BWA=config.applications["bwa"], SAMTOOLS=config.applications["samtools"], - HUMAN_GENOME=config.dehuman["ref_human"], - remove_reads_script=cachepath( - "../scripts/remove_reads_list.pl", executable=True, localsource=True - ), - stats="{dataset}/raw_data/dehuman.stats", - # set to 1 to trigger matches with human genome (used for testing): - F=2 conda: config.dehuman["conda"] + group: + "dehuman" + #"align" + resources: + disk_mb=1250, + mem_mb=config.bwa_align["mem"], + time_min=config.bwa_align["time"], + threads: config.bwa_align["threads"] # config.dehuman["threads"] shell: """ - # create index if not exists: - test -f {params.HUMAN_GENOME}.bwt || {params.BWA} index {params.HUMAN_GENOME} - - echo Filter out SARS-CoV-2 reads ------------------------------------- + echo "Filter out virus' reads -----------------------------------------" echo {params.BWA} mem -t {threads} \ - -o {output.ref_aln} \ - {input.global_ref} {input.R1} {input.R2} + -o {output.tmp_aln} \ + {input.global_ref} {input.fastq} echo - echo Keep reject ----------------------------------------------------- + echo "Keep reject -----------------------------------------------------" echo {params.SAMTOOLS} bam2fq -@ {threads} \ -F 2 \ -1 {output.reject_1} \ -2 {output.reject_2} \ - {output.ref_aln} - + {output.tmp_aln} echo - echo Checking rejects against Homo Sapiens --------------------------- + """ + + + +rule dh_hostalign: + input: + host_ref=config.dehuman["ref_host"], + ref_index="{}.bwt".format(config.dehuman["ref_host"]), + # TODO add option to switch between "redo rejects" and "rely on aligner's output". + reject_1=rules.dh_redo_alignreject.output.reject_1, + reject_2=rules.dh_redo_alignreject.output.reject_2, + output: + host_aln=temp_with_prefix("{dataset}/alignments/host_aln.sam"), + threads: config.dehuman["threads"] + params: + BWA=config.applications["bwa"], + conda: + config.dehuman["conda"] + group: + "dehuman" + #"align" + resources: + disk_mb=1250, + mem_mb=config.bwa_align["mem"], + time_min=config.bwa_align["time"], + threads: config.bwa_align["threads"] # config.dehuman["threads"] + shell: + # create index if not exists: + # test -f {input.ref_index} || {params.BWA} index {input.host_ref} + """ + echo "Checking rejects against Homo Sapiens ---------------------------" {params.BWA} mem -t {threads} \ - -o {output.h38_aln}\ - {params.HUMAN_GENOME} {output.reject_1} {output.reject_2} + -o {output.host_aln}\ + {input.host_ref} {input.reject_1} {input.reject_2} echo - echo Count aligned reads --------------------------------------------- + """ + + + +rule dh_filter: + input: + host_aln=temp_prefix("{dataset}/alignments/host_aln.sam"), + R1=partial(raw_data_file, pair=1), + R2=partial(raw_data_file, pair=2), + output: + filter_count="{dataset}/alignments/dehuman.count", + filter_list=temp_with_prefix("{dataset}/alignments/dehuman.filter"), + # TODO shift to pipe + filtered_1=temp_with_prefix("{dataset}/raw_uploads/filtered_1.fasta.gz"), + filtered_2=temp_with_prefix("{dataset}/raw_uploads/filtered_2.fasta.gz"), + params: + SAMTOOLS=config.applications["samtools"], + remove_reads_script=cachepath( + "../scripts/remove_reads_list.pl", executable=True, localsource=True + ), # TODO shoo out the cats + keep_host=int(config.dehuman["keep_host"]), + host_aln_cram="{dataset}/alignments/host_aln.cram", + stats="{dataset}/alignments/dehuman.stats", + # set to 1 to 
trigger matches with human genome (used for testing): + F=2 + conda: + config.dehuman["conda"] + group: + "dehuman" + resources: + disk_mb=1250, + mem_mb=config.bwa_align["mem"], + time_min=config.bwa_align["time"], + threads: config.dehuman["threads"] + shell: + """ + # using zcat FILENAME.gz causes issues on Mac, see + # https://serverfault.com/questions/570024/ + # redirection fixes this: + unpack_rawreads() {{ + for fq in "${{@}}"; do + zcat -f < "${{fq}}" + done + }} + + echo + echo "Count aligned reads ---------------------------------------------" echo - count=$({params.SAMTOOLS} view -@ {threads} -c -f {params.F} {output.h38_aln}) + count=$({params.SAMTOOLS} view -@ {threads} -c -f {params.F} {input.host_aln}) + echo "${{count}}" > {output.filter_count} if (( count > 0 )); then echo - echo ----------------------------------------------------------------- + echo "-----------------------------------------------------------------" echo "Needs special care: ${{count}} potential human reads found" - echo ----------------------------------------------------------------- + echo "-----------------------------------------------------------------" echo - echo Removing identified human reads from raw reads ------------------ + echo "Removing identified human reads from raw reads ------------------" echo # get list {params.SAMTOOLS} view -@ {threads} \ -f {params.F} \ - {output.h38_aln} \ - | cut -f 1 > {output.filter} - - # using zcat FILENAME.gz causes issues on Mac, see - # https://serverfault.com/questions/570024/ - # redirection fixes this: - zcat < {wildcards.dataset}/raw_data/*_R1.fastq.gz \ - | {params.remove_reads_script} {output.filter} \ + {input.host_aln} \ + | cut -f 1 > {output.filter_list} + + unpack_rawreads {input.R1} \ + | {params.remove_reads_script} {output.filter_list} \ | gzip \ > {output.filtered_1} & - zcat < {wildcards.dataset}/raw_data/*_R2.fastq.gz \ - | {params.remove_reads_script} {output.filter} \ + unpack_rawreads {input.R2} \ + | {params.remove_reads_script} {output.filter_list} \ | gzip \ > {output.filtered_2} & wait - # keep the rejects for further analysis + if (( {params.keep_host} )); then + # keep the rejects for further analysis - echo - echo Keeping Human-aligned SARS-CoV-2 rejects ------------------------- - echo + echo + echo "Keeping Human-aligned virus' rejects -----------------------------" + echo - # (we compress reference-less, because the reference size is larger - # than the contaminant reads) - FMT=cram,no_ref,use_bzip2,use_lzma,level=9,seqs_per_slice=1000000 - {params.SAMTOOLS} sort -@ {threads} \ - -M \ - --output-fmt ${{FMT}} \ - -o {output.h38_aln_cram} \ - {output.h38_aln} + # (we compress reference-less, because the reference size is larger + # than the contaminant reads) + FMT=cram,no_ref,use_bzip2,use_lzma,level=9,seqs_per_slice=1000000 + {params.SAMTOOLS} sort -@ {threads} \ + -M \ + --output-fmt ${{FMT}} \ + -o {params.host_aln_cram} \ + {input.host_aln} - echo - echo Compressing human-depleted raw reads ----------------------------- - echo - - {params.SAMTOOLS} index -@ {threads} {output.h38_aln_cram} - {params.SAMTOOLS} stats -@ {threads} {output.h38_aln_cram} > {params.stats} + echo + echo "Compressing human-depleted raw reads -----------------------------" + echo + {params.SAMTOOLS} index -@ {threads} {params.host_aln_cram} + {params.SAMTOOLS} stats -@ {threads} {params.host_aln_cram} > {params.stats} + fi else echo - echo No potential human reads found ----------------------------------- - echo Copy raw reads file + echo "No 
potential human reads found -----------------------------------" + echo "Copy raw reads file" echo - cp {wildcards.dataset}/raw_data/*_R1.fastq.gz {output.filtered_1} & - cp {wildcards.dataset}/raw_data/*_R2.fastq.gz {output.filtered_2} & + # HACK cheating as currently we only use gziped data + #unpack_rawreads {input.R1} | gzip > {output.filtered_1} & + #unpack_rawreads {input.R2} | gzip > {output.filtered_2} & + cat {input.R1} > {output.filtered_1} & + cat {input.R2} > {output.filtered_2} & wait - touch {output.filter} - touch {output.h38_aln_cram} + touch {output.filter_list} fi - echo - echo Compress filtered sequences -------------------------------------- + """ + + + +# TODO move cram compression into align.smk + +rule dehuman: + input: + global_ref=reference_file, + filtered_1=rules.dh_filter.output.filtered_1, # =temp_prefix("{dataset}/raw_uploads/filtered_1.fasta.gz"), + filtered_2=rules.dh_filter.output.filtered_2, # =temp_prefix("{dataset}/raw_uploads/filtered_2.fasta.gz"), + output: + cram_sam=temp_with_prefix("{dataset}/raw_uploads/dehuman.sam"), + final_cram="{dataset}/raw_uploads/dehuman.cram", + checksum="{dataset}/raw_uploads/dehuman.cram.%s" % config.general["checksum"], + params: + BWA=config.applications["bwa"], + SAMTOOLS=config.applications["samtools"], + checksum_type=config.general["checksum"], + conda: + config.dehuman["conda"] + group: + "dehuman" + resources: + disk_mb=1250, + mem_mb=config.bwa_align["mem"], + time_min=config.bwa_align["time"], + threads: config.dehuman["threads"] + shell: + """ + echo "Compress filtered sequences --------------------------------------" echo {params.BWA} mem -t {threads} \ -C \ -o {output.cram_sam} \ - {input.global_ref} {output.filtered_1} {output.filtered_2} + {input.global_ref} {input.filtered_1} {input.filtered_2} + # HACK handle incompatibilities between: + # - Illumina's 'bcl2fastq', which write arbitrary strings + # - 'bwa mem' which keep comments in the SAM file verbatim as in the FASTQ file + # - 'samtools' which expects comment to be properly marked as 'BC:Z:' + # as per SAM format specs REGEXP=\'s{{(?<=\\t)([[:digit:]]:[[:upper:]]:[[:digit:]]:[ATCGN]+(\+[ATCGN]+))$}}{{BC:Z:\\1}}\' FMT=cram,embed_ref,use_bzip2,use_lzma,level=9,seqs_per_slice=1000000 @@ -150,6 +250,9 @@ rule dehuman: --reference {input.global_ref} \ --output-fmt ${{FMT}} \ -o {output.final_cram} + + {params.checksum_type}sum {output.final_cram} > {output.checksum} + echo echo DONE ------------------------------------------------------------- echo diff --git a/workflow/schemas/config_schema.json b/workflow/schemas/config_schema.json index b95ea9cc6..a0ab43ca6 100644 --- a/workflow/schemas/config_schema.json +++ b/workflow/schemas/config_schema.json @@ -41,6 +41,13 @@ "description": "This option should be used to specify the default number of threads for all multi-threaded rules. That is, unless the number of threads is specified for each rule, this value is set as default.", "examples": [4] }, + "checksum": { + "type": "string", + "default": "md5", + "enum": ["md5", "sha1", "sha256", "sha224", "sha384", "sha512", "xxh64", "xxh32", "xxh128"], + "description": "Sets the algorithm to be used when computing checksums for uploadable data.", + "examples": ["sha256"] + }, "temp_prefix": { "type": "string", "default": "", @@ -178,7 +185,13 @@ "dehumanized_raw_reads": { "type": "boolean", "default": false, - "description": "This option turns on dehumanization of the raw reads", + "description": "This option turns on dehumanization of the raw reads (i.e. 
removal of host's reads). This is useful to prepare raw reads for upload on public databases such as, e.g. ENA (European Nucleotide Archive)", + "examples": [true] + }, + "upload": { + "type": "boolean", + "default": false, + "description": "This option can be used for assistance in incremental upload of data", "examples": [true] } }, @@ -1255,9 +1268,17 @@ "type": "integer", "default": 16 }, - "ref_human": { + "ref_host": { "type": "string", - "default": "references/human.fa" + "default": "references/human.fa", + "description": "Host's genome used to remove reads (e.g. human genome)", + "examples": ["/cluster/project/igenomes/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/genome.fa"] + }, + "keep_host": { + "type": "boolean", + "default": false, + "description": "Indicate whether to store the host-aligned reads in a CRAM file `…/alignments/host_aln.cram`.", + "examples": [ true ] } }, "default": {}, From 2737e3d337deda9d8ef6f8c8561b9b9d76ac85eb Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Fri, 18 Feb 2022 02:59:37 +0100 Subject: [PATCH 13/33] Making prepare_upload more generic: - coreutil on non-linux platforms (realpath, shasum, etc.) - optional inputs (dehuman, fasta checksums, etc.) --- workflow/envs/report_sequences.yaml | 4 -- workflow/envs/upload.yaml | 7 +++ workflow/rules/publish.smk | 77 +++++++++++++++++++++-------- workflow/schemas/config_schema.json | 73 ++++++++++++++++++++++----- 4 files changed, 125 insertions(+), 36 deletions(-) delete mode 100644 workflow/envs/report_sequences.yaml create mode 100644 workflow/envs/upload.yaml diff --git a/workflow/envs/report_sequences.yaml b/workflow/envs/report_sequences.yaml deleted file mode 100644 index f389154ef..000000000 --- a/workflow/envs/report_sequences.yaml +++ /dev/null @@ -1,4 +0,0 @@ -channels: - - conda-forge -dependencies: - - biopython diff --git a/workflow/envs/upload.yaml b/workflow/envs/upload.yaml new file mode 100644 index 000000000..d3b4db650 --- /dev/null +++ b/workflow/envs/upload.yaml @@ -0,0 +1,7 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - xxhash + - coreutils # [not linux] diff --git a/workflow/rules/publish.smk b/workflow/rules/publish.smk index 14d45cd63..e1b430191 100644 --- a/workflow/rules/publish.smk +++ b/workflow/rules/publish.smk @@ -1,41 +1,76 @@ -localrules: - prepare_upload +if config.upload["conda"]: + localrules: + prepare_upload +def bcft_suffix(): + return '' if config.upload["consensus"] == "majority" else f'_{config.upload["consensus"]}' rule prepare_upload: input: R1=partial(raw_data_file, pair=1), R2=partial(raw_data_file, pair=2), - consensus_aligned="{dataset}/references/ref_majority_dels.fasta", - consensus_unaligned="{dataset}/references/consensus_ambig.bcftools.fasta", + dehuman=["{dataset}/raw_uploads/dehuman.cram", "{dataset}/raw_uploads/dehuman.cram.%s" % config.general["checksum"] ] if config.output["dehumanized_raw_reads"] else [ ], + consensus_indels="{dataset}/references/consensus%s.bcftools.fasta" % bcft_suffix(), + consensus_indels_chain="{dataset}/references/consensus%s.bcftools.chain" % bcft_suffix(), + consensus_aligned="{dataset}/references/ref_majority_dels.fasta", # % config.upload["consensus"], + csum=[ "{dataset}/references/consensus%(suf)s.bcftools.fasta.%(csum)s" % { "suf": bcft_suffix(), "csum":config.general["checksum"] }, + "{dataset}/references/ref_majority_dels.fasta.%(csum)s" % { "suf": bcft_suffix(), "csum":config.general["checksum"] } ] if config.upload["checksum"] else [ ], output: 
upload_prepared_touch="{dataset}/upload_prepared.touch" - threads: 1 - + params: + sample_id=ID + conda: + # NOTE realpath is a gnu coreutils executable and not available out of the box. We need a conda environment anyway + config.upload["conda"] + resources: + disk_mb=1000, + mem_mb=config.upload["mem"], + time_min=config.upload["time"], + threads: + config.upload["threads"] shell: """ - to_upload=({input}) - - # depending on config dehuman might not exist, this is why we - # did not add it to the input fields: - to_upload+=({wildcards.dataset}/raw_data/dehuman.cram) + to_upload=( {input} ) - for p in ${{to_upload[@]}}; do - test -e $p || continue - fixed_p=$(realpath --relative-to {wildcards.dataset}/uploads $p) - ( set -x; ln -f -s $fixed_p {wildcards.dataset}/uploads ) + mkdir -p "{wildcards.dataset}/uploads/" + for p in "${{to_upload[@]}}"; do + test -e "$p" || continue + fixed_p=$(realpath --relative-to "{wildcards.dataset}/uploads/" "$p") + ( set -x; ln -f -s "$fixed_p" "{wildcards.dataset}/uploads/" ) done - mkdir -p uploads + mkdir -p uploads/ - sample_id=$(basename $(dirname {wildcards.dataset})) - fixed_uploads=$(realpath --relative-to uploads {wildcards.dataset}/uploads) + sample_id={params.sample_id} + fixed_uploads=$(realpath --relative-to "uploads" "{wildcards.dataset}/uploads/") # make unique symbolic link: - random=$(dd if=/dev/urandom bs=30 count=1 2>/dev/null | sha1sum -b | cut -d" " -f 1) - unique_id=${{sample_id}}__${{random}} + read random o < <(dd if=/dev/urandom bs=30 count=1 2>/dev/null | sha1sum -b) + unique_id="${{sample_id}}__${{random}}" - ( set -x; ln -s $fixed_uploads uploads/$unique_id ) + ( set -x; ln -s "$fixed_uploads" "uploads/$unique_id" ) touch {output.upload_prepared_touch} """ + + +rule checksum: + input: + "{file}" + output: + "{file}.%s" % config.general["checksum"] + params: + checksum_type=config.general["checksum"], + conda: + config["upload"]["conda"] + resources: + disk_mb=10, + mem_mb=config.checksum["mem"], + time_min=config.checksum["time"], + threads: 1 + shell: + """ + {params.checksum_type}sum {input} > {output} + """ + +ruleorder: dehuman > checksum diff --git a/workflow/schemas/config_schema.json b/workflow/schemas/config_schema.json index a0ab43ca6..532e8b789 100644 --- a/workflow/schemas/config_schema.json +++ b/workflow/schemas/config_schema.json @@ -191,7 +191,7 @@ "upload": { "type": "boolean", "default": false, - "description": "This option can be used for assistance in incremental upload of data", + "description": "This option can be used for assistance in incremental upload of data. 
See section `upload` for an example.", "examples": [true] } }, @@ -1248,16 +1248,6 @@ "default": {}, "type": "object" }, - "report_sequences": { - "properties": { - "conda": { - "type": "string", - "default": "{VPIPE_BASEDIR}/envs/report_sequences.yaml" - } - }, - "default": {}, - "type": "object" - }, "dehuman": { "properties": { "conda": { @@ -1283,6 +1273,67 @@ }, "default": {}, "type": "object" + }, + "checksum": { + "properties": { + "mem": { + "type": "integer", + "default": 256 + }, + "time": { + "type": "integer", + "default": 60 + }, + "conda": { + "type": "string", + "default": "{VPIPE_BASEDIR}/envs/upload.yaml" + } + }, + "default": {}, + "type": "object" + }, + "upload": { + "properties": { + "mem": { + "type": "integer", + "default": 256 + }, + "time": { + "type": "integer", + "default": 60 + }, + "conda": { + "type": "string", + "default": "{VPIPE_BASEDIR}/envs/upload.yaml", + "description": "The default environment only provides hashing functions (`xxhash`, linux coreutils' `sha`_{nnn}_`sum` collection, etc.) but depending on your needs you would want to provide a custom environment with additional tools (e.g. `sftp`, `rsync`, `curl`, `lftp`, custom specialized cloud uploaders, etc.)", + "example": ["custom_envs/ftp.yaml"] + }, + "threads": { + "type": "integer", + "default": 1 + }, + "local": { + "type": "boolean", + "default": true, + "description": "Don't dispatch the rule to the cluster for execution, run locally.", + "examples": [ false ] + }, + "consensus": { + "type": "string", + "default": "ambig", + "enum": ["ambig", "majority"], + "description": "When preparing data for upload, specifies which consensus sequence should be uploaded", + "examples": ["majority"] + }, + "checksum": { + "type": "boolean", + "default": false, + "description": "Generate checksum for each individual consensus sequence (if a consensus is regenerated, it will help determine whether to whether the new file has changed content or is virtually the same as the previous)", + "examples": [ true ] + } + }, + "default": {}, + "type": "object" } } } From 7ade4cbf34f9c852e7074bb6debb7cb4ebc2b0ad Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Fri, 18 Feb 2022 04:33:55 +0100 Subject: [PATCH 14/33] prepare_upload relies on external script - allow user to tailor the script to their needs e.g. uploading to an SFTP server, etc. --- workflow/rules/publish.smk | 26 +----- workflow/schemas/config_schema.json | 13 +++ workflow/scripts/prepare_upload_symlinks.sh | 91 +++++++++++++++++++++ 3 files changed, 108 insertions(+), 22 deletions(-) create mode 100755 workflow/scripts/prepare_upload_symlinks.sh diff --git a/workflow/rules/publish.smk b/workflow/rules/publish.smk index e1b430191..ec9a1e019 100644 --- a/workflow/rules/publish.smk +++ b/workflow/rules/publish.smk @@ -18,7 +18,9 @@ rule prepare_upload: output: upload_prepared_touch="{dataset}/upload_prepared.touch" params: - sample_id=ID + sample_id=ID, + script=cachepath(config.upload["script"], executable=True), + options=config.upload["options"], conda: # NOTE realpath is a gnu coreutils executable and not available out of the box. 
We need a conda environment anyway config.upload["conda"] @@ -30,27 +32,7 @@ rule prepare_upload: config.upload["threads"] shell: """ - to_upload=( {input} ) - - mkdir -p "{wildcards.dataset}/uploads/" - for p in "${{to_upload[@]}}"; do - test -e "$p" || continue - fixed_p=$(realpath --relative-to "{wildcards.dataset}/uploads/" "$p") - ( set -x; ln -f -s "$fixed_p" "{wildcards.dataset}/uploads/" ) - done - - mkdir -p uploads/ - - sample_id={params.sample_id} - fixed_uploads=$(realpath --relative-to "uploads" "{wildcards.dataset}/uploads/") - - # make unique symbolic link: - read random o < <(dd if=/dev/urandom bs=30 count=1 2>/dev/null | sha1sum -b) - unique_id="${{sample_id}}__${{random}}" - - ( set -x; ln -s "$fixed_uploads" "uploads/$unique_id" ) - - touch {output.upload_prepared_touch} + {params.script} {params.options} "{output.upload_prepared_touch}" "{params.sample_id}" "{wildcards.dataset}" {input} """ diff --git a/workflow/schemas/config_schema.json b/workflow/schemas/config_schema.json index 532e8b789..b0b12707e 100644 --- a/workflow/schemas/config_schema.json +++ b/workflow/schemas/config_schema.json @@ -1293,6 +1293,7 @@ "type": "object" }, "upload": { + "description": "This section is used to assist and prepare uploads of the data, e.g. to European Nucleotide Archive. By default it calls a script that creates symlinks making it easy to identify new/updated samples between calls of V-pipe. But by using the properties `script` and `options` and adapting the environment provided in property `conda`, it is possible to heavily customize the actions (e.g. it is possible to upload to an SFTP server by calling `sftp` from a modified script). For inspiration, see the default script `prepare_upload_symlinks.sh`.", "properties": { "mem": { "type": "integer", @@ -1330,6 +1331,18 @@ "default": false, "description": "Generate checksum for each individual consensus sequence (if a consensus is regenerated, it will help determine whether to whether the new file has changed content or is virtually the same as the previous)", "examples": [ true ] + }, + "script": { + "type": "string", + "default": "{VPIPE_BASEDIR}/scripts/prepare_upload_symlinks.sh", + "description": "Custom script that assists and prepares uploads.\n\nIt will receive the following positional parameters:\n - __: the output file that must be created by the script.\n - __: a string (with no path separator slashes) that can be used as a name, uniquely identifying the sample and the date.\n - __: the base directory of the sample.\n - __...: a list of files to consider for upload\n\nFor an example, see the default script `prepare_upload_symlinks.sh`, it generates symlinks that help tracking which samples are new and/or updated between runs of V-pipe and thus should be considered for upload.", + "example": ["custom_scripts/uploader.py"] + }, + "options": { + "type": "string", + "default": "", + "description": "named options to be passed to the script, before the positional parameters. E.g. for an extra configuration file with SFTP server informations.", + "example": ["--config custom_scripts/uploader.yaml --"] } }, "default": {}, diff --git a/workflow/scripts/prepare_upload_symlinks.sh b/workflow/scripts/prepare_upload_symlinks.sh new file mode 100755 index 000000000..caa25f409 --- /dev/null +++ b/workflow/scripts/prepare_upload_symlinks.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash + +# +# Example script to prepare and assist uploads. 
+# - it will get called whenever the consensus FASTA files or the CRAM-compressed raw-reads are updated by snakemake +# - it will create a per-sample directory '.../uploads/' with symlinks to all uploadable files +# - it will crate a global directory 'uploads' with symlinks to samples that were updated +# - these file aren't tracked by snakemake's DAG +# - thus they can be deleted without triggering a re-build by snakemake +# - but they will be re-created whenever an input file dependency changes +# - it is possible to iteratively scan this global directory between runs of V-pipe to determine +# which are new/updated samples to consider for upload +# - meanwhile the OUTPUT file is tracked by the snakemake DAG +# - it must be provided +# - its destruction will retrigger the prepare_upload rule and this script +# - it can optionally be used by the script to store arbitrary data (e.g. json) +# + +usage() { echo "Usage: $0 [ -h ] [ -n ] [ -- ] [ ... ] + +options: + -n : no random nonce at the end of the global symlinks, + they will be not unique + -- : end of options, start of positional parameters + +positional parameters: + : the output file that must be created by the rule + : a string (with no path separator slashes) that uniquely + identifies the sample and the date + : the base directory of the sample + : a list of files to consider for upload + +Generates symlinks that help tracking new and updated samples to consider +for upload. Serves also as a demo for the upload parameters of V-pipe." 1>&2; exit "$1"; } + +# NOTE it is possible to have named options (e.g. with getops) before the named options begin +do_random_nonce=1 +while getopts "nh" o; do + case "${o}" in + n) do_random_nonce=0 ;; + h) usage 0 ;; + *) usage 1 ;; + esac +done +shift $((OPTIND-1)) + + + + +## get the positional parameters + +output="${1}" +sample_id="${2//\//_}" +sample_dir="${3}" +shift 3 +to_upload=( "${@}" ) + + + +## process + +# create and populate per-sample directory +mkdir -p "${sample_dir}/uploads/" + +for p in "${to_upload[@]}"; do + test -e "${p}" || continue + fixed_p="$(realpath --relative-to "${sample_dir}/uploads/" "${p}")" + ( set -x; ln -f -s "$fixed_p" "${sample_dir}/uploads/" ) +done + + +# create and populare global directory +mkdir -p uploads/ + +fixed_uploads="$(realpath --relative-to "uploads" "${sample_dir}/uploads/")" + +# make unique symbolic link: +if (( do_random_nonce )); then + read random o < <(dd if=/dev/urandom bs=30 count=1 2>/dev/null | sha1sum -b) + unique_id="${sample_id}__${random}" + force="" +else + unique_id="${sample_id}" + force="f" +fi + +( set -x; ln "-s${force}" "$fixed_uploads" "uploads/$unique_id" ) + + +# create the mandatory output file +touch "${output}" From 7a8fd3c529a96726eff011cf00cacf5eb4ed011b Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Fri, 18 Feb 2022 04:54:51 +0100 Subject: [PATCH 15/33] keep the linters happy --- workflow/Snakefile | 3 +- workflow/rules/align.smk | 15 +++---- workflow/rules/common.smk | 16 +++++-- workflow/rules/dehuman.smk | 16 ++++--- workflow/rules/publish.smk | 46 +++++++++++++++------ workflow/rules/quality_assurance.smk | 13 +++--- workflow/scripts/prepare_upload_symlinks.sh | 2 +- 7 files changed, 72 insertions(+), 39 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index b54637e2c..442bafe83 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -18,7 +18,8 @@ functions = srcdir("scripts/functions.sh") # DUMMY RULES rule all: input: - all_files + all_files, + rule alltrimmed: input: diff 
--git a/workflow/rules/align.smk b/workflow/rules/align.smk index 61414618a..83d4c9564 100644 --- a/workflow/rules/align.smk +++ b/workflow/rules/align.smk @@ -331,9 +331,10 @@ if config.general["aligner"] == "ngshmmalign": mv {wildcards.dataset}/{{alignments,references}}/ref_ambig.fasta mv {wildcards.dataset}/{{alignments,references}}/ref_majority.fasta """ +# 3. construct MSA from all patient files + -# 3. construct MSA from all patient files def construct_msa_input_files(wildcards): output_list = ["{}{}.fasta".format(s, wildcards.kind) for s in references] output_list.append(reference_file) @@ -406,11 +407,10 @@ if config.general["aligner"] == "ngshmmalign": """ {params.CONVERT_REFERENCE} -t {params.REF_NAME} -m {input.REF_ambig} -i {input.BAM} -o {output} > {log.outfile} 2> >(tee {log.errfile} >&2) """ - - # 2-4. Alternative: align reads using bwa or bowtie + rule sam2bam: input: temp_prefix("{file}.sam"), @@ -447,7 +447,7 @@ rule ref_bwa_index: input: reference_file, output: - "{}.bwt".format(reference_file), + "{}.bwt".format(reference_file), #all indexing files: .amb .ann .bwt .fai .pac .sa params: BWA=config.applications["bwa"], log: @@ -469,6 +469,7 @@ rule ref_bwa_index: {params.BWA} index {input} 2> >(tee {log.errfile} >&2) """ + if config.general["aligner"] == "bwa": def input_align_gz(wildcards): @@ -483,7 +484,6 @@ if config.general["aligner"] == "bwa": return list_output rule bwa_align: - # all indexing files: .amb .ann .bwt .fai .pac .sa input: FASTQ=input_align_gz, REF=reference_file, @@ -501,7 +501,7 @@ if config.general["aligner"] == "bwa": errfile="{dataset}/alignments/bwa_align.err.log", conda: config.bwa_align["conda"] - # shadow: "minimal" # HACK way too many indexing files, using explicit OUT instead + #shadow: "minimal" # HACK way too many indexing files, using explicit OUT instead benchmark: "{dataset}/alignments/bwa_align.benchmark" group: @@ -641,9 +641,10 @@ elif config.general["aligner"] == "bowtie": {params.SAMTOOLS} view -h -F 4 -F 2048 -o "{output.REF}" "{output.TMP_SAM} 2> >(tee -a {log.errfile} >&2) rm {params.TMP_SAM} """ +# NOTE ngshmmalignb also generate consensus so check there too. + -# NOTE ngshmmalignb also generate consensus so check there too. 
# if config.general["aligner"] == "ngshmmalign": # # ruleorder: convert_to_ref > sam2bam diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 7bbc2988d..87e1c2dd5 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -447,8 +447,14 @@ for p in patient_list: upload_markers.append(os.path.join(sdir, "upload_prepared.touch")) # merge lists containing expected output - all_files = (alignments + consensus + results + visualizations - + dehumanized_raw_reads + upload_markers) + all_files = ( + alignments + + consensus + + results + + visualizations + + dehumanized_raw_reads + + upload_markers + ) # diversity measures if not config.output["snv"] and config.output["diversity"]: @@ -537,11 +543,13 @@ def get_maxins(wildcards): def rebase_datadir(base, dataset): return os.path.join(base, *os.path.normpath(dataset).split(os.path.sep)[-2:]) + def raw_data_file(wildcards, pair): - for p in os.listdir('{dataset}/raw_data'.format(dataset=wildcards.dataset)): + for p in os.listdir("{dataset}/raw_data".format(dataset=wildcards.dataset)): if re.search(r".*R{pair}\.(fastq\.gz|fastq|fq|fq\.gz)$".format(pair=pair), p): return os.path.join(wildcards.dataset, "raw_data", p) + # TODO replace with raw_data_file def construct_input_fastq(wildcards): @@ -591,8 +599,10 @@ def construct_input_fastq(wildcards): return list_output + def temp_prefix(p): return os.path.join(config.general["temp_prefix"], p) + def temp_with_prefix(p): return temp(temp_prefix(p)) diff --git a/workflow/rules/dehuman.smk b/workflow/rules/dehuman.smk index 6926b08ee..72906802e 100644 --- a/workflow/rules/dehuman.smk +++ b/workflow/rules/dehuman.smk @@ -19,13 +19,13 @@ rule dh_redo_alignreject: conda: config.dehuman["conda"] group: - "dehuman" #"align" + "dehuman" resources: disk_mb=1250, mem_mb=config.bwa_align["mem"], time_min=config.bwa_align["time"], - threads: config.bwa_align["threads"] # config.dehuman["threads"] + threads: config.bwa_align["threads"] # config.dehuman["threads"] shell: """ echo "Filter out virus' reads -----------------------------------------" @@ -48,7 +48,6 @@ rule dh_redo_alignreject: """ - rule dh_hostalign: input: host_ref=config.dehuman["ref_host"], @@ -65,12 +64,11 @@ rule dh_hostalign: config.dehuman["conda"] group: "dehuman" - #"align" resources: disk_mb=1250, mem_mb=config.bwa_align["mem"], time_min=config.bwa_align["time"], - threads: config.bwa_align["threads"] # config.dehuman["threads"] + threads: config.bwa_align["threads"] # config.dehuman["threads"] shell: # create index if not exists: # test -f {input.ref_index} || {params.BWA} index {input.host_ref} @@ -85,7 +83,6 @@ rule dh_hostalign: """ - rule dh_filter: input: host_aln=temp_prefix("{dataset}/alignments/host_aln.sam"), @@ -101,12 +98,13 @@ rule dh_filter: SAMTOOLS=config.applications["samtools"], remove_reads_script=cachepath( "../scripts/remove_reads_list.pl", executable=True, localsource=True - ), # TODO shoo out the cats + ), + # TODO shoo out the cats keep_host=int(config.dehuman["keep_host"]), host_aln_cram="{dataset}/alignments/host_aln.cram", stats="{dataset}/alignments/dehuman.stats", # set to 1 to trigger matches with human genome (used for testing): - F=2 + F=2, conda: config.dehuman["conda"] group: @@ -201,9 +199,9 @@ rule dh_filter: """ - # TODO move cram compression into align.smk + rule dehuman: input: global_ref=reference_file, diff --git a/workflow/rules/publish.smk b/workflow/rules/publish.smk index ec9a1e019..fadf18601 100644 --- a/workflow/rules/publish.smk +++ b/workflow/rules/publish.smk 
@@ -1,22 +1,42 @@ if config.upload["conda"]: + localrules: - prepare_upload + prepare_upload, + def bcft_suffix(): - return '' if config.upload["consensus"] == "majority" else f'_{config.upload["consensus"]}' + return ( + "" + if config.upload["consensus"] == "majority" + else f'_{config.upload["consensus"]}' + ) + rule prepare_upload: input: R1=partial(raw_data_file, pair=1), R2=partial(raw_data_file, pair=2), - dehuman=["{dataset}/raw_uploads/dehuman.cram", "{dataset}/raw_uploads/dehuman.cram.%s" % config.general["checksum"] ] if config.output["dehumanized_raw_reads"] else [ ], - consensus_indels="{dataset}/references/consensus%s.bcftools.fasta" % bcft_suffix(), - consensus_indels_chain="{dataset}/references/consensus%s.bcftools.chain" % bcft_suffix(), - consensus_aligned="{dataset}/references/ref_majority_dels.fasta", # % config.upload["consensus"], - csum=[ "{dataset}/references/consensus%(suf)s.bcftools.fasta.%(csum)s" % { "suf": bcft_suffix(), "csum":config.general["checksum"] }, - "{dataset}/references/ref_majority_dels.fasta.%(csum)s" % { "suf": bcft_suffix(), "csum":config.general["checksum"] } ] if config.upload["checksum"] else [ ], + dehuman=[ + "{dataset}/raw_uploads/dehuman.cram", + "{dataset}/raw_uploads/dehuman.cram.%s" % config.general["checksum"], + ] + if config.output["dehumanized_raw_reads"] + else [], + consensus_indels="{dataset}/references/consensus%s.bcftools.fasta" + % bcft_suffix(), + consensus_indels_chain="{dataset}/references/consensus%s.bcftools.chain" + % bcft_suffix(), + consensus_aligned="{dataset}/references/ref_majority_dels.fasta", # % config.upload["consensus"], + csum=[ + "{dataset}/references/consensus%(suf)s.bcftools.fasta.%(csum)s" + % {"suf": bcft_suffix(), "csum": config.general["checksum"]}, + "{dataset}/references/ref_majority_dels.fasta.%(csum)s" + % {"suf": bcft_suffix(), "csum": config.general["checksum"]}, + ] + if config.upload["checksum"] + else [], output: - upload_prepared_touch="{dataset}/upload_prepared.touch" + upload_prepared_touch="{dataset}/upload_prepared.touch", params: sample_id=ID, script=cachepath(config.upload["script"], executable=True), @@ -28,8 +48,7 @@ rule prepare_upload: disk_mb=1000, mem_mb=config.upload["mem"], time_min=config.upload["time"], - threads: - config.upload["threads"] + threads: config.upload["threads"] shell: """ {params.script} {params.options} "{output.upload_prepared_touch}" "{params.sample_id}" "{wildcards.dataset}" {input} @@ -38,9 +57,9 @@ rule prepare_upload: rule checksum: input: - "{file}" + "{file}", output: - "{file}.%s" % config.general["checksum"] + "{file}.%s" % config.general["checksum"], params: checksum_type=config.general["checksum"], conda: @@ -55,4 +74,5 @@ rule checksum: {params.checksum_type}sum {input} > {output} """ + ruleorder: dehuman > checksum diff --git a/workflow/rules/quality_assurance.smk b/workflow/rules/quality_assurance.smk index 3c28bead8..a72461298 100644 --- a/workflow/rules/quality_assurance.smk +++ b/workflow/rules/quality_assurance.smk @@ -31,8 +31,8 @@ rule gunzip: rule extract: input: - construct_input_fastq, # TODO replace with raw_data_file + construct_input_fastq, output: temp( os.path.join( @@ -52,10 +52,10 @@ rule extract: time_min=config.extract["time"], threads: 1 shell: + # TODO replace with better dedicated software """ cat {input} | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > {output} 2> >(tee {log.errfile} >&2) """ - # TODO replace with better dedicated software # 2. 
clipping @@ -95,7 +95,8 @@ if config.input["paired"]: "minimal" benchmark: "{dataset}/preprocessed_data/prinseq.benchmark" - # group: 'preprocessing' + #group: + #'preprocessing' resources: disk_mb=20000, mem_mb=config.preprocessing["mem"], @@ -147,7 +148,8 @@ else: "minimal" benchmark: "{dataset}/preprocessed_data/prinseq.benchmark" - # group: 'preprocessing' + #group: + #'preprocessing' resources: disk_mb=10000, mem_mb=config.preprocessing["mem"], @@ -172,9 +174,10 @@ else: gzip {wildcards.dataset}/preprocessed_data/R1.fastq """ +# 3. QC reports + -# 3. QC reports rule fastqc: input: os.path.join( diff --git a/workflow/scripts/prepare_upload_symlinks.sh b/workflow/scripts/prepare_upload_symlinks.sh index caa25f409..af34f721e 100755 --- a/workflow/scripts/prepare_upload_symlinks.sh +++ b/workflow/scripts/prepare_upload_symlinks.sh @@ -76,7 +76,7 @@ fixed_uploads="$(realpath --relative-to "uploads" "${sample_dir}/uploads/")" # make unique symbolic link: if (( do_random_nonce )); then - read random o < <(dd if=/dev/urandom bs=30 count=1 2>/dev/null | sha1sum -b) + read -r random o < <(dd if=/dev/urandom bs=30 count=1 2>/dev/null | sha1sum -b) unique_id="${sample_id}__${random}" force="" else From 9ff1fb5b39f1c600b65a567525e67a57d3ff3609 Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Mon, 21 Feb 2022 19:48:26 +0100 Subject: [PATCH 16/33] Who let the cats out? --- .mega-linter.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.mega-linter.yml b/.mega-linter.yml index 8959909ee..9350792fc 100644 --- a/.mega-linter.yml +++ b/.mega-linter.yml @@ -9,6 +9,7 @@ ENABLE_LINTERS: - SNAKEMAKE_SNAKEFMT - MARKDOWN_MARKDOWNLINT - JUPYTER_JUPYFMT + - PERL_PERLCRITIC VALIDATE_ALL_CODEBASE: true FORMATTERS_DISABLE_ERRORS: false From e96ca9c7758bdd807d01e07a816601e860ea7361 Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Mon, 21 Feb 2022 20:04:48 +0100 Subject: [PATCH 17/33] Circumventing snakefmt regression - comment in indented rule block inside condition --- workflow/rules/align.smk | 2 +- workflow/rules/quality_assurance.smk | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk index 83d4c9564..8201cf539 100644 --- a/workflow/rules/align.smk +++ b/workflow/rules/align.smk @@ -483,6 +483,7 @@ if config.general["aligner"] == "bwa": ) return list_output + # HACK way too many indexing files (see ref_bwa_index), not using shadow: minimal rule bwa_align: input: FASTQ=input_align_gz, @@ -501,7 +502,6 @@ if config.general["aligner"] == "bwa": errfile="{dataset}/alignments/bwa_align.err.log", conda: config.bwa_align["conda"] - #shadow: "minimal" # HACK way too many indexing files, using explicit OUT instead benchmark: "{dataset}/alignments/bwa_align.benchmark" group: diff --git a/workflow/rules/quality_assurance.smk b/workflow/rules/quality_assurance.smk index a72461298..73504729c 100644 --- a/workflow/rules/quality_assurance.smk +++ b/workflow/rules/quality_assurance.smk @@ -95,8 +95,6 @@ if config.input["paired"]: "minimal" benchmark: "{dataset}/preprocessed_data/prinseq.benchmark" - #group: - #'preprocessing' resources: disk_mb=20000, mem_mb=config.preprocessing["mem"], @@ -148,8 +146,6 @@ else: "minimal" benchmark: "{dataset}/preprocessed_data/prinseq.benchmark" - #group: - #'preprocessing' resources: disk_mb=10000, mem_mb=config.preprocessing["mem"], From f9d0320c57b51db0559caf5adecf16a68489b208 Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Mon, 21 Feb 2022 20:13:55 +0100 Subject: [PATCH 18/33] More 
code reuse: temp prefix --- workflow/rules/quality_assurance.smk | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/workflow/rules/quality_assurance.smk b/workflow/rules/quality_assurance.smk index 73504729c..e5e0bdd0a 100644 --- a/workflow/rules/quality_assurance.smk +++ b/workflow/rules/quality_assurance.smk @@ -34,11 +34,7 @@ rule extract: # TODO replace with raw_data_file construct_input_fastq, output: - temp( - os.path.join( - config.general["temp_prefix"], "{dataset}/extracted_data/R{pair}.fastq" - ) - ), + temp_with_prefix("{dataset}/extracted_data/R{pair}.fastq"), log: outfile="{dataset}/extracted_data/extract_R{pair}.out.log", errfile="{dataset}/extracted_data/extract_R{pair}.err.log", @@ -73,12 +69,8 @@ if config.input["paired"]: rule preprocessing: input: - R1=os.path.join( - config.general["temp_prefix"], "{dataset}/extracted_data/R1.fastq" - ), - R2=os.path.join( - config.general["temp_prefix"], "{dataset}/extracted_data/R2.fastq" - ), + R1=temp_prefix("{dataset}/extracted_data/R1.fastq"), + R2=temp_prefix("{dataset}/extracted_data/R2.fastq"), output: R1gz="{dataset}/preprocessed_data/R1.fastq.gz", R2gz="{dataset}/preprocessed_data/R2.fastq.gz", @@ -128,9 +120,7 @@ else: rule preprocessing_se: input: - R1=os.path.join( - config.general["temp_prefix"], "{dataset}/extracted_data/R1.fastq" - ), + R1=temp_prefix("{dataset}/extracted_data/R1.fastq"), output: R1gz="{dataset}/preprocessed_data/R1.fastq.gz", params: @@ -176,9 +166,7 @@ else: rule fastqc: input: - os.path.join( - config.general["temp_prefix"], "{dataset}/extracted_data/R{pair}.fastq" - ), + temp_prefix("{dataset}/extracted_data/R{pair}.fastq"), output: "{dataset}/extracted_data/R{pair}_fastqc.html", params: From 05543ed65eb7efd2883aba3fc503b86b08bb35ca Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Mon, 21 Feb 2022 20:25:39 +0100 Subject: [PATCH 19/33] Bug detected by HIV end-to-end testing --- workflow/rules/align.smk | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk index 8201cf539..ea0b67f2b 100644 --- a/workflow/rules/align.smk +++ b/workflow/rules/align.smk @@ -470,18 +470,17 @@ rule ref_bwa_index: """ -if config.general["aligner"] == "bwa": - - def input_align_gz(wildcards): - list_output = [] +def input_align_gz(wildcards): + list_output = [] + list_output.append(os.path.join(wildcards.dataset, "preprocessed_data/R1.fastq.gz")) + if config.input["paired"]: list_output.append( - os.path.join(wildcards.dataset, "preprocessed_data/R1.fastq.gz") + os.path.join(wildcards.dataset, "preprocessed_data/R2.fastq.gz") ) - if config.input["paired"]: - list_output.append( - os.path.join(wildcards.dataset, "preprocessed_data/R2.fastq.gz") - ) - return list_output + return list_output + + +if config.general["aligner"] == "bwa": # HACK way too many indexing files (see ref_bwa_index), not using shadow: minimal rule bwa_align: From bd911d153a3207df5768f6d26f1a2b4de2ca4874 Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Tue, 22 Feb 2022 01:47:26 +0100 Subject: [PATCH 20/33] Reorganize dehuman to reuse main aligner - reuse rejects from bwa/bowtie/ngshmmalign - option to redo a (bwa) align to rejects, WITHOUT forcing to reprocess everything else --- workflow/rules/align.smk | 2 - workflow/rules/common.smk | 2 +- workflow/rules/dehuman.smk | 61 +++++++++++++++++++++-------- workflow/schemas/config_schema.json | 16 +++++++- 4 files changed, 61 insertions(+), 20 deletions(-) diff --git 
a/workflow/rules/align.smk b/workflow/rules/align.smk index ea0b67f2b..9f6c752c6 100644 --- a/workflow/rules/align.smk +++ b/workflow/rules/align.smk @@ -594,7 +594,6 @@ elif config.general["aligner"] == "bowtie": {params.BOWTIE} -x {input.REF} -1 {input.R1} -2 {input.R2} {params.PHRED} {params.PRESET} -X {params.MAXINS} {params.EXTRA} -p {threads} -S {output.TMP_SAM} 2> >(tee {log.errfile} >&2) # Filter alignments: (1) keep only reads mapped in proper pairs, and (2) remove supplementary aligments {params.SAMTOOLS} view -h -f 2 -F 2048 -o "{output.REF}" "{output.TMP_SAM}" 2> >(tee -a {log.errfile} >&2) - rm {params.TMP_SAM} """ @@ -638,7 +637,6 @@ elif config.general["aligner"] == "bowtie": {params.BOWTIE} -x {input.REF} -U {input.R1} {params.PHRED} {params.PRESET} {params.EXTRA} -p {threads} -S {output.TMP_SAM} 2> >(tee {log.errfile} >&2) # Filter alignments: (1) remove unmapped reads, and (2) remove supplementary aligments {params.SAMTOOLS} view -h -F 4 -F 2048 -o "{output.REF}" "{output.TMP_SAM} 2> >(tee -a {log.errfile} >&2) - rm {params.TMP_SAM} """ # NOTE ngshmmalignb also generate consensus so check there too. diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 87e1c2dd5..7c0c8c8e1 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -531,7 +531,7 @@ def shifts(wildcards): def get_maxins(wildcards): - if config.bowtie_align["maxins"]: + if "maxins" in config["bowtie_align"]: return config.bowtie_align["maxins"] else: patient_ID, date = os.path.normpath(wildcards.dataset).split(os.path.sep)[-2:] diff --git a/workflow/rules/dehuman.smk b/workflow/rules/dehuman.smk index 72906802e..e314c0959 100644 --- a/workflow/rules/dehuman.smk +++ b/workflow/rules/dehuman.smk @@ -1,16 +1,50 @@ from functools import partial +rule dh_reuse_alignreject: + # "rely on aligner's output". + # this rule re-use the rejected reads in align.smk (e.g. ngshmmalign's /alignments/rejects.sam) + # (useful when in parallel with the main processing) + input: + reject_aln=rules.hmm_align.output.reject_aln if config["general"]["aligner"] == "ngshmmalign" else temp_prefix("{dataset}/alignments/tmp_aln.sam") + output: + reject_1=temp_with_prefix("{dataset}/alignments/reject_R1.fastq.gz"), + reject_2=temp_with_prefix("{dataset}/alignments/reject_R2.fastq.gz"), + params: + SAMTOOLS=config.applications["samtools"], + conda: + config.dehuman["conda"] + group: + "align" + resources: + disk_mb=1250, + mem_mb=config.bwa_align["mem"], + time_min=config.bwa_align["time"], + threads: config.bwa_align["threads"] + shell: + """ + echo "Keep reject -----------------------------------------------------" + echo + + {params.SAMTOOLS} bam2fq -@ {threads} \ + -F 2 \ + -1 {output.reject_1} \ + -2 {output.reject_2} \ + {input.reject_aln} + echo + """ + + rule dh_redo_alignreject: - # this rule re-does an alignment to get the reject (useful if aligner's (temp) rejects have been deleted by now) - # TODO add support for rejected reads in align.smk (e.g. ngshmmalign already does /alignments/rejects.sam) + # "redo rejects" + # this rule re-does an alignment to get the reject + # (useful if aligner's (temp) rejects have been deleted by now) input: global_ref=reference_file, ref_index="{}.bwt".format(reference_file), fastq=input_align_gz, output: - tmp_aln=temp_with_prefix("{dataset}/alignments/tmp_aln.sam"), - # TODO add option to switch between "redo rejects" and "rely on aligner's output". 
+ tmp_aln=temp_with_prefix("{dataset}/alignments/dh_aln.sam"), reject_1=temp_with_prefix("{dataset}/alignments/reject_R1.fastq.gz"), reject_2=temp_with_prefix("{dataset}/alignments/reject_R2.fastq.gz"), params: @@ -19,13 +53,12 @@ rule dh_redo_alignreject: conda: config.dehuman["conda"] group: - #"align" "dehuman" resources: disk_mb=1250, mem_mb=config.bwa_align["mem"], time_min=config.bwa_align["time"], - threads: config.bwa_align["threads"] # config.dehuman["threads"] + threads: config.bwa_align["threads"] shell: """ echo "Filter out virus' reads -----------------------------------------" @@ -52,12 +85,10 @@ rule dh_hostalign: input: host_ref=config.dehuman["ref_host"], ref_index="{}.bwt".format(config.dehuman["ref_host"]), - # TODO add option to switch between "redo rejects" and "rely on aligner's output". - reject_1=rules.dh_redo_alignreject.output.reject_1, - reject_2=rules.dh_redo_alignreject.output.reject_2, + reject_1=rules.dh_redo_alignreject.output.reject_1 if config["dehuman"]["catchup"] else rules.dh_reuse_alignreject.output.reject_1, + reject_2=rules.dh_redo_alignreject.output.reject_2 if config["dehuman"]["catchup"] else rules.dh_reuse_alignreject.output.reject_2, output: host_aln=temp_with_prefix("{dataset}/alignments/host_aln.sam"), - threads: config.dehuman["threads"] params: BWA=config.applications["bwa"], conda: @@ -68,7 +99,7 @@ rule dh_hostalign: disk_mb=1250, mem_mb=config.bwa_align["mem"], time_min=config.bwa_align["time"], - threads: config.bwa_align["threads"] # config.dehuman["threads"] + threads: config.bwa_align["threads"] shell: # create index if not exists: # test -f {input.ref_index} || {params.BWA} index {input.host_ref} @@ -102,7 +133,6 @@ rule dh_filter: # TODO shoo out the cats keep_host=int(config.dehuman["keep_host"]), host_aln_cram="{dataset}/alignments/host_aln.cram", - stats="{dataset}/alignments/dehuman.stats", # set to 1 to trigger matches with human genome (used for testing): F=2, conda: @@ -111,8 +141,8 @@ rule dh_filter: "dehuman" resources: disk_mb=1250, - mem_mb=config.bwa_align["mem"], - time_min=config.bwa_align["time"], + mem_mb=config.dehuman["mem"], + time_min=config.dehuman["time"], threads: config.dehuman["threads"] shell: """ @@ -180,7 +210,6 @@ rule dh_filter: echo {params.SAMTOOLS} index -@ {threads} {params.host_aln_cram} - {params.SAMTOOLS} stats -@ {threads} {params.host_aln_cram} > {params.stats} fi else echo @@ -223,7 +252,7 @@ rule dehuman: disk_mb=1250, mem_mb=config.bwa_align["mem"], time_min=config.bwa_align["time"], - threads: config.dehuman["threads"] + threads: config.bwa_align["threads"] shell: """ echo "Compress filtered sequences --------------------------------------" diff --git a/workflow/schemas/config_schema.json b/workflow/schemas/config_schema.json index b0b12707e..5065bbff8 100644 --- a/workflow/schemas/config_schema.json +++ b/workflow/schemas/config_schema.json @@ -1254,9 +1254,17 @@ "type": "string", "default": "{VPIPE_BASEDIR}/envs/dehuman.yaml" }, + "mem": { + "type": "integer", + "default": 1250 + }, + "time": { + "type": "integer", + "default": 235 + }, "threads": { "type": "integer", - "default": 16 + "default": 4 }, "ref_host": { "type": "string", @@ -1269,6 +1277,12 @@ "default": false, "description": "Indicate whether to store the host-aligned reads in a CRAM file `…/alignments/host_aln.cram`.", "examples": [ true ] + }, + "catchup": { + "type": "boolean", + "default": false, + "description": "Use this option when generating dehumanized raw reads (`dehuman.cram`) on old samples _**that have already**_ 
been processed in the past - a catch up.\n\nNormally, removing host-mapping reads requires analyzing reads which where rejected by main processing's mapping to the reference (as specified in section `general`, property `aligner`). But this output is considered temporary and will get deleted by Snakemake once the processing of sample has finished. To generate `dehuman.cram` V-pipe would need to run the aligner again, which will both regenerate the data necessary for output this but also generate a new alignment which will trigger again processing every other step that depends on that alignment.\nUse this property `catchup` to only generate the input necessary for `dehuman.cram`, leaving untouched the alignment and everything else that has been already been processed.", + "examples": [ true ] } }, "default": {}, From f13ac46cebc74339b0a18e5df30224e5e1a1d190 Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Tue, 22 Feb 2022 02:42:11 +0100 Subject: [PATCH 21/33] Fix local option and results dir - "local" option in dehuman.smk (was using wrong config property) - "results" and "samples" are separate in common.smk --- workflow/rules/common.smk | 24 +++++++++++++++++++++--- workflow/rules/publish.smk | 2 +- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 7c0c8c8e1..c08b612bd 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -545,9 +545,27 @@ def rebase_datadir(base, dataset): def raw_data_file(wildcards, pair): - for p in os.listdir("{dataset}/raw_data".format(dataset=wildcards.dataset)): - if re.search(r".*R{pair}\.(fastq\.gz|fastq|fq|fq\.gz)$".format(pair=pair), p): - return os.path.join(wildcards.dataset, "raw_data", p) + indir = os.path.join( + rebase_datadir(config.input["datadir"], wildcards.dataset), "raw_data" + ) + list_output = [] + rx_fq = re.compile( + r"[^/]*{}\.(fastq\.gz|fastq|fq|fq\.gz)$".format( + ("R%u%s" % (pair, config.input["fastq_suffix"])) if pair else "" + ) + ) + for p in os.listdir(indir): + if rx_fq.search(p): + list_output.append(os.path.join(indir, p)) + + if len(list_output) == 0: + raise ValueError( + "Missing input files for sample in: {} - Unexpected file name?".format( + indir + ) + ) + + return list_output # TODO replace with raw_data_file diff --git a/workflow/rules/publish.smk b/workflow/rules/publish.smk index fadf18601..f1f62d199 100644 --- a/workflow/rules/publish.smk +++ b/workflow/rules/publish.smk @@ -1,4 +1,4 @@ -if config.upload["conda"]: +if config.upload["local"]: localrules: prepare_upload, From 066dcbdd3c9da8ab694ab4d8713c5ff9baab4437 Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Tue, 22 Feb 2022 02:43:05 +0100 Subject: [PATCH 22/33] Test Lint and upload - keep the linter happy - runs tests on upload.smk, too - lint Docker --- .gitignore | 1 + .mega-linter.yml | 1 + Dockerfile | 19 +++++++++++-------- tests/regression_tests.sh | 1 + workflow/rules/dehuman.smk | 12 +++++++++--- 5 files changed, 23 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index bc5835048..79bdf212d 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ __pycache__/ .Python env/ build/ +vendor/ develop-eggs/ dist/ downloads/ diff --git a/.mega-linter.yml b/.mega-linter.yml index 9350792fc..edf1fbaf9 100644 --- a/.mega-linter.yml +++ b/.mega-linter.yml @@ -10,6 +10,7 @@ ENABLE_LINTERS: - MARKDOWN_MARKDOWNLINT - JUPYTER_JUPYFMT - PERL_PERLCRITIC + - DOCKERFILE_HADOLINT VALIDATE_ALL_CODEBASE: true FORMATTERS_DISABLE_ERRORS: false diff 
--git a/Dockerfile b/Dockerfile index 7b382cf29..913f6fc54 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,6 +24,7 @@ ARG vpipe_path ARG envs_path ARG test_data +# hadolint ignore=DL3008 RUN apt-get update && apt-get install -y --no-install-recommends \ jdupes @@ -42,11 +43,11 @@ COPY tests/data ${test_data} WORKDIR /work # configuration: activate all steps -RUN mkdir config -RUN echo 'output:\n snv: true\n local: true\n global: true\n visualization: true\n diversity: true\n QA: true' > config/config.yaml +RUN mkdir config \ + && printf 'output:\n snv: true\n local: true\n global: true\n visualization: true\n diversity: true\n QA: true\n upload: true' > config/config.yaml # TODO harmonize list with CI tests and Docker tests -RUN for virus in ${virus_download_list:-$(ls ${test_data}/)}; do echo "\n\n\e[36;1mvirus: ${virus}\e[0m\n" \ +RUN for virus in ${virus_download_list:-$(ls ${test_data}/)}; do printf '\n\n\e[36;1mvirus: %s\e[0m\n' "${virus}" \ && ln -sf "${test_data}/${virus}/" ./samples \ && if test -e samples/samples.tsv; then cp -f samples/samples.tsv ./config/samples.tsv; fi \ && PYTHONUNBUFFERED=1 snakemake -s ${vpipe_path}/workflow/Snakefile -j 1 --conda-create-envs-only --use-conda --conda-prefix ${envs_path} --config "general={virus_base_config: ${virus}}" \ @@ -62,6 +63,7 @@ FROM snakemake/snakemake:${snaketag} AS vpipe-tests-base ARG install_path # NOTE rsync only used with local scratch +# hadolint ignore=DL3008 RUN apt-get update && apt-get install -y --no-install-recommends \ rsync \ && apt-get clean \ @@ -83,8 +85,8 @@ ARG test_data ENV virus=hiv WORKDIR /work -RUN mkdir config -RUN echo 'output:\n snv: true\n local: true\n global: false\n visualization: true\n diversity: true\n QA: true' > config/config.yaml +RUN mkdir config \ + && printf 'output:\n snv: true\n local: true\n global: false\n visualization: true\n diversity: true\n QA: true\n upload: true' > config/config.yaml COPY --from=create-envs ${test_data}/${virus} ./samples RUN if test -e samples/samples.tsv; then cp -f samples/samples.tsv ./config/samples.tsv; fi # NOTE see top comment if `--network=none` breaks build process @@ -105,8 +107,8 @@ ARG test_data ENV virus=sars-cov-2 WORKDIR /work -RUN mkdir config -RUN echo 'output:\n snv: true\n local: true\n global: false\n visualization: true\n diversity: true\n QA: true' > config/config.yaml +RUN mkdir config \ + && printf 'output:\n snv: true\n local: true\n global: false\n visualization: true\n diversity: true\n QA: true\n upload: true' > config/config.yaml COPY --from=create-envs ${test_data}/${virus} ./samples RUN if test -e samples/samples.tsv; then cp -f samples/samples.tsv ./config/samples.tsv; fi # NOTE see top comment if `--network=none` breaks build process @@ -145,6 +147,7 @@ ARG install_path ARG vpipe_path ARG envs_path +# hadolint ignore=DL3008 RUN apt-get update && apt-get install -y --no-install-recommends \ rsync \ && apt-get clean \ @@ -153,7 +156,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ COPY --from=vpipe-final-base ${install_path} ${install_path} # ============================================= -MAINTAINER V-pipe Dev Team +LABEL maintainer="V-pipe Dev Team " VOLUME /work WORKDIR /work diff --git a/tests/regression_tests.sh b/tests/regression_tests.sh index 160427b02..9d9588ef4 100755 --- a/tests/regression_tests.sh +++ b/tests/regression_tests.sh @@ -56,6 +56,7 @@ output: visualization: true diversity: true QA: true + upload: true snv: threads: ${THREADS} diff --git a/workflow/rules/dehuman.smk 
b/workflow/rules/dehuman.smk index e314c0959..209a606d2 100644 --- a/workflow/rules/dehuman.smk +++ b/workflow/rules/dehuman.smk @@ -6,7 +6,9 @@ rule dh_reuse_alignreject: # this rule re-use the rejected reads in align.smk (e.g. ngshmmalign's /alignments/rejects.sam) # (useful when in parallel with the main processing) input: - reject_aln=rules.hmm_align.output.reject_aln if config["general"]["aligner"] == "ngshmmalign" else temp_prefix("{dataset}/alignments/tmp_aln.sam") + reject_aln=rules.hmm_align.output.reject_aln + if config["general"]["aligner"] == "ngshmmalign" + else temp_prefix("{dataset}/alignments/tmp_aln.sam"), output: reject_1=temp_with_prefix("{dataset}/alignments/reject_R1.fastq.gz"), reject_2=temp_with_prefix("{dataset}/alignments/reject_R2.fastq.gz"), @@ -85,8 +87,12 @@ rule dh_hostalign: input: host_ref=config.dehuman["ref_host"], ref_index="{}.bwt".format(config.dehuman["ref_host"]), - reject_1=rules.dh_redo_alignreject.output.reject_1 if config["dehuman"]["catchup"] else rules.dh_reuse_alignreject.output.reject_1, - reject_2=rules.dh_redo_alignreject.output.reject_2 if config["dehuman"]["catchup"] else rules.dh_reuse_alignreject.output.reject_2, + reject_1=rules.dh_redo_alignreject.output.reject_1 + if config["dehuman"]["catchup"] + else rules.dh_reuse_alignreject.output.reject_1, + reject_2=rules.dh_redo_alignreject.output.reject_2 + if config["dehuman"]["catchup"] + else rules.dh_reuse_alignreject.output.reject_2, output: host_aln=temp_with_prefix("{dataset}/alignments/host_aln.sam"), params: From 190cd7d86af74f32febc4bb6b10f3ce83031ae32 Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Tue, 22 Feb 2022 22:43:15 +0100 Subject: [PATCH 23/33] Fix consensus and "comment" field in cram - ref_{xxx} fasta files now follows "consensus" property, like consensus.bcftools.fasta - RegEx expanded to cover more corner cases to convert Illumina index "1:N:0:..." into SAM comment "BC:Z:..." 
- support dual index: ATCG+CGAT - support single index: GATTA - support numerical indicator: 180 --- workflow/rules/dehuman.smk | 2 +- workflow/rules/publish.smk | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/workflow/rules/dehuman.smk b/workflow/rules/dehuman.smk index 209a606d2..1ec9951e3 100644 --- a/workflow/rules/dehuman.smk +++ b/workflow/rules/dehuman.smk @@ -274,7 +274,7 @@ rule dehuman: # - 'bwa mem' which keep comments in the SAM file verbatim as in the FASTQ file # - 'samtools' which expects comment to be properly marked as 'BC:Z:' # as per SAM format specs - REGEXP=\'s{{(?<=\\t)([[:digit:]]:[[:upper:]]:[[:digit:]]:[ATCGN]+(\+[ATCGN]+))$}}{{BC:Z:\\1}}\' + REGEXP=\'s{{(?<=\\t)([[:digit:]]:[[:upper:]]:[[:digit:]]:([ATCGN]+(\+[ATCGN]+)?|[[:digit:]]+))$}}{{BC:Z:\\1}}\' FMT=cram,embed_ref,use_bzip2,use_lzma,level=9,seqs_per_slice=1000000 perl -p -e ${{REGEXP}} {output.cram_sam} \ diff --git a/workflow/rules/publish.smk b/workflow/rules/publish.smk index f1f62d199..ceb2883b8 100644 --- a/workflow/rules/publish.smk +++ b/workflow/rules/publish.smk @@ -26,7 +26,8 @@ rule prepare_upload: % bcft_suffix(), consensus_indels_chain="{dataset}/references/consensus%s.bcftools.chain" % bcft_suffix(), - consensus_aligned="{dataset}/references/ref_majority_dels.fasta", # % config.upload["consensus"], + consensus_aligned="{dataset}/references/ref_%s_dels.fasta" + % config.upload["consensus"], csum=[ "{dataset}/references/consensus%(suf)s.bcftools.fasta.%(csum)s" % {"suf": bcft_suffix(), "csum": config.general["checksum"]}, From 01ec3b4d1b8eddb8e131794e7746d8b187f23a27 Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Fri, 25 Feb 2022 03:26:37 +0100 Subject: [PATCH 24/33] Upload and dehuman tweaks. - compressing unfiltered raw read into cram - use dehuman ressource for larger alignements (host, etc.) 
- options to exec in demo upload script - typos - unfiltered cram in tests --- Dockerfile | 6 +- tests/regression_tests.sh | 3 + workflow/rules/dehuman.smk | 22 +++--- workflow/rules/publish.smk | 77 ++++++++++++++++++++- workflow/schemas/config_schema.json | 14 +++- workflow/scripts/prepare_upload_symlinks.sh | 16 ++++- 6 files changed, 120 insertions(+), 18 deletions(-) diff --git a/Dockerfile b/Dockerfile index 913f6fc54..c6cdaac56 100644 --- a/Dockerfile +++ b/Dockerfile @@ -44,7 +44,7 @@ WORKDIR /work # configuration: activate all steps RUN mkdir config \ - && printf 'output:\n snv: true\n local: true\n global: true\n visualization: true\n diversity: true\n QA: true\n upload: true' > config/config.yaml + && printf 'output:\n snv: true\n local: true\n global: true\n visualization: true\n diversity: true\n QA: true\n upload: true\nupload:\n orig_cram: true' > config/config.yaml # TODO harmonize list with CI tests and Docker tests RUN for virus in ${virus_download_list:-$(ls ${test_data}/)}; do printf '\n\n\e[36;1mvirus: %s\e[0m\n' "${virus}" \ @@ -86,7 +86,7 @@ ENV virus=hiv WORKDIR /work RUN mkdir config \ - && printf 'output:\n snv: true\n local: true\n global: false\n visualization: true\n diversity: true\n QA: true\n upload: true' > config/config.yaml + && printf 'output:\n snv: true\n local: true\n global: false\n visualization: true\n diversity: true\n QA: true\n upload: true\nupload:\n orig_cram: true' > config/config.yaml COPY --from=create-envs ${test_data}/${virus} ./samples RUN if test -e samples/samples.tsv; then cp -f samples/samples.tsv ./config/samples.tsv; fi # NOTE see top comment if `--network=none` breaks build process @@ -108,7 +108,7 @@ ENV virus=sars-cov-2 WORKDIR /work RUN mkdir config \ - && printf 'output:\n snv: true\n local: true\n global: false\n visualization: true\n diversity: true\n QA: true\n upload: true' > config/config.yaml + && printf 'output:\n snv: true\n local: true\n global: false\n visualization: true\n diversity: true\n QA: true\n upload: true\nupload:\n orig_cram: true' > config/config.yaml COPY --from=create-envs ${test_data}/${virus} ./samples RUN if test -e samples/samples.tsv; then cp -f samples/samples.tsv ./config/samples.tsv; fi # NOTE see top comment if `--network=none` breaks build process diff --git a/tests/regression_tests.sh b/tests/regression_tests.sh index 9d9588ef4..67fa88da3 100755 --- a/tests/regression_tests.sh +++ b/tests/regression_tests.sh @@ -58,6 +58,9 @@ output: QA: true upload: true +upload: + orig_cram: true + snv: threads: ${THREADS} CONFIG diff --git a/workflow/rules/dehuman.smk b/workflow/rules/dehuman.smk index 1ec9951e3..090ab4715 100644 --- a/workflow/rules/dehuman.smk +++ b/workflow/rules/dehuman.smk @@ -103,9 +103,9 @@ rule dh_hostalign: "dehuman" resources: disk_mb=1250, - mem_mb=config.bwa_align["mem"], - time_min=config.bwa_align["time"], - threads: config.bwa_align["threads"] + mem_mb=config.dehuman["mem"], + time_min=config.dehuman["time"], + threads: config.dehuman["threads"] shell: # create index if not exists: # test -f {input.ref_index} || {params.BWA} index {input.host_ref} @@ -123,14 +123,15 @@ rule dh_hostalign: rule dh_filter: input: host_aln=temp_prefix("{dataset}/alignments/host_aln.sam"), + # TODO switch to output of rule rule extract R1=partial(raw_data_file, pair=1), R2=partial(raw_data_file, pair=2), output: filter_count="{dataset}/alignments/dehuman.count", filter_list=temp_with_prefix("{dataset}/alignments/dehuman.filter"), # TODO shift to pipe - 
filtered_1=temp_with_prefix("{dataset}/raw_uploads/filtered_1.fasta.gz"), - filtered_2=temp_with_prefix("{dataset}/raw_uploads/filtered_2.fasta.gz"), + filtered_1=temp_with_prefix("{dataset}/raw_uploads/filtered_1.fastq.gz"), + filtered_2=temp_with_prefix("{dataset}/raw_uploads/filtered_2.fastq.gz"), params: SAMTOOLS=config.applications["samtools"], remove_reads_script=cachepath( @@ -240,8 +241,9 @@ rule dh_filter: rule dehuman: input: global_ref=reference_file, - filtered_1=rules.dh_filter.output.filtered_1, # =temp_prefix("{dataset}/raw_uploads/filtered_1.fasta.gz"), - filtered_2=rules.dh_filter.output.filtered_2, # =temp_prefix("{dataset}/raw_uploads/filtered_2.fasta.gz"), + ref_index="{}.bwt".format(reference_file), + filtered_1=rules.dh_filter.output.filtered_1, # =temp_prefix("{dataset}/raw_uploads/filtered_1.fastq.gz"), + filtered_2=rules.dh_filter.output.filtered_2, # =temp_prefix("{dataset}/raw_uploads/filtered_2.fastq.gz"), output: cram_sam=temp_with_prefix("{dataset}/raw_uploads/dehuman.sam"), final_cram="{dataset}/raw_uploads/dehuman.cram", @@ -256,9 +258,9 @@ rule dehuman: "dehuman" resources: disk_mb=1250, - mem_mb=config.bwa_align["mem"], - time_min=config.bwa_align["time"], - threads: config.bwa_align["threads"] + mem_mb=config.dehuman["mem"], + time_min=config.dehuman["time"], + threads: config.dehuman["threads"] shell: """ echo "Compress filtered sequences --------------------------------------" diff --git a/workflow/rules/publish.smk b/workflow/rules/publish.smk index ceb2883b8..64574d5a9 100644 --- a/workflow/rules/publish.smk +++ b/workflow/rules/publish.smk @@ -14,8 +14,14 @@ def bcft_suffix(): rule prepare_upload: input: - R1=partial(raw_data_file, pair=1), - R2=partial(raw_data_file, pair=2), + R1=partial(raw_data_file, pair=1) if config.upload["orig_fastq"] else [], + R2=partial(raw_data_file, pair=2) if config.upload["orig_fastq"] else [], + orig_cram=[ + "{dataset}/raw_uploads/raw_reads.cram", + "{dataset}/raw_uploads/raw_reads.cram.%s" % config.general["checksum"], + ] + if config.upload["orig_cram"] + else [], dehuman=[ "{dataset}/raw_uploads/dehuman.cram", "{dataset}/raw_uploads/dehuman.cram.%s" % config.general["checksum"], @@ -56,6 +62,72 @@ rule prepare_upload: """ +rule unfiltered_cram: + input: + global_ref=reference_file, + ref_index="{}.bwt".format(reference_file), + # TODO switch to output of rule rule extract + R1=partial(raw_data_file, pair=1), + R2=partial(raw_data_file, pair=2), + output: + cram_sam=temp_with_prefix("{dataset}/raw_uploads/raw_reads.sam"), + final_cram="{dataset}/raw_uploads/raw_reads.cram", + checksum="{dataset}/raw_uploads/raw_reads.cram.%s" % config.general["checksum"], + params: + BWA=config.applications["bwa"], + SAMTOOLS=config.applications["samtools"], + checksum_type=config.general["checksum"], + conda: + config.dehuman["conda"] + resources: + disk_mb=1250, + mem_mb=config.bwa_align["mem"], + time_min=config.bwa_align["time"], + threads: config.bwa_align["threads"] + shell: + """ + # using zcat FILENAME.gz causes issues on Mac, see + # https://serverfault.com/questions/570024/ + # redirection fixes this: + unpack_rawreads() {{ + for fq in "${{@}}"; do + zcat -f < "${{fq}}" + done + }} + + echo "Compress un-filtered sequences -----------------------------------" + echo + + {params.BWA} mem -t {threads} \ + -C \ + -o {output.cram_sam} \ + {input.global_ref} \ + <(unpack_rawreads {input.R1}) \ + <(unpack_rawreads {input.R2}) + + # HACK handle incompatibilities between: + # - Illumina's 'bcl2fastq', which write arbitrary strings 
+ # - 'bwa mem' which keep comments in the SAM file verbatim as in the FASTQ file + # - 'samtools' which expects comment to be properly marked as 'BC:Z:' + # as per SAM format specs + REGEXP=\'s{{(?<=\\t)([[:digit:]]:[[:upper:]]:[[:digit:]]:([ATCGN]+(\+[ATCGN]+)?|[[:digit:]]+))$}}{{BC:Z:\\1}}\' + FMT=cram,embed_ref,use_bzip2,use_lzma,level=9,seqs_per_slice=1000000 + + perl -p -e ${{REGEXP}} {output.cram_sam} \ + | {params.SAMTOOLS} sort -@ {threads} \ + -M \ + --reference {input.global_ref} \ + --output-fmt ${{FMT}} \ + -o {output.final_cram} + + {params.checksum_type}sum {output.final_cram} > {output.checksum} + + echo + echo DONE ------------------------------------------------------------- + echo + """ + + rule checksum: input: "{file}", @@ -76,4 +148,5 @@ rule checksum: """ +ruleorder: unfiltered_cram > checksum ruleorder: dehuman > checksum diff --git a/workflow/schemas/config_schema.json b/workflow/schemas/config_schema.json index 5065bbff8..be64ecb34 100644 --- a/workflow/schemas/config_schema.json +++ b/workflow/schemas/config_schema.json @@ -1256,7 +1256,7 @@ }, "mem": { "type": "integer", - "default": 1250 + "default": 4096 }, "time": { "type": "integer", @@ -1346,6 +1346,18 @@ "description": "Generate checksum for each individual consensus sequence (if a consensus is regenerated, it will help determine whether to whether the new file has changed content or is virtually the same as the previous)", "examples": [ true ] }, + "orig_fastq": { + "type": "boolean", + "default": false, + "description": "Also include the original `.fastq.gz` sequencing reads files from `raw_data/` in the list of files to be uploaded. See propery `orig_cramfs` below for a compressed verion and see output `dehumanized_raw_reads` and section `dehuman` for depleting reads from the host.", + "examples": [ true ] + }, + "orig_cram": { + "type": "boolean", + "default": false, + "description": "Also include the original sequencing raw reads files from `raw_data/` compressed. Similar to property `orig_fastq` above, but with reference-based compression.", + "examples": [ true ] + }, "script": { "type": "string", "default": "{VPIPE_BASEDIR}/scripts/prepare_upload_symlinks.sh", diff --git a/workflow/scripts/prepare_upload_symlinks.sh b/workflow/scripts/prepare_upload_symlinks.sh index af34f721e..ea4cdadaa 100755 --- a/workflow/scripts/prepare_upload_symlinks.sh +++ b/workflow/scripts/prepare_upload_symlinks.sh @@ -16,11 +16,13 @@ # - it can optionally be used by the script to store arbitrary data (e.g. json) # -usage() { echo "Usage: $0 [ -h ] [ -n ] [ -- ] [ ... ] +usage() { echo "Usage: $0 [ -h ] [ -n ] [ -e ] [ -- ] [ ... ] options: -n : no random nonce at the end of the global symlinks, they will be not unique + -e : run at the end of this script with the same positionals; + it becomes that script's job to create the output file. -- : end of options, start of positional parameters positional parameters: @@ -35,9 +37,11 @@ for upload. Serves also as a demo for the upload parameters of V-pipe." 1>&2; ex # NOTE it is possible to have named options (e.g. 
with getops) before the named options begin do_random_nonce=1 -while getopts "nh" o; do +exec_cmd= +while getopts "ne:h" o; do case "${o}" in n) do_random_nonce=0 ;; + e) exec_cmd="${OPTARG}" ;; h) usage 0 ;; *) usage 1 ;; esac @@ -87,5 +91,13 @@ fi ( set -x; ln "-s${force}" "$fixed_uploads" "uploads/$unique_id" ) +# run command if asked to + +if [[ -n "${exec_cmd}" ]]; then + exec ${exec_cmd} "${output}" "${sample_id}" "${sample_dir}" "${to_upload[@]}" + echo "Failed to exec ${exec_cmd}" > /dev/stderr + exit 2 +fi + # create the mandatory output file touch "${output}" From 9492f02da352e68ff113a34d5989b66ed7ccc674 Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Fri, 25 Feb 2022 11:30:34 +0100 Subject: [PATCH 25/33] tweaks in generation of upload cram - use temp_prefix-ed temporary files for sorting - keep only aligned raw reads in host_aln.cram --- workflow/rules/align.smk | 3 ++- workflow/rules/dehuman.smk | 14 ++++++++++++-- workflow/rules/publish.smk | 2 ++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk index 9f6c752c6..ddf629e69 100644 --- a/workflow/rules/align.smk +++ b/workflow/rules/align.smk @@ -421,6 +421,7 @@ rule sam2bam: params: SAMTOOLS=config.applications["samtools"], FUNCTIONS=functions, + sort_tmp=temp_prefix("{file}.tmp"), log: outfile="{file}_sam2bam.out.log", errfile="{file}_sam2bam.err.log", @@ -438,7 +439,7 @@ rule sam2bam: shell: """ echo "Writing BAM file" - {params.SAMTOOLS} sort -o "{output.BAM}" "{input}" + {params.SAMTOOLS} sort -T "{params.sort_tmp}" -o "{output.BAM}" "{input}" {params.SAMTOOLS} index "{output.BAM}" """ diff --git a/workflow/rules/dehuman.smk b/workflow/rules/dehuman.smk index 090ab4715..bf5ee12ba 100644 --- a/workflow/rules/dehuman.smk +++ b/workflow/rules/dehuman.smk @@ -139,6 +139,8 @@ rule dh_filter: ), # TODO shoo out the cats keep_host=int(config.dehuman["keep_host"]), + host_only_sam=temp_with_prefix("{dataset}/alignments/host_only.sam"), + sort_tmp=temp_prefix("{dataset}/alignments/host_sort.tmp"), host_aln_cram="{dataset}/alignments/host_aln.cram", # set to 1 to trigger matches with human genome (used for testing): F=2, @@ -203,14 +205,20 @@ rule dh_filter: echo "Keeping Human-aligned virus' rejects -----------------------------" echo + {params.SAMTOOLS} view -@ {threads} \ + -h -f 2 \ + -o {params.host_only_sam} \ + {input.host_aln} + # (we compress reference-less, because the reference size is larger # than the contaminant reads) FMT=cram,no_ref,use_bzip2,use_lzma,level=9,seqs_per_slice=1000000 {params.SAMTOOLS} sort -@ {threads} \ - -M \ + -T {params.sort_tmp} \ --output-fmt ${{FMT}} \ -o {params.host_aln_cram} \ - {input.host_aln} + {params.host_only_sam} && + rm {params.host_only_sam} echo echo "Compressing human-depleted raw reads -----------------------------" @@ -252,6 +260,7 @@ rule dehuman: BWA=config.applications["bwa"], SAMTOOLS=config.applications["samtools"], checksum_type=config.general["checksum"], + sort_tmp=temp_prefix("{dataset}/raw_uploads/dehuman.tmp"), conda: config.dehuman["conda"] group: @@ -281,6 +290,7 @@ rule dehuman: perl -p -e ${{REGEXP}} {output.cram_sam} \ | {params.SAMTOOLS} sort -@ {threads} \ + -T {params.sort_tmp} \ -M \ --reference {input.global_ref} \ --output-fmt ${{FMT}} \ diff --git a/workflow/rules/publish.smk b/workflow/rules/publish.smk index 64574d5a9..385674b60 100644 --- a/workflow/rules/publish.smk +++ b/workflow/rules/publish.smk @@ -77,6 +77,7 @@ rule unfiltered_cram: BWA=config.applications["bwa"], 
SAMTOOLS=config.applications["samtools"], checksum_type=config.general["checksum"], + sort_tmp=temp_prefix("{dataset}/raw_uploads/raw_reads.tmp"), conda: config.dehuman["conda"] resources: @@ -115,6 +116,7 @@ rule unfiltered_cram: perl -p -e ${{REGEXP}} {output.cram_sam} \ | {params.SAMTOOLS} sort -@ {threads} \ + -T {params.sort_tmp} \ -M \ --reference {input.global_ref} \ --output-fmt ${{FMT}} \ From 9ce1b6f9fb4b41ed4f3b0fca8e0853ed97c388e3 Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Fri, 25 Feb 2022 18:54:28 +0100 Subject: [PATCH 26/33] Plumbing --- workflow/rules/dehuman.smk | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/workflow/rules/dehuman.smk b/workflow/rules/dehuman.smk index bf5ee12ba..27a9d2514 100644 --- a/workflow/rules/dehuman.smk +++ b/workflow/rules/dehuman.smk @@ -139,7 +139,6 @@ rule dh_filter: ), # TODO shoo out the cats keep_host=int(config.dehuman["keep_host"]), - host_only_sam=temp_with_prefix("{dataset}/alignments/host_only.sam"), sort_tmp=temp_prefix("{dataset}/alignments/host_sort.tmp"), host_aln_cram="{dataset}/alignments/host_aln.cram", # set to 1 to trigger matches with human genome (used for testing): @@ -168,8 +167,7 @@ rule dh_filter: echo "Count aligned reads ---------------------------------------------" echo - count=$({params.SAMTOOLS} view -@ {threads} -c -f {params.F} {input.host_aln}) - echo "${{count}}" > {output.filter_count} + count=$({params.SAMTOOLS} view -@ {threads} -c -f {params.F} {input.host_aln} | tee {output.filter_count}) if (( count > 0 )); then echo @@ -205,20 +203,17 @@ rule dh_filter: echo "Keeping Human-aligned virus' rejects -----------------------------" echo - {params.SAMTOOLS} view -@ {threads} \ - -h -f 2 \ - -o {params.host_only_sam} \ - {input.host_aln} # (we compress reference-less, because the reference size is larger # than the contaminant reads) FMT=cram,no_ref,use_bzip2,use_lzma,level=9,seqs_per_slice=1000000 - {params.SAMTOOLS} sort -@ {threads} \ + {params.SAMTOOLS} view -@ {threads} \ + -h -f 2 \ + {input.host_aln} \ + | {params.SAMTOOLS} sort -@ {threads} \ -T {params.sort_tmp} \ --output-fmt ${{FMT}} \ - -o {params.host_aln_cram} \ - {params.host_only_sam} && - rm {params.host_only_sam} + -o {params.host_aln_cram} echo echo "Compressing human-depleted raw reads -----------------------------" From 4f932c7e36e5468ea51d04059e7539e6c96715e5 Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Sat, 26 Feb 2022 10:28:28 +0100 Subject: [PATCH 27/33] Random nonce disabled by default in upload demo --- workflow/scripts/prepare_upload_symlinks.sh | 25 +++++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/workflow/scripts/prepare_upload_symlinks.sh b/workflow/scripts/prepare_upload_symlinks.sh index ea4cdadaa..cf2d2eef6 100755 --- a/workflow/scripts/prepare_upload_symlinks.sh +++ b/workflow/scripts/prepare_upload_symlinks.sh @@ -16,13 +16,25 @@ # - it can optionally be used by the script to store arbitrary data (e.g. json) # +do_random_nonce=0 +exec_cmd= + usage() { echo "Usage: $0 [ -h ] [ -n ] [ -e ] [ -- ] [ ... ] options: - -n : no random nonce at the end of the global symlinks, - they will be not unique + -r : append a random none at the end of the global symlinks, + multiples update of the same sample and date will generate + multiple unique symlinks, one for each update. 
+ -R : no random nonce at the end of the global symlinks, + they will be not unique, a sample and date will always have + a single symlink, no matter how many time it was updated. + [ Default: random nonce are $( if (( do_random_nonce )); then echo "enabled"; else echo "disabled"; fi ) ] + -e : run at the end of this script with the same positionals; - it becomes that script's job to create the output file. + [ ... ] + it becomes that script's job to create the output file, + if successful. + -- : end of options, start of positional parameters positional parameters: @@ -36,11 +48,10 @@ Generates symlinks that help tracking new and updated samples to consider for upload. Serves also as a demo for the upload parameters of V-pipe." 1>&2; exit "$1"; } # NOTE it is possible to have named options (e.g. with getops) before the named options begin -do_random_nonce=1 -exec_cmd= -while getopts "ne:h" o; do +while getopts "rRe:h" o; do case "${o}" in - n) do_random_nonce=0 ;; + r) do_random_nonce=1 ;; + R) do_random_nonce=0 ;; e) exec_cmd="${OPTARG}" ;; h) usage 0 ;; *) usage 1 ;; From 3c7f8ceee0b03e17265281865d4c01de78ae0495 Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Sun, 27 Feb 2022 16:52:53 +0100 Subject: [PATCH 28/33] Add the frameshift_deletions_check to the uploads - it is mandatory for some targets like GISAID --- workflow/rules/publish.smk | 3 +++ 1 file changed, 3 insertions(+) diff --git a/workflow/rules/publish.smk b/workflow/rules/publish.smk index 385674b60..ef2e4f660 100644 --- a/workflow/rules/publish.smk +++ b/workflow/rules/publish.smk @@ -42,6 +42,9 @@ rule prepare_upload: ] if config.upload["checksum"] else [], + frameshift_deletions_check="{dataset}/references/frameshift_deletions_check.tsv" + if config.output["QA"] + else [], output: upload_prepared_touch="{dataset}/upload_prepared.touch", params: From 67e3e7d214840748d67175ab67f061edce1fe980 Mon Sep 17 00:00:00 2001 From: Kim Date: Thu, 10 Mar 2022 17:29:53 +0100 Subject: [PATCH 29/33] Lots of proof-reading --- workflow/schemas/config_schema.json | 28 ++++++++++----------- workflow/scripts/prepare_upload_symlinks.sh | 10 ++++---- workflow/scripts/report_sequences.py | 2 +- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/workflow/schemas/config_schema.json b/workflow/schemas/config_schema.json index be64ecb34..f9a51f1e1 100644 --- a/workflow/schemas/config_schema.json +++ b/workflow/schemas/config_schema.json @@ -185,7 +185,7 @@ "dehumanized_raw_reads": { "type": "boolean", "default": false, - "description": "This option turns on dehumanization of the raw reads (i.e. removal of host's reads). This is useful to prepare raw reads for upload on public databases such as, e.g. ENA (European Nucleotide Archive)", + "description": "This option turns on dehumanization of the raw reads (i.e. removal of host's reads) and generates the file `dehuman.cram`. This is useful to prepare raw reads for upload on public databases such as, e.g. ENA (European Nucleotide Archive).\n\nThis only applies to the upload and does not affect the main workflow.", "examples": [true] }, "upload": { @@ -1268,21 +1268,21 @@ }, "ref_host": { "type": "string", - "default": "references/human.fa", + "default": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.29_GRCh38.p14/GCA_000001405.29_GRCh38.p14_genomic.fna.gz", "description": "Host's genome used to remove reads (e.g. 
human genome)", - "examples": ["/cluster/project/igenomes/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/genome.fa"] + "examples": ["references/human.fa", "/cluster/project/igenomes/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/genome.fa"] }, "keep_host": { "type": "boolean", "default": false, "description": "Indicate whether to store the host-aligned reads in a CRAM file `…/alignments/host_aln.cram`.", - "examples": [ true ] + "examples": [true] }, "catchup": { "type": "boolean", "default": false, - "description": "Use this option when generating dehumanized raw reads (`dehuman.cram`) on old samples _**that have already**_ been processed in the past - a catch up.\n\nNormally, removing host-mapping reads requires analyzing reads which where rejected by main processing's mapping to the reference (as specified in section `general`, property `aligner`). But this output is considered temporary and will get deleted by Snakemake once the processing of sample has finished. To generate `dehuman.cram` V-pipe would need to run the aligner again, which will both regenerate the data necessary for output this but also generate a new alignment which will trigger again processing every other step that depends on that alignment.\nUse this property `catchup` to only generate the input necessary for `dehuman.cram`, leaving untouched the alignment and everything else that has been already been processed.", - "examples": [ true ] + "description": "Use this option when generating dehumanized raw reads (`dehuman.cram`) on old samples _**that have already**_ been processed in the past - a _catch up_.\n\nNormally, removing host-mapping reads requires analyzing reads which were rejected by V-pipe's main processing (as specified in section `general`, property `aligner`). But this output is considered temporary and will get deleted by Snakemake once the processing of a sample has finished. To generate `dehuman.cram` V-pipe would need to run the aligner again, which will both regenerate the data necessary for this output but also generate a new alignment which will trigger the whole workflow again.\nUse this property `catchup` to only generate the input necessary for `dehuman.cram`, leaving untouched the alignment and everything else that has already been processed.", + "examples": [true] } }, "default": {}, @@ -1331,7 +1331,7 @@ "type": "boolean", "default": true, "description": "Don't dispatch the rule to the cluster for execution, run locally.", - "examples": [ false ] + "examples": [false] }, "consensus": { "type": "string", @@ -1343,20 +1343,20 @@ "checksum": { "type": "boolean", "default": false, - "description": "Generate checksum for each individual consensus sequence (if a consensus is regenerated, it will help determine whether to whether the new file has changed content or is virtually the same as the previous)", - "examples": [ true ] + "description": "Generate checksum for each individual consensus sequence (if a consensus is regenerated, it will help determine whether the new file has changed content or is virtually the same as the previous)", + "examples": [true] }, "orig_fastq": { "type": "boolean", "default": false, - "description": "Also include the original `.fastq.gz` sequencing reads files from `raw_data/` in the list of files to be uploaded. 
See propery `orig_cramfs` below for a compressed verion and see output `dehumanized_raw_reads` and section `dehuman` for depleting reads from the host.", - "examples": [ true ] + "description": "Also include the original `.fastq.gz` sequencing reads files from `raw_data/` in the list of files to be uploaded. See property `orig_cram` below for a compressed version and see output `dehumanized_raw_reads` and section `dehuman` for depleting reads from the host.", + "examples": [true] }, "orig_cram": { "type": "boolean", "default": false, - "description": "Also include the original sequencing raw reads files from `raw_data/` compressed. Similar to property `orig_fastq` above, but with reference-based compression.", - "examples": [ true ] + "description": "Also include a compressed version of the original sequencing raw reads files from `raw_data/`. Similar to property `orig_fastq` above, but with reference-based compression.", + "examples": [true] }, "script": { "type": "string", @@ -1367,7 +1367,7 @@ "options": { "type": "string", "default": "", - "description": "named options to be passed to the script, before the positional parameters. E.g. for an extra configuration file with SFTP server informations.", + "description": "Named options to be passed to the script, before the positional parameters. E.g. for an extra configuration file with SFTP server information.", "example": ["--config custom_scripts/uploader.yaml --"] } }, diff --git a/workflow/scripts/prepare_upload_symlinks.sh b/workflow/scripts/prepare_upload_symlinks.sh index cf2d2eef6..46ac03e1d 100755 --- a/workflow/scripts/prepare_upload_symlinks.sh +++ b/workflow/scripts/prepare_upload_symlinks.sh @@ -4,8 +4,8 @@ # Example script to prepare and assist uploads. # - it will get called whenever the consensus FASTA files or the CRAM-compressed raw-reads are updated by snakemake # - it will create a per-sample directory '.../uploads/' with symlinks to all uploadable files -# - it will crate a global directory 'uploads' with symlinks to samples that were updated -# - these file aren't tracked by snakemake's DAG +# - it will create a global directory 'uploads' with symlinks to samples that were updated +# - these files aren't tracked by snakemake's DAG # - thus they can be deleted without triggering a re-build by snakemake # - but they will be re-created whenever an input file dependency changes # - it is possible to iteratively scan this global directory between runs of V-pipe to determine @@ -22,11 +22,11 @@ exec_cmd= usage() { echo "Usage: $0 [ -h ] [ -n ] [ -e ] [ -- ] [ ... ] options: - -r : append a random none at the end of the global symlinks, - multiples update of the same sample and date will generate + -r : append a random nonce at the end of the global symlinks, + multiple updates of the same sample and date will generate multiple unique symlinks, one for each update. -R : no random nonce at the end of the global symlinks, - they will be not unique, a sample and date will always have + they will not be unique, a sample and date will always have a single symlink, no matter how many time it was updated. 
[ Default: random nonce are $( if (( do_random_nonce )); then echo "enabled"; else echo "disabled"; fi ) ] diff --git a/workflow/scripts/report_sequences.py b/workflow/scripts/report_sequences.py index f8d9f975e..a2c0dd2e2 100644 --- a/workflow/scripts/report_sequences.py +++ b/workflow/scripts/report_sequences.py @@ -24,7 +24,7 @@ timestamp = datetime.fromtimestamp(f.stat().st_ctime, tz=LOCAL_TIMEZONE) sequences = [] - dehuamized_raw_reads = f.parent.parent / "raw_data" / "dehuman.cram" + dehuamized_raw_reads = f.parent.parent / "raw_uploads" / "dehuman.cram" if dehuamized_raw_reads.exists(): raw_data_files.append(dehuamized_raw_reads) raw_data = str(dehuamized_raw_reads) From 4188d26a8569e044d7acad4c3487461999d06709 Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Fri, 4 Mar 2022 12:12:04 +0100 Subject: [PATCH 30/33] Re-use extract's output + text - When _NOT_ in catchup mode, re-use the extracted_data/R?.fastq that is produced as part of the normal processing. (Note: unlike prinseq, the sorting isn't required for BWA) - Change text to "host" instead of "human" or "H. Sapiens" --- workflow/rules/dehuman.smk | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/workflow/rules/dehuman.smk b/workflow/rules/dehuman.smk index 27a9d2514..a307acd4e 100644 --- a/workflow/rules/dehuman.smk +++ b/workflow/rules/dehuman.smk @@ -110,7 +110,7 @@ rule dh_hostalign: # create index if not exists: # test -f {input.ref_index} || {params.BWA} index {input.host_ref} """ - echo "Checking rejects against Homo Sapiens ---------------------------" + echo "Checking rejects against host's genome --------------------------" {params.BWA} mem -t {threads} \ -o {output.host_aln}\ @@ -123,9 +123,12 @@ rule dh_hostalign: rule dh_filter: input: host_aln=temp_prefix("{dataset}/alignments/host_aln.sam"), - # TODO switch to output of rule rule extract - R1=partial(raw_data_file, pair=1), - R2=partial(raw_data_file, pair=2), + R1=partial(raw_data_file, pair=1) + if config["dehuman"]["catchup"] + else temp_prefix("{dataset}/extracted_data/R1.fastq"), + R2=partial(raw_data_file, pair=2) + if config["dehuman"]["catchup"] + else temp_prefix("{dataset}/extracted_data/R2.fastq"), output: filter_count="{dataset}/alignments/dehuman.count", filter_list=temp_with_prefix("{dataset}/alignments/dehuman.filter"), @@ -175,7 +178,7 @@ rule dh_filter: echo "Needs special care: ${{count}} potential human reads found" echo "-----------------------------------------------------------------" echo - echo "Removing identified human reads from raw reads ------------------" + echo "Removing identified host reads from raw reads -------------------" echo # get list @@ -200,7 +203,7 @@ rule dh_filter: # keep the rejects for further analysis echo - echo "Keeping Human-aligned virus' rejects -----------------------------" + echo "Keeping host-aligned virus' rejects ------------------------------" echo @@ -216,7 +219,7 @@ rule dh_filter: -o {params.host_aln_cram} echo - echo "Compressing human-depleted raw reads -----------------------------" + echo "Compressing host-depleted raw reads ------------------------------" echo {params.SAMTOOLS} index -@ {threads} {params.host_aln_cram} @@ -226,11 +229,8 @@ rule dh_filter: echo "No potential human reads found -----------------------------------" echo "Copy raw reads file" echo - # HACK cheating as currently we only use gziped data - #unpack_rawreads {input.R1} | gzip > {output.filtered_1} & - #unpack_rawreads {input.R2} | gzip > {output.filtered_2} & - cat 
{input.R1} > {output.filtered_1} & - cat {input.R2} > {output.filtered_2} & + unpack_rawreads {input.R1} | gzip > {output.filtered_1} & + unpack_rawreads {input.R2} | gzip > {output.filtered_2} & wait touch {output.filter_list} fi From adf040fbdd8e2a05a03df6d1b4055f420e74a5da Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Fri, 11 Mar 2022 23:22:23 +0100 Subject: [PATCH 31/33] (Partial) support for on-line Host genome fetching - Can use an URL for host genome (Homo Sapiens) - re-organising bwa indexing code to support this - can still crash in various circumstance, considered unstable --- workflow/Snakefile | 2 +- workflow/rules/align.smk | 37 ++++++++++------------------- workflow/rules/clean.smk | 9 ++----- workflow/rules/dehuman.smk | 8 +++---- workflow/rules/publish.smk | 4 ++-- workflow/schemas/config_schema.json | 6 ++--- 6 files changed, 25 insertions(+), 41 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 442bafe83..4ac54a790 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -31,7 +31,6 @@ rule allfastqc: fastqc_files, -include: "rules/clean.smk" include: "rules/quality_assurance.smk" include: "rules/align.smk" include: "rules/consensus.smk" @@ -39,6 +38,7 @@ include: "rules/mafs.smk" include: "rules/stats.smk" include: "rules/publish.smk" include: "rules/dehuman.smk" +include: "rules/clean.smk" if config.output["snv"] or config.output["local"]: diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk index ddf629e69..65a829c2f 100644 --- a/workflow/rules/align.smk +++ b/workflow/rules/align.smk @@ -444,20 +444,24 @@ rule sam2bam: """ +bwa_idx_ext = [".bwt", ".amb", ".ann", ".pac", ".sa"] +bowtie_idx_ext = [".1.bt2", ".2.bt2", ".3.bt2", ".4.bt2", ".rev.1.bt2", ".rev.2.bt2"] + + rule ref_bwa_index: input: - reference_file, + "{file}", output: - "{}.bwt".format(reference_file), #all indexing files: .amb .ann .bwt .fai .pac .sa + multiext("{file}", *bwa_idx_ext), params: BWA=config.applications["bwa"], log: - outfile="references/bwa_index.out.log", - errfile="references/bwa_index.err.log", + outfile="{file}_bwa_index.out.log", + errfile="{file}_bwa_index.err.log", conda: config.ref_bwa_index["conda"] benchmark: - "references/ref_bwa_index.benchmark" + "{file}_bwa_index.benchmark" group: "align" resources: @@ -488,7 +492,7 @@ if config.general["aligner"] == "bwa": input: FASTQ=input_align_gz, REF=reference_file, - INDEX="{}.bwt".format(reference_file), + INDEX=multiext(reference_file, *bwa_idx_ext), output: REF=temp_with_prefix("{dataset}/alignments/REF_aln.sam"), TMP_SAM=temp_with_prefix("{dataset}/alignments/tmp_aln.sam"), @@ -525,12 +529,7 @@ elif config.general["aligner"] == "bowtie": input: reference_file, output: - INDEX1="{}.1.bt2".format(reference_file), - INDEX2="{}.2.bt2".format(reference_file), - INDEX3="{}.3.bt2".format(reference_file), - INDEX4="{}.4.bt2".format(reference_file), - INDEX5="{}.rev.1.bt2".format(reference_file), - INDEX6="{}.rev.2.bt2".format(reference_file), + multiext(reference_file, *bowtie_idx_ext), params: BOWTIE=config.applications["bowtie_idx"], log: @@ -560,12 +559,7 @@ elif config.general["aligner"] == "bowtie": R1="{dataset}/preprocessed_data/R1.fastq.gz", R2="{dataset}/preprocessed_data/R2.fastq.gz", REF=reference_file, - INDEX1="{}.1.bt2".format(reference_file), - INDEX2="{}.2.bt2".format(reference_file), - INDEX3="{}.3.bt2".format(reference_file), - INDEX4="{}.4.bt2".format(reference_file), - INDEX5="{}.rev.1.bt2".format(reference_file), - INDEX6="{}.rev.2.bt2".format(reference_file), + 
INDEX=multiext(reference_file, *bowtie_idx_ext), output: REF=temp_with_prefix("{dataset}/alignments/REF_aln.sam"), TMP_SAM=temp_with_prefix("{dataset}/alignments/tmp_aln.sam"), @@ -604,12 +598,7 @@ elif config.general["aligner"] == "bowtie": input: R1="{dataset}/preprocessed_data/R1.fastq.gz", REF=reference_file, - INDEX1="{}.1.bt2".format(reference_file), - INDEX2="{}.2.bt2".format(reference_file), - INDEX3="{}.3.bt2".format(reference_file), - INDEX4="{}.4.bt2".format(reference_file), - INDEX5="{}.rev.1.bt2".format(reference_file), - INDEX6="{}.rev.2.bt2".format(reference_file), + INDEX=multiext(reference_file, *bowtie_idx_ext), output: REF=temp_with_prefix("{dataset}/alignments/REF_aln.sam"), TMP_SAM=temp_with_prefix("{dataset}/alignments/tmp_aln.sam"), diff --git a/workflow/rules/clean.smk b/workflow/rules/clean.smk index 7d0287df1..0931903f7 100644 --- a/workflow/rules/clean.smk +++ b/workflow/rules/clean.smk @@ -60,7 +60,7 @@ rule alignclean: rule bwaclean: input: - "{}.bwt".format(reference_file), + INDEX=multiext(reference_file, *bwa_idx_ext), params: DIR=config.output["datadir"], shell: @@ -74,12 +74,7 @@ rule bwaclean: rule bowtieclean: input: - INDEX1="{}.1.bt2".format(reference_file), - INDEX2="{}.2.bt2".format(reference_file), - INDEX3="{}.3.bt2".format(reference_file), - INDEX4="{}.4.bt2".format(reference_file), - INDEX5="{}.rev.1.bt2".format(reference_file), - INDEX6="{}.rev.2.bt2".format(reference_file), + INDEX=multiext(reference_file, *bowtie_idx_ext), params: DIR=config.output["datadir"], shell: diff --git a/workflow/rules/dehuman.smk b/workflow/rules/dehuman.smk index a307acd4e..6174a3843 100644 --- a/workflow/rules/dehuman.smk +++ b/workflow/rules/dehuman.smk @@ -43,7 +43,7 @@ rule dh_redo_alignreject: # (useful if aligner's (temp) rejects have been deleted by now) input: global_ref=reference_file, - ref_index="{}.bwt".format(reference_file), + ref_index=multiext(reference_file, *bwa_idx_ext), fastq=input_align_gz, output: tmp_aln=temp_with_prefix("{dataset}/alignments/dh_aln.sam"), @@ -85,8 +85,8 @@ rule dh_redo_alignreject: rule dh_hostalign: input: - host_ref=config.dehuman["ref_host"], - ref_index="{}.bwt".format(config.dehuman["ref_host"]), + host_ref=cachepath(config.dehuman["ref_host"]), + ref_index=multiext(cachepath(config.dehuman["ref_host"]), *bwa_idx_ext), reject_1=rules.dh_redo_alignreject.output.reject_1 if config["dehuman"]["catchup"] else rules.dh_reuse_alignreject.output.reject_1, @@ -244,7 +244,7 @@ rule dh_filter: rule dehuman: input: global_ref=reference_file, - ref_index="{}.bwt".format(reference_file), + ref_index=multiext(reference_file, *bwa_idx_ext), filtered_1=rules.dh_filter.output.filtered_1, # =temp_prefix("{dataset}/raw_uploads/filtered_1.fastq.gz"), filtered_2=rules.dh_filter.output.filtered_2, # =temp_prefix("{dataset}/raw_uploads/filtered_2.fastq.gz"), output: diff --git a/workflow/rules/publish.smk b/workflow/rules/publish.smk index ef2e4f660..cbe12e32f 100644 --- a/workflow/rules/publish.smk +++ b/workflow/rules/publish.smk @@ -68,8 +68,8 @@ rule prepare_upload: rule unfiltered_cram: input: global_ref=reference_file, - ref_index="{}.bwt".format(reference_file), - # TODO switch to output of rule rule extract + ref_index=multiext(reference_file, *bwa_idx_ext), + # NOTE not relying on rule extract - if done as a catchup these might not exist R1=partial(raw_data_file, pair=1), R2=partial(raw_data_file, pair=2), output: diff --git a/workflow/schemas/config_schema.json b/workflow/schemas/config_schema.json index f9a51f1e1..6e3f57f7c 100644 
--- a/workflow/schemas/config_schema.json +++ b/workflow/schemas/config_schema.json @@ -1268,9 +1268,9 @@ }, "ref_host": { "type": "string", - "default": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.29_GRCh38.p14/GCA_000001405.29_GRCh38.p14_genomic.fna.gz", - "description": "Host's genome used to remove reads (e.g. human genome)", - "examples": ["references/human.fa", "/cluster/project/igenomes/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/genome.fa"] + "default": "http://ftp.ensembl.org/pub/release-105/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz", + "description": "Host's genome used to remove reads (e.g. human genome)\n\n**Note:** although directly fetching the reference from the network can work, we _strongly advise_ to use local files (e.g., download the reference locally or use one provided by your cluster).", + "examples": ["references/human.fa", "/cluster/project/igenomes/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/genome.fa", "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.29_GRCh38.p14/GCA_000001405.29_GRCh38.p14_genomic.fna.gz"] }, "keep_host": { "type": "boolean", From b2de33206e5eecb26cb8f9e9b6b415f7821b14ab Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Sat, 12 Mar 2022 10:44:48 +0100 Subject: [PATCH 32/33] Redesigned support for URL to host genome --- workflow/rules/common.smk | 8 ++++++++ workflow/rules/dehuman.smk | 19 +++++++++++++++++-- workflow/schemas/config_schema.json | 10 ++++++++-- 3 files changed, 33 insertions(+), 4 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index c08b612bd..5eac4171b 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -440,6 +440,7 @@ for p in patient_list: visualizations.append(os.path.join(sdir, "visualization/snv_calling.html")) visualizations.append(os.path.join(sdir, "visualization/alignment.html")) + # upload related stuff if config.output["dehumanized_raw_reads"]: dehumanized_raw_reads.append(os.path.join(sdir, "raw_uploads", "dehuman.cram")) @@ -500,6 +501,13 @@ if not VPIPE_BENCH: else: reference_name = get_reference_name(reference_file) + if config.output["dehumanized_raw_reads"] and not os.path.isfile( + config.dehuman["ref_host"] + ): + LOGGER.warning( + f"WARNING: Host organism reference file {config.dehuman['ref_host']} not found, it will be downloaded from {config.dehuman['ref_host_url']}." + ) + # Auxiliary functions diff --git a/workflow/rules/dehuman.smk b/workflow/rules/dehuman.smk index 6174a3843..2c2f0dbc6 100644 --- a/workflow/rules/dehuman.smk +++ b/workflow/rules/dehuman.smk @@ -1,6 +1,10 @@ from functools import partial +localrules: + download_host_ref, + + rule dh_reuse_alignreject: # "rely on aligner's output". # this rule re-use the rejected reads in align.smk (e.g. 
ngshmmalign's /alignments/rejects.sam) @@ -83,10 +87,21 @@ rule dh_redo_alignreject: """ +rule download_host_ref: + output: + host_ref=config.dehuman["ref_host"], + params: + host_ref_url=config.dehuman["ref_host_url"], + shell: + """ + curl --output "{output.host_ref}" "{params.host_ref_url}" + """ + + rule dh_hostalign: input: - host_ref=cachepath(config.dehuman["ref_host"]), - ref_index=multiext(cachepath(config.dehuman["ref_host"]), *bwa_idx_ext), + host_ref=config.dehuman["ref_host"], + ref_index=multiext(config.dehuman["ref_host"], *bwa_idx_ext), reject_1=rules.dh_redo_alignreject.output.reject_1 if config["dehuman"]["catchup"] else rules.dh_reuse_alignreject.output.reject_1, diff --git a/workflow/schemas/config_schema.json b/workflow/schemas/config_schema.json index 6e3f57f7c..4ddf01809 100644 --- a/workflow/schemas/config_schema.json +++ b/workflow/schemas/config_schema.json @@ -1267,10 +1267,16 @@ "default": 4 }, "ref_host": { + "type": "string", + "default": "references/human.fa.gz", + "description": "Host's genome used to remove reads (e.g. human genome)\n\n**Note:** if this file is absent, it is possible to fetch it from a remote server, see property `ref_host_url` below.", + "examples": ["/cluster/project/igenomes/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/genome.fa"] + }, + "ref_host_url": { "type": "string", "default": "http://ftp.ensembl.org/pub/release-105/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz", - "description": "Host's genome used to remove reads (e.g. human genome)\n\n**Note:** although directly fetching the reference from the network can work, we _strongly advise_ to use local files (e.g., download the reference locally or use one provided by your cluster).", - "examples": ["references/human.fa", "/cluster/project/igenomes/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/genome.fa", "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.29_GRCh38.p14/GCA_000001405.29_GRCh38.p14_genomic.fna.gz"] + "description": "If the host's genome specified in property `ref_host` isn't present, fetch it from a remote server.\n\n**Note** remember to set aside enough memory for the indexing rule, see section `ref_bwa_index` property `mem`.", + "examples": [ "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.29_GRCh38.p14/GCA_000001405.29_GRCh38.p14_genomic.fna.gz"] }, "keep_host": { "type": "boolean", From f730d88cb18fbc1e5a164c5d25c7916a32812085 Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Sat, 12 Mar 2022 18:40:07 +0100 Subject: [PATCH 33/33] Documentation - small cosmetic fixes (mostly uppercase and final dot) - sync HTML manual --- config/config.html | 78 ++++++++++++++-------- workflow/schemas/config_schema.json | 100 ++++++++++++++-------------- 2 files changed, 101 insertions(+), 77 deletions(-) diff --git a/config/config.html b/config/config.html index 0f98b4d57..ddb2e4864 100644 --- a/config/config.html +++ b/config/config.html @@ -1,10 +1,10 @@ V-pipe configuration

V-pipe configuration

Type: object

The V-pipe workflow can be customized through the configuration file config.yaml or config.json or, for backward compatibility with the legacy INI-style format used in V-pipe v1.x/2.x, vpipe.config. This configuration file is a text file written using a basic structure composed of sections, properties and values. When using YAML or JSON format use these languages associative array/dictionaries in two levels for sections and properties. When using the older INI format, sections are expected in squared brackets, and properties are followed by corresponding values.

Further more, it is possible to specify additional options on the command line using Snakemake’s --configfile to pass additional YAML/JSON configuration files, and/or using Snakemake’s --config to pass sections and properties in a YAML Flow style/JSON syntax.

The order of precedence is:
command line options (--config, --configfile) >> default configuration file (config/config.yaml or config.yaml) >> legacy configuration INI (vpipe.config) >> Virus-specific base config (virus_based_config) >> default values

Example: For instance, we suggest providing as input a tabular file specifying sample unique identifiers (e.g., patient identifiers), and dates for different sequencing runs related to the same patient. The name of this file (here, samples.tsv) can be provided by specifying the section as input and the property as samples_file, as follows in the example below.

In this document, we provide a comprehensive list of all user-configurable options stratified by sections.


Example:

input:
   samples_file: samples.tsv
-

Type: object Default: {}

This section of the configuration provides general options that control the overall behavior of the pipeline

Type: string Default: ""

We provide virus-specific base configuration files which contain handy defaults for, e.g., HIV and SARS-CoV-2. Check the git repository’s config subdirectory to learn about them.


Examples:

hiv
+

Type: object Default: {}

This section of the configuration provides general options that control the overall behavior of the pipeline.

Type: string Default: ""

We provide virus-specific base configuration files which contain handy defaults for, e.g., HIV and SARS-CoV-2. Check the git repository’s config subdirectory to learn about them.


Examples:

hiv
 ...
 
sars-cov-2
 ...
-

Type: enum (of string) Default: "ngshmmalign"

There are three options for mapping reads, either using ngshmmalign, BWA MEM (bwa) 1, or Bowtie 2 (bowtie) 2. To use a different aligner than the default, indicate which aligner you want to use by setting the property aligner.

Note: Some virus-specific base configuration specified in virus_base_config might change this option’s default to a more appropriate aligner for that virus, e.g, depending on its usual diversity and mutation rate.
You are still free to override that default in your configuration shall the need arise.


  1. Li, H. Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. 2013. 

  2. Langmead, B. and Salzberg, S. Fast gapped-read alignment with Bowtie 2. Nature Methods. 2012. 

Must be one of:

  • "ngshmmalign"
  • "bwa"
  • "bowtie"

Example:

bowtie
+

Type: enum (of string) Default: "ngshmmalign"

There are three options for mapping reads, either using ngshmmalign, BWA MEM (bwa) 1, or Bowtie 2 (bowtie) 2. To use a different aligner than the default, indicate which aligner you want to use by setting the property aligner.

Note: Some virus-specific base configuration specified in virus_base_config might change this option’s default to a more appropriate aligner for that virus, e.g., depending on its usual diversity and mutation rate.
You are still free to override that default in your configuration shall the need arise.


  1. Li, H. Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. 2013. 

  2. Langmead, B. and Salzberg, S. Fast gapped-read alignment with Bowtie 2. Nature Methods. 2012. 

Must be one of:

  • "ngshmmalign"
  • "bwa"
  • "bowtie"

Example:

bowtie
 ...
 

Type: enum (of string) Default: "shorah"

There are two options available for calling single nucleotide variants, either using ShoRAH (shorah) 1 or LoFreq (lofreq) 2. ShoRAH is used by default. If you prefer to use LoFreq, then indicate so in the configuration file as in the example


  1. Zagordi, O. et al. ShoRAH: estimating the genetic diversity of a mixed sample from next-generation sequencing data. BMC Bioinformatics. 2011. 

  2. Wilm, A. et al. LoFreq: A sequence-quality aware, ultra-sensitive variant caller for uncovering cell-population heterogeneity from high-throughput sequencing datasets. Nucleic Acids Res. 2012. 

Must be one of:

  • "shorah"
  • "lofreq"

Example:

lofreq
 ...
@@ -12,11 +12,13 @@
 ...
 

Type: integer Default: 1

This option should be used to specify the default number of threads for all multi-threaded rules. That is, unless the number of threads is specified for each rule, this value is set as default.


Example:

4
 ...
+

Type: enum (of string) Default: "md5"

Sets the algorithm to be used when computing checksums for uploadable data.

Must be one of:

  • "md5"
  • "sha1"
  • "sha256"
  • "sha224"
  • "sha384"
  • "sha512"
  • "xxh64"
  • "xxh32"
  • "xxh128"

Example:

sha256
+...
 

Type: string Default: ""

Some step of V-pipe produce temporary files such as, e.g., decompressed intermediate - i.e. files which aren’t kept long-term but are deleted after all steps that needed them have finished. By default, these files are written in the output data directory. This option, makes it is possible to write them in a different directory instead. Use this option to, e.g., leverage a faster cluster-local storage or avoid wasting backup space on a snapshotted storage. You might want to consult the documentation provided by your HPC.


Examples:

temp
 ...
 
/cluster/scratch
 ...
-

Type: object Default: {}

Properties in this section of the configuration control the input of the pipeline

Type: string Default: "samples/"

The input file for the workflow will be searched in this directory.

V-pipe expects the input samples to be organized in a two-level directory hierarchy.

  • The first level can be, e.g., patient samples or biological replicates of an experiment.
  • The second level can be, e.g., different sampling dates or different sequencing runs of the same sample.
  • Inside that directory, the sub-directory raw_data holds the sequencing data in FASTQ format (optionally compressed with GZip).

For example:

samples
+

Type: object Default: {}

Properties in this section of the configuration control the input of the pipeline.

Type: string Default: "samples/"

The input file for the workflow will be searched in this directory.

V-pipe expects the input samples to be organized in a two-level directory hierarchy.

  • The first level can be, e.g., patient samples or biological replicates of an experiment.
  • The second level can be, e.g., different sampling dates or different sequencing runs of the same sample.
  • Inside that directory, the sub-directory raw_data holds the sequencing data in FASTQ format (optionally compressed with GZip).

For example:

samples
 ├── patient1
 │   ├── 20100113
 │   │   └──raw_data
@@ -47,7 +49,7 @@
 

Type: string Default: "config/samples.tsv"

File containing sample unique identifiers and dates as tab-separated values, e.g.,

patient1    20100113
 patient1    20110202
 patient2    20081130
-

Here, we have two samples from patient 1 and one sample from patient 2. By default, V-pipe searches for a file named samples.tsv, if this file does not exist, a list of samples is built by globbing datadir directory contents.

Optionally, the samples file can contain a third column specifying the read length. This is particularly useful when samples are sequenced using protocols with different read lengths.

Standardized Snakemake workflows place their tables inside the config/ subdirectory, but using this options you can specify alternate locations, e.g., the current working directory (as done in legacy V-pipe v1.x/2.x)


Example:

samples.tsv
+

Here, we have two samples from patient 1 and one sample from patient 2. By default, V-pipe searches for a file named samples.tsv, if this file does not exist, a list of samples is built by globbing datadir directory contents.

Optionally, the samples file can contain a third column specifying the read length. This is particularly useful when samples are sequenced using protocols with different read lengths.

Standardized Snakemake workflows place their tables inside the config/ subdirectory, but using this options you can specify alternate locations, e.g., the current working directory (as done in legacy V-pipe v1.x/2.x).


Example:

samples.tsv
 ...
 

Type: integer Default: 250

Default for those samples whose read length isn’t specified explicitly in the optional third column of the samples.tsv table.


Example:

'100'
 

Type: number Default: 0.8

Using this parameter, the user can specify the read-length threshold that should be applied during the quality trimming as a percentage (0 < trim_percent_cutoff < 1).


Example:

0.9
@@ -68,7 +70,7 @@
 ...
 

Type: string Default: ""

A FASTQ file with sequences of interest

Note: These sequences are used, together with the consensus sequence, to build a phylogenetic tree.


Example:

resources/sars-cov-2/phylogeny/selected_covid_sequences.fasta
 ...
-

Type: object Default: {}

Properties in this section of the configuration control the output of the pipeline

Type: string Default: "results"

The workflow will write its output files into this directory. This will follow the same structure as for the input.

For each sample, V-pipe produces several output files that are located in the corresponding sample-specific directory. First, the alignment file and consensus sequences are located in the alignments and references subdirectories, respectively. Second, output files containing SNVs and viral haplotypes are located in the variants subdirectories.

Using the sample example as in the input section, the output files for the two patient samples will be located in the following subdirectories:

results
+

Type: object Default: {}

Properties in this section of the configuration control the output of the pipeline.

Type: string Default: "results"

The workflow will write its output files into this directory. This will follow the same structure as for the input.

For each sample, V-pipe produces several output files that are located in the corresponding sample-specific directory. First, the alignment file and consensus sequences are located in the alignments and references subdirectories, respectively. Second, output files containing SNVs and viral haplotypes are located in the variants subdirectories.

Using the same example as in the input section, the output files for the two patient samples will be located in the following subdirectories:

results
 ├──patient1
 │  ├──20100113
 │  │  ├──alignments
@@ -115,51 +117,73 @@
 │  ├──20100113
 │  │  ├──alignments
 
-

If you prefer instead, e.g., such cohort-wide results behind written in a subdirectory of the working directory at the same level as the datadirs, you can use this options you can specify alternate subdirectory relative to the datadir property. (Use .. prefix if you want instead your cohort-wide results to be in a directory at the sample level as samples/ and results/. See the example below to recreate the variants/ directory used by legacy V-pipe v1.x/2.x)


Example:

../variants
+

If you prefer such cohort-wide results to be written in a subdirectory of the working directory at the same level as the datadirs, you can use this option to specify an alternate subdirectory relative to the datadir property. (Use a .. prefix if you want your cohort-wide results in a directory at the same level as samples/ and results/. See the example below to recreate the variants/ directory used by legacy V-pipe v1.x/2.x).


Example:

../variants
 ...
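For instance, a sketch of an output section that recreates the legacy layout (property names as documented above; paths purely illustrative):

```yaml
output:
  datadir: results        # per-sample outputs, mirroring the input directory structure
  cohortdir: ../variants  # cohort-wide results next to samples/ and results/, as in V-pipe v1.x/2.x
```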
-

Type: boolean Default: false

V-pipe can produce several outputs to assess the quality of the output of its steps, e.g., checking whether a sample’s consensus sequence generated by bctfools does result in frameshifting indels and writing a report in sample’s …/references/frameshift_deletions_check.tsv. Such reports can be useful when submitting sequences to GISAID.

This option turns on such QA features


Example:

true
+

Type: boolean Default: false

V-pipe can produce several outputs to assess the quality of the output of its steps, e.g., checking whether a sample’s consensus sequence generated by bcftools results in frameshifting indels and writing a report in the sample’s …/references/frameshift_deletions_check.tsv. Such reports can be useful when submitting sequences to GISAID.

This option turns on such QA features.


Example:

true
 ...
-

Type: boolean Default: false

This option selects whether the SNV caller step should be executed and its output written to each sample’s …/variants/SNVs/snvs.csv


Example:

true
+

Type: boolean Default: false

This option selects whether the SNV caller step should be executed and its output written to each sample’s …/variants/SNVs/snvs.csv.


Example:

true
 ...
-

Type: boolean Default: false

This option activates local haplotype reconstruction (only available when using ShoRAH)


Example:

true
+

Type: boolean Default: false

This option activates local haplotype reconstruction (only available when using ShoRAH).


Example:

true
 ...
-

Type: boolean Default: false

This option turns on global haplotype reconstruction


Example:

true
+

Type: boolean Default: false

This option turns on global haplotype reconstruction.


Example:

true
 ...
-

Type: boolean Default: false

This option selects whether to generate HTML visualization of the SNVs in each sample’s …/visualization/index.html


Example:

true
+

Type: boolean Default: false

This option selects whether to generate HTML visualization of the SNVs in each sample’s …/visualization/index.html.


Example:

true
 ...
 

Type: boolean Default: false

This option turns on the computation of diversity measures in each sample.


Example:

true
 ...
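Taken together, a sketch of an output section enabling the optional steps described above might read (all property names as documented; enable only what you actually need):

```yaml
output:
  QA: true             # frameshift_deletions_check.tsv reports, e.g. for GISAID submissions
  snv: true            # per-sample SNV calls in …/variants/SNVs/snvs.csv
  local: true          # local haplotype reconstruction (ShoRAH only)
  global: false        # global haplotype reconstruction
  visualization: true  # HTML SNV visualization in …/visualization/index.html
  diversity: true      # per-sample diversity measures
```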
-

Type: object Default: {}

The path to the different software packages can be specified using this section.

It is especially useful when dependencies are not obtained via conda such as VICUNA, and when the software packages are not in the PATH.

Note we strongly recommend to use conda environments, by adding the --use-conda flag to the V-pipe execution command, e.g. ./vpipe --use-conda. If you prefer to use your own installations, this section allows you to specify the location of the executables


Example:

bwa: /path/to/bwa
+

Type: boolean Default: false

This option turns on dehumanization of the raw reads (i.e. removal of the host’s reads) and generates the file dehuman.cram. This is useful to prepare raw reads for upload to public databases such as the ENA (European Nucleotide Archive).

This only applies to the upload and does not affect the main workflow.


Example:

true
+...
+

Type: boolean Default: false

This option can be used for assistance in incremental upload of data. See section upload for an example.


Example:

true
+...
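If these two switches are plain output properties, enabling them might look like the sketch below; dehumanized_raw_reads is the name used in the upload section further down, while upload is only a guess for the incremental-upload toggle, so treat both names as assumptions and verify them against the schema:

```yaml
output:
  dehumanized_raw_reads: true  # produce dehuman.cram for submission to e.g. ENA (name taken from the upload section)
  upload: true                 # hypothetical name for the incremental-upload assistance toggle
```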
+

Type: object Default: {}

The path to the different software packages can be specified using this section.

It is especially useful when dependencies are not obtained via conda such as VICUNA, and when the software packages are not in the PATH.

Note: we strongly recommend using conda environments by adding the --use-conda flag to the V-pipe execution command, e.g. ./vpipe --use-conda. If you prefer to use your own installations, this section allows you to specify the location of the executables.


Example:

bwa: /path/to/bwa
 haploclique: /path/to/haploclique
-

Type: string Default: "gunzip"

Type: string Default: "prinseq-lite.pl"

Type: string Default: "fastqc"

Type: string Default: "vicuna"

Due to a special license, VICUNA is not available from bioconda and must be installed from its original website.
Use this option to specify where you have installed its executable

Type: string Default: "InDelFixer"

Type: string Default: "ConsensusFixer"

Type: string Default: "picard"

Type: string Default: "bwa"

Type: string Default: "bowtie2-build"

Type: string Default: "bowtie2"

Type: string Default: "samtools"

Type: string Default: "extract_consensus"

Type: string Default: "matcher"

Type: string Default: "frameshift_deletions_checks"

Type: string Default: "mafft"

Type: string Default: "ngshmmalign"

Type: string Default: "convert_reference"

Type: string Default: "extract_seq"

Type: string Default: "coverage_stats"

Type: string Default: "remove_gaps_msa"

Type: string Default: "aln2basecnt"

Type: string Default: "gather_coverage"

Type: string Default: "minority_freq"

Type: string Default: "extract_coverage_intervals"

Type: string Default: "shorah shotgun"

Type: string Default: "lofreq"

Type: string Default: "bcftools"

Type: string Default: "haploclique"

Type: string Default: "compute_mds"

Type: string Default: "savage"

Type: string Default: "predicthaplo"

Type: object Default: {}

Type: integer Default: 32

Type: integer Default: 60

Type: object Default: {}

Type: integer Default: 4096

Type: integer Default: 20

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/preprocessing.yaml"

Type: string Default: "-ns_max_n 4 -min_qual_mean 30 -trim_qual_left 30 -trim_qual_right 30 -trim_qual_window 10"

We use software PRINSEQ 1 for quality control. By default, we use options -ns_max_n 4 -min_qual_mean 30 -trim_qual_left 30 -trim_qual_right 30 -trim_qual_window 10, which indicates to trim reads using a sliding window with size 10 bp, and trim bases if their quality scores are less than 30. Additionally, reads are filtered out if the average quality score is below 30 and if they contain more than 4 N’s. The user can choose to overwrite the default settings or use additional parameters by using the property extra. E.g., if many reads are filtered out in this step, the user can choose to lower the quality threshold as indicated in the example.
Please do not modify PRINSEQ options -out_format, -out_good, nor -min_len. Instead of using -min_len to define threshold on the read length after trimming, use input => trim_percent_cutoff.


  1. Schmieder, R. and Edwards, R. Quality control and preprocessing of metagenomic datasets. Bioinformatics. 2011. 


Example:

-ns_max_n 4 -min_qual_mean 20 -trim_qual_left 20 -trim_qual_right 20 -trim_qual_window
+

Type: string Default: "gunzip"

Type: string Default: "prinseq-lite.pl"

Type: string Default: "fastqc"

Type: string Default: "vicuna"

Due to a special license, VICUNA is not available from bioconda and must be installed from its original website.
Use this option to specify where you have installed its executable.

Type: string Default: "InDelFixer"

Type: string Default: "ConsensusFixer"

Type: string Default: "picard"

Type: string Default: "bwa"

Type: string Default: "bowtie2-build"

Type: string Default: "bowtie2"

Type: string Default: "samtools"

Type: string Default: "extract_consensus"

Type: string Default: "matcher"

Type: string Default: "frameshift_deletions_checks"

Type: string Default: "mafft"

Type: string Default: "ngshmmalign"

Type: string Default: "convert_reference"

Type: string Default: "extract_seq"

Type: string Default: "coverage_stats"

Type: string Default: "remove_gaps_msa"

Type: string Default: "aln2basecnt"

Type: string Default: "gather_coverage"

Type: string Default: "minority_freq"

Type: string Default: "extract_coverage_intervals"

Type: string Default: "shorah shotgun"

Type: string Default: "lofreq"

Type: string Default: "bcftools"

Type: string Default: "haploclique"

Type: string Default: "compute_mds"

Type: string Default: "savage"

Type: string Default: "predicthaplo"

Type: object Default: {}

Type: integer Default: 32

Type: integer Default: 60

Type: object Default: {}

Type: integer Default: 4096

Type: integer Default: 20

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/preprocessing.yaml"

Type: string Default: "-ns_max_n 4 -min_qual_mean 30 -trim_qual_left 30 -trim_qual_right 30 -trim_qual_window 10"

We use the software PRINSEQ 1 for quality control. By default, we use the options -ns_max_n 4 -min_qual_mean 30 -trim_qual_left 30 -trim_qual_right 30 -trim_qual_window 10, which trim reads using a sliding window of size 10 bp and trim bases whose quality scores are less than 30. Additionally, reads are filtered out if their average quality score is below 30 or if they contain more than 4 N’s. The user can choose to overwrite the default settings or add parameters by using the property extra. E.g., if many reads are filtered out in this step, the user can choose to lower the quality threshold as indicated in the example.
Please do not modify the PRINSEQ options -out_format, -out_good, nor -min_len. Instead of using -min_len to define a threshold on the read length after trimming, use input => trim_percent_cutoff.


  1. Schmieder, R. and Edwards, R. Quality control and preprocessing of metagenomic datasets. Bioinformatics. 2011. 


Example:

-ns_max_n 4 -min_qual_mean 20 -trim_qual_left 20 -trim_qual_right 20 -trim_qual_window
   10
 ...
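To relax the quality threshold as in the example above, one would override the extra property of the PRINSEQ step; the section name preprocessing is an assumption derived from the envs/preprocessing.yaml environment file:

```yaml
preprocessing:
  extra: "-ns_max_n 4 -min_qual_mean 20 -trim_qual_left 20 -trim_qual_right 20 -trim_qual_window 10"
  # do not add -out_format, -out_good or -min_len here; use input => trim_percent_cutoff instead
```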
-

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/fastqc.yaml"

Type: integer Default: 6

Type: boolean Default: false

Type: object Default: {}

NOTE The conda environment for this rule doesn’t work properly. The package on the bioconda channel, mvicuna, is slightly different from VICUNA and it has different command-line arguments. Moreover, VICUNA and mvicuna are no longer maintained. In the future, this rule will be deprecated.

Type: integer Default: 1000

Type: integer Default: 600

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/initial_vicuna.yaml"

Type: object Default: {}

NOTE Obtaining a initial reference de novo is implemented for more than one sample.

Type: integer Default: 10000

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/initial_vicuna_msa.yaml"

Type: object Default: {}

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 1435

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/hmm_align.yaml"

Type: boolean Default: false

this option is useful for debugging purposes


Example:

true
+

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/fastqc.yaml"

Type: integer Default: 6

Type: boolean Default: false

Type: object Default: {}

NOTE The conda environment for this rule doesn’t work properly. The package on the bioconda channel, mvicuna, is slightly different from VICUNA and it has different command-line arguments. Moreover, VICUNA and mvicuna are no longer maintained. In the future, this rule will be deprecated.

Type: integer Default: 1000

Type: integer Default: 600

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/initial_vicuna.yaml"

Type: object Default: {}

NOTE Obtaining an initial reference de novo is implemented for more than one sample.

Type: integer Default: 10000

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/initial_vicuna_msa.yaml"

Type: object Default: {}

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 1435

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/hmm_align.yaml"

Type: boolean Default: false

This option is useful for debugging purposes.


Example:

true
 ...
-

Type: string Default: ""

pass additional options to run ngshmmalign

V-pipe uses option -R <path/to/initial_reference>, thus option -r arg is not allowed. Also, instead of passing -l via the property extra, set leave_msa_temp to True. Lastly, please do not modify options -o arg, -w arg, -t arg, and -N arg. These are already managed by V-pipe.

Type: object Default: {}

Type: integer Default: 5000

Type: integer Default: 30

Type: string Default: "{VPIPE_BASEDIR}/envs/sam2bam.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/bwa_QA.yaml"

Type: string Default: ""

Panel of diverse references against which to align reads as a QA step

Note: The virus-specific base configuration specified in general => virus_base_config will most likely change this option’s default.
You are still free to override that default in your configuration shall the need arise.


Example:

resources/hiv/5-Virus-Mix.fasta
+

Type: string Default: ""

Pass additional options to run ngshmmalign.

V-pipe uses option -R <path/to/initial_reference>, thus option -r arg is not allowed. Also, instead of passing -l via the property extra, set leave_msa_temp to True. Lastly, please do not modify options -o arg, -w arg, -t arg, and -N arg. These are already managed by V-pipe.

Type: object Default: {}

Type: integer Default: 5000

Type: integer Default: 30

Type: string Default: "{VPIPE_BASEDIR}/envs/sam2bam.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/bwa_QA.yaml"

Type: string Default: ""

Panel of diverse references against which to align reads as a QA step.

Note: The virus-specific base configuration specified in general => virus_base_config will most likely change this option’s default.
You are still free to override that default in your configuration should the need arise.


Example:

resources/hiv/5-Virus-Mix.fasta
 ...
-

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: string Default: "HXB2:6614-6812,7109-7217,7376-7478,7601-7634"

Type: object Default: {}

This rule takes all previously aligned reads by hmm_align. Therefore, resources should be allocated accordingly.

Type: integer Default: 10000

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/msa.yaml"

Type: object Default: {}

Type: integer Default: 8000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/bwa_align.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/bwa_align.yaml"

Type: string Default: ""

With property extra, users can pass additional options to run BWA MEM. For more details on BWA MEM configurable options refer to the software documentation.

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/bowtie_align.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/bowtie_align.yaml"

Type: enum (of string) Default: "--phred33"

indicate if qualities are Phred+33 (default) or Phred+64 (--phred64)

Must be one of:

  • "--phred33"
  • "--phred64"

Example:

--phred64
+

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: string Default: "HXB2:6614-6812,7109-7217,7376-7478,7601-7634"

Type: object Default: {}

This rule takes all previously aligned reads by hmm_align. Therefore, resources should be allocated accordingly.

Type: integer Default: 10000

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/msa.yaml"

Type: object Default: {}

Type: integer Default: 8000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/bwa_align.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/bwa_align.yaml"

Type: string Default: ""

With property extra, users can pass additional options to run BWA MEM. For more details on BWA MEM configurable options refer to the software documentation.

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/bowtie_align.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/bowtie_align.yaml"

Type: enum (of string) Default: "--phred33"

Indicate if qualities are Phred+33 (default) or Phred+64 (--phred64).

Must be one of:

  • "--phred33"
  • "--phred64"

Example:

--phred64
 ...
-

Type: string Default: "--local --sensitive-local"

specify Bowtie 2 presets

Type: integer

Type: string Default: ""

pass additional options to run Bowtie 2. V-pipe handles the input and output files, as well as the reference sequence. Thus, do not modify these options
For more details on Bowtie 2 configurable options refer to the software documentation.

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: integer Default: 50

minimum read depth for reporting variants per locus

Type: integer Default: 5

read count below which ambiguous base ’n’ is reported

Type: integer Default: 15

minimum phred quality score for a base to be included

Type: number Default: 0.05

minimum frequency for an ambiguous nucleotide

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/bcftools.yaml"

Type: integer Default: 10000

Type: integer Default: 10

Type: number Default: 0.05

Type: object Default: {}

Type: integer Default: 4096

Type: integer Default: 30

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/consseq_qa.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 30

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: string Default: ""

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 30

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 30

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: object Default: {}

Type: integer Default: 1000

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: integer Default: 100

minimum read depth for reporting variants per locus


Example:

50
+

Type: string Default: "--local --sensitive-local"

Specify Bowtie 2 presets.

Type: integer

Type: string Default: ""

Pass additional options to run Bowtie 2. V-pipe handles the input and output files, as well as the reference sequence. Thus, do not modify these options.
For more details on Bowtie 2 configurable options refer to the software documentation.
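A sketch of overriding the Bowtie 2 options discussed above; the section name bowtie_align is an assumption based on the envs/bowtie_align.yaml environment, and the extra value is purely illustrative:

```yaml
bowtie_align:
  phred: "--phred64"                    # qualities encoded as Phred+64
  preset: "--local --sensitive-local"   # documented default preset
  extra: "--no-mixed"                   # illustrative only; never override the input/output/reference
                                        # options handled by V-pipe
```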

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: integer Default: 50

Minimum read depth for reporting variants per locus.

Type: integer Default: 5

Read count below which ambiguous base ’n’ is reported.

Type: integer Default: 15

Minimum phred quality score for a base to be included.

Type: number Default: 0.05

Minimum frequency for an ambiguous nucleotide.

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/bcftools.yaml"

Type: integer Default: 10000

Type: integer Default: 10

Type: number Default: 0.05

Type: object Default: {}

Type: integer Default: 4096

Type: integer Default: 30

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/consseq_qa.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 30

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: string Default: ""

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 30

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 30

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: object Default: {}

Type: integer Default: 1000

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: integer Default: 100

Minimum read depth for reporting variants per locus.


Example:

50
 ...
-

Type: boolean Default: false

output a numpy array file containing frequencies of all bases, including gaps and also the most abundant base across samples


Example:

true
+

Type: boolean Default: false

Output a numpy array file containing frequencies of all bases, including gaps and also the most abundant base across samples.


Example:

true
 ...
-

Type: object Default: {}

Type: integer Default: 1000

Type: integer Default: 30

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: boolean Default: false

construct intervals based on overlapping windows of the read alignment. By default, regions with high coverage are built based on the position-wise read depth


Example:

true
+

Type: object Default: {}

Type: integer Default: 1000

Type: integer Default: 30

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: boolean Default: false

Construct intervals based on overlapping windows of the read alignment. By default, regions with high coverage are built based on the position-wise read depth.


Example:

true
 ...
-

Type: integer Default: 50

minimum read depth. A region spanning the reference genome is returned if coverage is set to 0


Example:

0
+

Type: integer Default: 50

Minimum read depth. A region spanning the reference genome is returned if coverage is set to 0.


Example:

0
 ...
-

Type: boolean Default: true

indicate whether to apply a more liberal shifting on intervals’ right-endpoint


Example:

false
+

Type: boolean Default: true

Indicate whether to apply a more liberal shifting on intervals’ right-endpoint.


Example:

false
 ...
-

Type: object Default: {}

Type: integer Default: 10000

Type: integer Default: 2880

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/snv.yaml"

Type: boolean Default: true

indicate whether to use the cohort-consensus sequence from the analyzed samples (output from minor_variants rule located in the cohort-wide output results/cohort_onsensus.fasta) or the reference sequence by setting this option to False


Example:

false
+

Type: object Default: {}

Type: integer Default: 10000

Type: integer Default: 2880

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/snv.yaml"

Type: boolean Default: true

Indicate whether to use the cohort-consensus sequence from the analyzed samples (output of the minor_variants rule, located in the cohort-wide output results/cohort_consensus.fasta); set this option to False to use the reference sequence instead.


Example:

false
 ...
-

Type: number Default: 0.1

hyperparameter used for instantiating a new cluster

Type: boolean Default: false

ignore SNVs adjacent to indels

Type: number Default: 0.9

Type: integer Default: 0

omit windows with coverage less than this value


Example:

50
+

Type: number Default: 0.1

Hyperparameter used for instantiating a new cluster.

Type: boolean Default: false

Ignore SNVs adjacent to indels.

Type: number Default: 0.9

Type: integer Default: 0

Omit windows with coverage less than this value.


Example:

50
 ...
-

Type: integer Default: 3

ShoRAH performs local haplotype reconstruction on windows of the read alignment. The overlap between these windows is defined by the window shifts. By default, it is set to 3, i.e., apart from flanking regions each position is covered by 3 windows

Type: boolean Default: false

indicate whether to move files produced in previous/interrupted runs to subdirectory named old


Example:

true
+

Type: integer Default: 3

ShoRAH performs local haplotype reconstruction on windows of the read alignment. The overlap between these windows is defined by the window shifts. By default, it is set to 3, i.e., apart from flanking regions each position is covered by 3 windows.

Type: boolean Default: false

Indicate whether to move files produced in previous/interrupted runs to a subdirectory named old.


Example:

true
 ...
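A sketch combining the ShoRAH-related properties documented above; the property names follow this documentation, but the section name snv is an assumption based on the envs/snv.yaml environment:

```yaml
snv:
  consensus: false     # use the reference instead of the cohort consensus
  alpha: 0.1           # hyperparameter for instantiating a new cluster
  ignore_indels: false # do not drop SNVs adjacent to indels
  coverage: 50         # omit windows with read depth below this value
  shift: 3             # apart from flanking regions, each position is covered by 3 windows
  keep_files: true     # move files from previous/interrupted runs to old/
```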
-

Type: string Default: ""

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 20

Type: string Default: "{VPIPE_BASEDIR}/envs/lofreq.yaml"

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 60

Type: string Default: "{VPIPE_BASEDIR}/envs/lofreq.yaml"

Type: boolean Default: true

indicate whether to use the cohort-consensus sequence from the analyzed samples (output from minor_variants rule located in the cohort-wide output results/cohort_onsensus.fasta) or the reference sequence by setting this option to False


Example:

false
+

Type: string Default: ""

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 20

Type: string Default: "{VPIPE_BASEDIR}/envs/lofreq.yaml"

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 60

Type: string Default: "{VPIPE_BASEDIR}/envs/lofreq.yaml"

Type: boolean Default: true

Indicate whether to use the cohort-consensus sequence from the analyzed samples (output of the minor_variants rule, located in the cohort-wide output results/cohort_consensus.fasta); set this option to False to use the reference sequence instead.


Example:

false
 ...
-

Type: string Default: ""

pass additional options to run lofreq call

Type: object Default: {}

Type: integer Default: 1000

Type: integer Default: 60

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: integer Default: 5

Type: object Default: {}

Type: string Default: "{VPIPE_BASEDIR}/envs/sam2bam.yaml"

Type: object Default: {}

NOTE This rule only works in Linux.

Type: integer Default: 10000

Type: integer Default: 1435

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/savage.yaml"

Type: integer Default: 20

size of the batches of reads to be processed by SAVAGE. It is recommended that 500 < coverage/split < 1000

Type: object Default: {}

Type: integer Default: 10000

Type: integer Default: 1435

Type: string Default: "{VPIPE_BASEDIR}/envs/haploclique.yaml"

Type: boolean Default: true

if set to True (default) a predefined set of parameter values is used for drawing edges between reads in the read graph

Type: boolean Default: true

singletons are defined as proposed haplotypes which are supported by a single read. If this property is set to True, singletons are discarded

Type: boolean Default: true

if set to True (default) probability of the overhangs is ignored

Type: integer Default: 3

sets a threshold to limit the size of cliques

Type: integer Default: 10000

indicates the maximum number of clique to be considered in the next iteration

Type: string Default: ""

additional parameters to be passed to haploclique.

Warning: this won’t overwrite the other options (e.g. clique_size_limi and max_num_cliques should still be set via their own respective properties, do not pass parameters --limit_clique_size= nor --max_cliques= via this extra property).

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: integer Default: 0

use to specify a region of interest

Type: integer Default: -1

use to specify a region of interest


Examples:

9719
+

Type: string Default: ""

Pass additional options to run lofreq call.

Type: object Default: {}

Type: integer Default: 1000

Type: integer Default: 60

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: integer Default: 5

Type: object Default: {}

Type: string Default: "{VPIPE_BASEDIR}/envs/sam2bam.yaml"

Type: object Default: {}

NOTE This rule only works on Linux.

Type: integer Default: 10000

Type: integer Default: 1435

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/savage.yaml"

Type: integer Default: 20

Size of the batches of reads to be processed by SAVAGE. It is recommended that 500 < coverage/split < 1000.

Type: object Default: {}

Type: integer Default: 10000

Type: integer Default: 1435

Type: string Default: "{VPIPE_BASEDIR}/envs/haploclique.yaml"

Type: boolean Default: true

If set to True (default), a predefined set of parameter values is used for drawing edges between reads in the read graph.

Type: boolean Default: true

Singletons are defined as proposed haplotypes which are supported by a single read. If this property is set to True, singletons are discarded.

Type: boolean Default: true

If set to True (default), the probability of the overhangs is ignored.

Type: integer Default: 3

Sets a threshold to limit the size of cliques.

Type: integer Default: 10000

Indicates the maximum number of cliques to be considered in the next iteration.

Type: string Default: ""

Additional parameters to be passed to haploclique.

Warning: this won’t overwrite the other options (e.g. clique_size_limit and max_num_cliques should still be set via their own respective properties; do not pass the parameters --limit_clique_size= nor --max_cliques= via this extra property).
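For haploclique, the properties above could be combined as in the following sketch (section name matching the haploclique application and environment; the values shown are the documented defaults):

```yaml
haploclique:
  relax: true               # predefined parameter set for drawing edges in the read graph
  no_singletons: true       # discard haplotypes supported by a single read
  no_prob0: true            # ignore overhang probabilities
  clique_size_limit: 3
  max_num_cliques: 10000
  extra_parameters: ""      # never pass --limit_clique_size= or --max_cliques= here
```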

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: integer Default: 0

Use to specify a region of interest.

Type: integer Default: -1

Use to specify a region of interest.


Examples:

9719
 ...
 
29836
 ...
-

Type: string Default: ""

when the ground truth is available (e.g., simulation studies), a multiple sequence alignment of types making up the population can be provided, and additional checks are performed

Type: object Default: {}

Type: integer Default: 10000

Type: integer Default: 1435

Type: integer

Type: integer Default: 0

Type: string Default: "{VPIPE_BASEDIR}/envs/predicthaplo.yaml"

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/visualization.yaml"

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/diversity_measures.yaml"
\ No newline at end of file
+

Type: string Default: ""

When the ground truth is available (e.g., simulation studies), a multiple sequence alignment of types making up the population can be provided, and additional checks are performed.

Type: object Default: {}

Type: integer Default: 10000

Type: integer Default: 1435

Type: integer

Type: integer Default: 0

Type: string Default: "{VPIPE_BASEDIR}/envs/predicthaplo.yaml"

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/visualization.yaml"

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/diversity_measures.yaml"

Type: object Default: {}

Type: string Default: "{VPIPE_BASEDIR}/envs/dehuman.yaml"

Type: integer Default: 4096

Type: integer Default: 235

Type: integer Default: 4

Type: string Default: "references/human.fa.gz"

Host’s genome used to remove reads (e.g. the human genome).

Note: if this file is absent, it is possible to fetch it from a remote server, see property ref_host_url below.


Example:

/cluster/project/igenomes/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/genome.fa
+...
+

Type: string Default: "http://ftp.ensembl.org/pub/release-105/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"

If the host’s genome specified in property ref_host isn’t present, fetch it from a remote server.

Note: remember to set aside enough memory for the indexing rule; see section ref_bwa_index, property mem.


Example:

https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.29_GRCh38.p14/GCA_000001405.29_GRCh38.p14_genomic.fna.gz
+...
+

Type: boolean Default: false

Indicate whether to store the host-aligned reads in a CRAM file …/alignments/host_aln.cram.


Example:

true
+...
+

Type: boolean Default: false

Use this option when generating dehumanized raw reads (dehuman.cram) for old samples that have already been processed in the past (a catch-up).

Normally, removing host-mapping reads requires analyzing the reads that were rejected by V-pipe’s main processing (as specified in section general, property aligner). But this output is considered temporary and gets deleted by Snakemake once the processing of a sample has finished. To generate dehuman.cram, V-pipe would need to run the aligner again, which would both regenerate the data necessary for this output and produce a new alignment, triggering the whole workflow again.
Use this property catchup to only generate the input necessary for dehuman.cram, leaving untouched the alignment and everything else that has already been processed.


Example:

true
+...
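A sketch of a dehuman section tying these options together; the ref_host and ref_host_url values are the documented defaults, shown here only for illustration:

```yaml
dehuman:
  ref_host: references/human.fa.gz  # local host genome; fetched from ref_host_url if absent
  ref_host_url: "http://ftp.ensembl.org/pub/release-105/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"
  catchup: true                     # only generate the input needed for dehuman.cram on already-processed samples
```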
+

Type: object Default: {}

Type: integer Default: 256

Type: integer Default: 60

Type: string Default: "{VPIPE_BASEDIR}/envs/upload.yaml"

Type: object Default: {}

This section is used to assist in and prepare uploads of the data, e.g. to the European Nucleotide Archive. By default it calls a script that creates symlinks, making it easy to identify new/updated samples between calls of V-pipe. But by using the properties script and options and adapting the environment provided in property conda, it is possible to heavily customize the actions (e.g. it is possible to upload to an SFTP server by calling sftp from a modified script). For inspiration, see the default script prepare_upload_symlinks.sh.

Type: integer Default: 256

Type: integer Default: 60

Type: string Default: "{VPIPE_BASEDIR}/envs/upload.yaml"

The default environment only provides hashing functions (xxhash, Linux coreutils’ sha{nnn}sum collection, etc.), but depending on your needs you may want to provide a custom environment with additional tools (e.g. sftp, rsync, curl, lftp, custom specialized cloud uploaders, etc.).

Type: integer Default: 1

Type: boolean Default: true

Don’t dispatch the rule to the cluster for execution; run it locally.


Example:

false
+...
+

Type: enum (of string) Default: "ambig"

When preparing data for upload, specifies which consensus sequence should be uploaded.

Must be one of:

  • "ambig"
  • "majority"

Example:

majority
+...
+

Type: boolean Default: false

Generate a checksum for each individual consensus sequence (if a consensus is regenerated, this helps determine whether the new file has changed content or is virtually identical to the previous one).


Example:

true
+...
+

Type: boolean Default: false

Also include the original .fastq.gz sequencing read files from raw_data/ in the list of files to be uploaded. See property orig_cram below for a compressed version, and see output dehumanized_raw_reads and section dehuman for depleting reads from the host.


Example:

true
+...
+

Type: boolean Default: false

Also include a compressed version of the original raw sequencing read files from raw_data/. Similar to property orig_fastq above, but with reference-based compression.


Example:

true
+...
+

Type: string Default: "{VPIPE_BASEDIR}/scripts/prepare_upload_symlinks.sh"

Custom script that assists and prepares uploads.

It will receive the following positional parameters:

  • : the output file that must be created by the script.
  • : a string (with no path separator slashes) that can be used as a name, uniquely identifying the sample and the date.
  • : the base directory of the sample.
  • …: a list of files to consider for upload

For an example, see the default script prepare_upload_symlinks.sh; it generates symlinks that help track which samples are new and/or updated between runs of V-pipe and thus should be considered for upload.

Type: string Default: ""

Named options to be passed to the script, before the positional parameters, e.g. to point to an extra configuration file with SFTP server information.
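Bringing the upload-related properties together, a hedged sketch (checksum, orig_cram, script and options are documented above; the sftp.config file name and the --config flag are purely hypothetical placeholders for your own script):

```yaml
upload:
  checksum: true        # per-consensus checksums to detect unchanged regenerated files
  orig_cram: true       # also include reference-compressed raw reads
  script: "{VPIPE_BASEDIR}/scripts/prepare_upload_symlinks.sh"  # default helper; replace with your own, e.g. an SFTP uploader
  options: "--config sftp.config"  # hypothetical named options passed before the positional parameters
```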

\ No newline at end of file diff --git a/workflow/schemas/config_schema.json b/workflow/schemas/config_schema.json index 4ddf01809..d82c4c3de 100644 --- a/workflow/schemas/config_schema.json +++ b/workflow/schemas/config_schema.json @@ -6,7 +6,7 @@ "examples": [{"input":{"samples_file":"samples.tsv"}}], "properties": { "general": { - "description": "This section of the configuration provides general options that control the overall behavior of the pipeline", + "description": "This section of the configuration provides general options that control the overall behavior of the pipeline.", "properties": { "virus_base_config": { "type": "string", @@ -18,7 +18,7 @@ "type": "string", "enum": ["ngshmmalign","bwa","bowtie"], "default": "ngshmmalign", - "description": "There are three options for mapping reads, either using [`ngshmmalign`](https://github.com/cbg-ethz/ngshmmalign), [BWA MEM (`bwa`)](https://github.com/lh3/bwa) [^1], or [Bowtie 2 (`bowtie`)](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) [^2]. To use a different aligner than the default, indicate which aligner you want to use by setting the property aligner.\n\n[^1]: Li, H. Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. 2013.\n[^2]: Langmead, B. and Salzberg, S. Fast gapped-read alignment with Bowtie 2. Nature Methods. 2012.\n\n**Note**: Some virus-specific base configuration specified in `virus_base_config` might change this option's default to a more appropriate aligner for that virus, e.g, depending on its usual diversity and mutation rate.\nYou are still free to override that default in your configuration shall the need arise.", + "description": "There are three options for mapping reads, either using [`ngshmmalign`](https://github.com/cbg-ethz/ngshmmalign), [BWA MEM (`bwa`)](https://github.com/lh3/bwa) [^1], or [Bowtie 2 (`bowtie`)](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) [^2]. To use a different aligner than the default, indicate which aligner you want to use by setting the property aligner.\n\n[^1]: Li, H. Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. 2013.\n[^2]: Langmead, B. and Salzberg, S. Fast gapped-read alignment with Bowtie 2. Nature Methods. 
2012.\n\n**Note**: Some virus-specific base configuration specified in `virus_base_config` might change this option's default to a more appropriate aligner for that virus, e.g., depending on its usual diversity and mutation rate.\nYou are still free to override that default in your configuration shall the need arise.", "examples": ["bowtie"] }, "snv_caller": { @@ -59,18 +59,18 @@ "type": "object" }, "input": { - "description": "Properties in this section of the configuration control the input of the pipeline", + "description": "Properties in this section of the configuration control the input of the pipeline.", "properties": { "datadir": { "type": "string", "default": "samples/", - "description": "The input file for the workflow will be searched in this directory.\n\nV-pipe expects the input samples to be organized in a two-level directory hierarchy.\n\n - The first level can be, e.g., patient samples or biological replicates of an experiment.\n - The second level can be, e.g., different sampling dates or different sequencing runs of the same sample.\n - Inside that directory, the sub-directory `raw_data` holds the sequencing data in FASTQ format (optionally compressed with GZip).\n\nFor example:\n```console\nsamples\n├── patient1\n│ ├── 20100113\n│ │ └──raw_data\n│ │ ├──patient1_20100113_R1.fastq\n│ │ └──patient1_20100113_R2.fastq\n│ └── 20110202\n│ └──raw_data\n│ ├──patient1_20100202_R1.fastq\n│ └──patient1_20100202_R2.fastq\n└── patient2\n └── 20081130\n └──raw_data\n ├──patient2_20081130_R1.fastq.gz\n └──patient2_20081130_R2.fastq.gz\n```\n\n", + "description": "The input file for the workflow will be searched in this directory.\n\nV-pipe expects the input samples to be organized in a two-level directory hierarchy.\n\n - The first level can be, e.g., patient samples or biological replicates of an experiment.\n - The second level can be, e.g., different sampling dates or different sequencing runs of the same sample.\n - Inside that directory, the sub-directory `raw_data` holds the sequencing data in FASTQ format (optionally compressed with GZip).\n\nFor example:\n```console\nsamples\n├── patient1\n│ ├── 20100113\n│ │ └──raw_data\n│ │ ├──patient1_20100113_R1.fastq\n│ │ └──patient1_20100113_R2.fastq\n│ └── 20110202\n│ └──raw_data\n│ ├──patient1_20100202_R1.fastq\n│ └──patient1_20100202_R2.fastq\n└── patient2\n └── 20081130\n └──raw_data\n ├──patient2_20081130_R1.fastq.gz\n └──patient2_20081130_R2.fastq.gz\n```", "examples": [ "tests/data/hiv/", "tests/data/sars-cov-2/" ] }, "paired": { "type": "boolean", "default": true, - "description": "Indicate whether the input sequencing reads correspond to paired-end reads.\n\nPaired-ended reads need to be in split files with `_R1` and `_R2` suffixes:\n```console\nraw_data\n├──patient2_20081130_R1.fastq.gz\n└──patient2_20081130_R2.fastq.gz\n```\n\n", + "description": "Indicate whether the input sequencing reads correspond to paired-end reads.\n\nPaired-ended reads need to be in split files with `_R1` and `_R2` suffixes:\n```console\nraw_data\n├──patient2_20081130_R1.fastq.gz\n└──patient2_20081130_R2.fastq.gz\n```", "examples": [ false ] }, "fastq_suffix": { @@ -82,7 +82,7 @@ "samples_file": { "type": "string", "default": "config/samples.tsv", - "description": "File containing sample unique identifiers and dates as tab-separated values, e.g.,\n```tsv\npatient1 20100113\npatient1 20110202\npatient2 20081130\n```\n\nHere, we have two samples from patient 1 and one sample from patient 2. 
By default, V-pipe searches for a file named samples.tsv, if this file does not exist, a list of samples is built by globbing datadir directory contents.\n\nOptionally, the samples file can contain a third column specifying the read length. This is particularly useful when samples are sequenced using protocols with different read lengths.\n\nStandardized Snakemake workflows place their tables inside the `config/` subdirectory, but using this options you can specify alternate locations, e.g., the current working directory (as done in legacy V-pipe v1.x/2.x)\n", + "description": "File containing sample unique identifiers and dates as tab-separated values, e.g.,\n```tsv\npatient1 20100113\npatient1 20110202\npatient2 20081130\n```\n\nHere, we have two samples from patient 1 and one sample from patient 2. By default, V-pipe searches for a file named samples.tsv, if this file does not exist, a list of samples is built by globbing datadir directory contents.\n\nOptionally, the samples file can contain a third column specifying the read length. This is particularly useful when samples are sequenced using protocols with different read lengths.\n\nStandardized Snakemake workflows place their tables inside the `config/` subdirectory, but using this options you can specify alternate locations, e.g., the current working directory (as done in legacy V-pipe v1.x/2.x).", "examples": [ "samples.tsv" ] }, "read_length": { @@ -132,7 +132,7 @@ "type": "object" }, "output": { - "description": "Properties in this section of the configuration control the output of the pipeline", + "description": "Properties in this section of the configuration control the output of the pipeline.", "properties": { "datadir": { "type": "string", @@ -143,37 +143,37 @@ "cohortdir": { "type": "string", "default": "", - "description": "In addition, V-pipe can optionally generate a few cohort-wide results, such as a current cohort consensus fasta file, or a TSV file containing the frequencies of all minor alleles that differ from the consensus among analyzed samples.\nBy default, these output files are located at the base of the [`output` `datadir`](#output_datadir), outside of the two-level per sample structure:\n```console\nresults\n├──minority_variants.tsv\n├──cohort_consensus.fasta\n├──patient1\n│ ├──20100113\n│ │ ├──alignments\n…\n```\n\nIf you prefer instead, e.g., such cohort-wide results behind written in a subdirectory of the working directory at the same level as the `datadir`s, you can use this options you can specify alternate subdirectory relative to the `datadir` property. (Use `..` prefix if you want instead your cohort-wide results to be in a directory at the sample level as `samples/` and `results/`. 
See the example below to recreate the `variants/` directory used by legacy V-pipe v1.x/2.x)", + "description": "In addition, V-pipe can optionally generate a few cohort-wide results, such as a current cohort consensus fasta file, or a TSV file containing the frequencies of all minor alleles that differ from the consensus among analyzed samples.\nBy default, these output files are located at the base of the [`output` `datadir`](#output_datadir), outside of the two-level per sample structure:\n```console\nresults\n├──minority_variants.tsv\n├──cohort_consensus.fasta\n├──patient1\n│ ├──20100113\n│ │ ├──alignments\n…\n```\n\nIf you prefer instead, e.g., such cohort-wide results behind written in a subdirectory of the working directory at the same level as the `datadir`s, you can use this options you can specify alternate subdirectory relative to the `datadir` property. (Use `..` prefix if you want instead your cohort-wide results to be in a directory at the sample level as `samples/` and `results/`. See the example below to recreate the `variants/` directory used by legacy V-pipe v1.x/2.x).", "examples": ["../variants"] }, "QA": { "type": "boolean", "default": false, - "description": "V-pipe can produce several outputs to assess the quality of the output of its steps, e.g., checking whether a sample's consensus sequence generated by bctfools does result in frameshifting indels and writing a report in sample's `…/references/frameshift_deletions_check.tsv`. Such reports can be useful when submitting sequences to GISAID.\n\nThis option turns on such QA features", + "description": "V-pipe can produce several outputs to assess the quality of the output of its steps, e.g., checking whether a sample's consensus sequence generated by bctfools does result in frameshifting indels and writing a report in sample's `…/references/frameshift_deletions_check.tsv`. 
Such reports can be useful when submitting sequences to GISAID.\n\nThis option turns on such QA features.", "examples": [true] }, "snv": { "type": "boolean", "default": false, - "description": "This option selects whether the SNV caller step should be executed and its output written to each sample's `…/variants/SNVs/snvs.csv`", + "description": "This option selects whether the SNV caller step should be executed and its output written to each sample's `…/variants/SNVs/snvs.csv`.", "examples": [true] }, "local": { "type": "boolean", "default": false, - "description": "This option activates local haplotype reconstruction (only available when using ShoRAH)", + "description": "This option activates local haplotype reconstruction (only available when using ShoRAH).", "examples": [true] }, "global": { "type": "boolean", "default": false, - "description": "This option turns on global haplotype reconstruction", + "description": "This option turns on global haplotype reconstruction.", "examples": [true] }, "visualization": { "type": "boolean", "default": false, - "description": "This option selects whether to generate HTML visualization of the SNVs in each sample's `…/visualization/index.html`", + "description": "This option selects whether to generate HTML visualization of the SNVs in each sample's `…/visualization/index.html`.", "examples": [true] }, "diversity": { @@ -199,7 +199,7 @@ "type": "object" }, "applications": { - "description": "The path to the different software packages can be specified using this section.\n\nIt is especially useful when dependencies are not obtained via conda such as VICUNA, and when the software packages are not in the `PATH`.\n\n**Note** we strongly recommend to use conda environments, by adding the `--use-conda` flag to the V-pipe execution command, e.g. `./vpipe --use-conda`. If you prefer to use your own installations, this section allows you to specify the location of the executables", + "description": "The path to the different software packages can be specified using this section.\n\nIt is especially useful when dependencies are not obtained via conda such as VICUNA, and when the software packages are not in the `PATH`.\n\n**Note** we strongly recommend to use conda environments, by adding the `--use-conda` flag to the V-pipe execution command, e.g. `./vpipe --use-conda`. If you prefer to use your own installations, this section allows you to specify the location of the executables.", "examples":[{"bwa":"/path/to/bwa","haploclique":"/path/to/haploclique"}], "properties": { "gunzip": { @@ -217,7 +217,7 @@ "vicuna": { "type": "string", "default": "vicuna", - "description":"Due to a special license, VICUNA is **not** available from bioconda and must be installed from [its original website](https://www.broadinstitute.org/viral-genomics/vicuna).\nUse this option to specify where you have installed its executable" + "description":"Due to a special license, VICUNA is **not** available from bioconda and must be installed from [its original website](https://www.broadinstitute.org/viral-genomics/vicuna).\nUse this option to specify where you have installed its executable." 
}, "indelfixer": { "type": "string", @@ -483,13 +483,13 @@ "leave_msa_temp": { "type": "boolean", "default": false, - "description": "this option is useful for debugging purposes", + "description": "This option is useful for debugging purposes.", "examples": [true] }, "extra": { "type": "string", "default": "", - "description": "pass additional options to run ngshmmalign\n\nV-pipe uses option `-R `, thus option `-r arg` is not allowed. Also, instead of passing `-l` via the property `extra`, set [`leave_msa_temp`](#hmm_align_leave_msa_temp) to `True`. Lastly, please do not modify options `-o arg`, `-w arg`, `-t arg`, and `-N arg`. These are already managed by V-pipe." + "description": "Pass additional options to run ngshmmalign\n\nV-pipe uses option `-R `, thus option `-r arg` is not allowed. Also, instead of passing `-l` via the property `extra`, set [`leave_msa_temp`](#hmm_align_leave_msa_temp) to `True`. Lastly, please do not modify options `-o arg`, `-w arg`, `-t arg`, and `-N arg`. These are already managed by V-pipe." } }, @@ -641,7 +641,7 @@ "extra": { "type": "string", "default": "", - "description": " With property `extra`, users can pass additional options to run BWA MEM. For more details on BWA MEM configurable options refer to the software [documentation](http://bio-bwa.sourceforge.net/bwa.shtml)." + "description": "With property `extra`, users can pass additional options to run BWA MEM. For more details on BWA MEM configurable options refer to the software [documentation](http://bio-bwa.sourceforge.net/bwa.shtml)." } }, "default": {}, @@ -686,13 +686,13 @@ "type": "string", "default": "--phred33", "enum": ["--phred33","--phred64"], - "description":"indicate if qualities are Phred+33 (default) or Phred+64 (`--phred64`)", + "description":"Indicate if qualities are Phred+33 (default) or Phred+64 (`--phred64`).", "examples": ["--phred64"] }, "preset": { "type": "string", "default": "--local --sensitive-local", - "description": "specify Bowtie 2 presets" + "description": "Specify Bowtie 2 presets." }, "maxins": { "type": "integer" @@ -700,7 +700,7 @@ "extra": { "type": "string", "default": "", - "description": "pass additional options to run Bowtie 2. V-pipe handles the input and output files, as well as the reference sequence. Thus, do not modify these options\nFor more details on Bowtie 2 configurable options refer to the software [documentation](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml)." + "description": "Pass additional options to run Bowtie 2. V-pipe handles the input and output files, as well as the reference sequence. Thus, do not modify these options\nFor more details on Bowtie 2 configurable options refer to the software [documentation](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml)." } }, "default": {}, @@ -723,22 +723,22 @@ "min_coverage": { "type": "integer", "default": 50, - "description": "minimum read depth for reporting variants per locus" + "description": "Minimum read depth for reporting variants per locus." }, "n_coverage": { "type": "integer", "default": 5, - "description": "read count below which ambiguous base 'n' is reported" + "description": "Read count below which ambiguous base 'n' is reported." }, "qual_thrd": { "type": "integer", "default": 15, - "description": "minimum phred quality score for a base to be included" + "description": "Minimum phred quality score for a base to be included." 
}, "min_freq": { "type": "number", "default": 0.05, - "description": "minimum frequency for an ambiguous nucleotide" + "description": "Minimum frequency for an ambiguous nucleotide." } }, "default": {}, @@ -885,13 +885,13 @@ "min_coverage": { "type": "integer", "default": 100, - "description": "minimum read depth for reporting variants per locus", + "description": "Minimum read depth for reporting variants per locus.", "examples": [50] }, "frequencies": { "type": "boolean", "default": false, - "description": "output a numpy array file containing frequencies of all bases, including gaps and also the most abundant base across samples", + "description": "Output a numpy array file containing frequencies of all bases, including gaps and also the most abundant base across samples.", "examples": [true] } }, @@ -918,19 +918,19 @@ "overlap": { "type": "boolean", "default": false, - "description": "construct intervals based on overlapping windows of the read alignment. By default, regions with high coverage are built based on the position-wise read depth", + "description": "Construct intervals based on overlapping windows of the read alignment. By default, regions with high coverage are built based on the position-wise read depth.", "examples": [true] }, "coverage": { "type": "integer", "default": 50, - "description": "minimum read depth. A region spanning the reference genome is returned if `coverage` is set to 0", + "description": "Minimum read depth. A region spanning the reference genome is returned if `coverage` is set to 0.", "examples": [ 0 ] }, "liberal": { "type": "boolean", "default": true, - "description": "indicate whether to apply a more liberal shifting on intervals' right-endpoint", + "description": "Indicate whether to apply a more liberal shifting on intervals' right-endpoint.", "examples": [false] } }, @@ -957,18 +957,18 @@ "consensus": { "type": "boolean", "default": true, - "description": "indicate whether to use the cohort-consensus sequence from the analyzed samples (output from `minor_variants` rule located in the cohort-wide output `results/cohort_onsensus.fasta`) or the reference sequence by setting this option to False", + "description": "Indicate whether to use the cohort-consensus sequence from the analyzed samples (output from `minor_variants` rule located in the cohort-wide output `results/cohort_onsensus.fasta`) or the reference sequence by setting this option to False.", "examples": [false] }, "alpha": { "type": "number", "default": 0.1, - "description": "hyperparameter used for instantiating a new cluster" + "description": "Hyperparameter used for instantiating a new cluster." }, "ignore_indels": { "type": "boolean", "default": false, - "description": "ignore SNVs adjacent to indels" + "description": "Ignore SNVs adjacent to indels." }, "posterior_threshold": { "type": "number", @@ -977,18 +977,18 @@ "coverage": { "type": "integer", "default": 0, - "description": "omit windows with coverage less than this value", + "description": "Omit windows with coverage less than this value.", "examples": [50] }, "shift": { "type": "integer", "default": 3, - "description": "ShoRAH performs local haplotype reconstruction on windows of the read alignment. The overlap between these windows is defined by the window shifts. By default, it is set to 3, i.e., apart from flanking regions each position is covered by 3 windows" + "description": "ShoRAH performs local haplotype reconstruction on windows of the read alignment. 
The overlap between these windows is defined by the window shifts. By default, it is set to 3, i.e., apart from flanking regions each position is covered by 3 windows." }, "keep_files": { "type": "boolean", "default": false, - "description": "indicate whether to move files produced in previous/interrupted runs to subdirectory named `old`", + "description": "Indicate whether to move files produced in previous/interrupted runs to subdirectory named `old`", "examples": [true] }, "localscratch": { @@ -1034,13 +1034,13 @@ "consensus": { "type": "boolean", "default": true, - "description": "indicate whether to use the cohort-consensus sequence from the analyzed samples (output from `minor_variants` rule located in the cohort-wide output `results/cohort_onsensus.fasta`) or the reference sequence by setting this option to False", + "description": "Indicate whether to use the cohort-consensus sequence from the analyzed samples (output from `minor_variants` rule located in the cohort-wide output `results/cohort_onsensus.fasta`) or the reference sequence by setting this option to False.", "examples": [false] }, "extra": { "type": "string", "default": "", - "description": "pass additional options to run `lofreq call`" + "description": "Pass additional options to run `lofreq call`" } }, "default": {}, @@ -1099,7 +1099,7 @@ "split": { "type": "integer", "default": 20, - "description": "size of the batches of reads to be processed by SAVAGE. It is recommended that 500 < coverage/`split` < 1000" + "description": "Size of the batches of reads to be processed by SAVAGE. It is recommended that 500 < coverage/`split` < 1000." } }, "default": {}, @@ -1122,32 +1122,32 @@ "relax": { "type": "boolean", "default": true, - "description": "if set to `True` (default) a predefined set of parameter values is used for drawing edges between reads in the read graph" + "description": "If set to `True` (default) a predefined set of parameter values is used for drawing edges between reads in the read graph." }, "no_singletons": { "type": "boolean", "default": true, - "description": "singletons are defined as proposed haplotypes which are supported by a single read. If this property is set to `True`, singletons are discarded" + "description": "Singletons are defined as proposed haplotypes which are supported by a single read. If this property is set to `True`, singletons are discarded." }, "no_prob0": { "type": "boolean", "default": true, - "description": "if set to `True` (default) probability of the overhangs is ignored" + "description": "If set to `True` (default) probability of the overhangs is ignored." }, "clique_size_limit": { "type": "integer", "default": 3, - "description": "sets a threshold to limit the size of cliques" + "description": "Sets a threshold to limit the size of cliques." }, "max_num_cliques": { "type": "integer", "default": 10000, - "description": "indicates the maximum number of clique to be considered in the next iteration" + "description": "Indicates the maximum number of clique to be considered in the next iteration." }, "extra_parameters": { "type": "string", "default": "", - "description": "additional parameters to be passed to haploclique.\n\nWarning: this won't overwrite the other options (e.g. `clique_size_limi` and `max_num_cliques` should still be set via their [own](#haploclique_clique_size_limit) [respective](#haploclique_max_num_cliques) properties, do not pass parameters `--limit_clique_size=` nor `--max_cliques=` via this `extra` property)." 
+ "description": "Additional parameters to be passed to haploclique.\n\nWarning: this won't overwrite the other options (e.g. `clique_size_limi` and `max_num_cliques` should still be set via their [own](#haploclique_clique_size_limit) [respective](#haploclique_max_num_cliques) properties, do not pass parameters `--limit_clique_size=` nor `--max_cliques=` via this `extra` property)." } }, "default": {}, @@ -1170,18 +1170,18 @@ "region_start": { "type": "integer", "default": 0, - "description": "use to specify a region of interest" + "description": "Use to specify a region of interest." }, "region_end": { "type": "integer", "default": -1, - "description": "use to specify a region of interest", + "description": "Use to specify a region of interest.", "examples": [9719,29836] }, "msa": { "type": "string", "default": "", - "description": "when the ground truth is available (e.g., simulation studies), a multiple sequence alignment of types making up the population can be provided, and additional checks are performed" + "description": "When the ground truth is available (e.g., simulation studies), a multiple sequence alignment of types making up the population can be provided, and additional checks are performed." } }, "default": {}, @@ -1343,13 +1343,13 @@ "type": "string", "default": "ambig", "enum": ["ambig", "majority"], - "description": "When preparing data for upload, specifies which consensus sequence should be uploaded", + "description": "When preparing data for upload, specifies which consensus sequence should be uploaded.", "examples": ["majority"] }, "checksum": { "type": "boolean", "default": false, - "description": "Generate checksum for each individual consensus sequence (if a consensus is regenerated, it will help determine whether the new file has changed content or is virtually the same as the previous)", + "description": "Generate checksum for each individual consensus sequence (if a consensus is regenerated, it will help determine whether the new file has changed content or is virtually the same as the previous).", "examples": [true] }, "orig_fastq": {