From 416efebc0ff4380323a8c2f0ac6d890082ae3b72 Mon Sep 17 00:00:00 2001 From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com> Date: Mon, 5 Feb 2024 17:03:40 +0100 Subject: [PATCH 01/26] adding viloca to the workflow --- workflow/envs/viloca.yaml | 11 ++++++ workflow/rules/snv.smk | 56 +++++++++++++++++++++++++++- workflow/schemas/config_schema.json | 57 +++++++++++++++++++++++++++-- 3 files changed, 119 insertions(+), 5 deletions(-) create mode 100644 workflow/envs/viloca.yaml diff --git a/workflow/envs/viloca.yaml b/workflow/envs/viloca.yaml new file mode 100644 index 00000000..51aacd4c --- /dev/null +++ b/workflow/envs/viloca.yaml @@ -0,0 +1,11 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - libshorah + - python=3.10.4 + - pip + - pip: + - pandas + - git+https://github.com/cbg-ethz/VILOCA@master diff --git a/workflow/rules/snv.smk b/workflow/rules/snv.smk index 0389569c..f78a26de 100644 --- a/workflow/rules/snv.smk +++ b/workflow/rules/snv.smk @@ -286,11 +286,63 @@ rule lofreq: {params.LOFREQ} call {params.EXTRA} --call-indels -f {input.REF} -o {output.SNVs} --verbose {output.BAM} >> {log.outfile} 2> >(tee -a {log.errfile} >&2) """ +rule viloca: + input: + REF=( + cohortdir("cohort_consensus.fasta") + if config.lofreq["consensus"] + else reference_file + ), + BAM=alignment_wildcard, + output: + SNVs="{dataset}/variants/SNVs/snvs.vcf", + CSV="{dataset}/variants/SNVs/snv/cooccurring_mutations.csv", + params: + READ_LEN=read_len, + INSERT_FILE=config.viloca["insert_bedfile"], + MODE=config.viloca["mode"], + SHIFT=config.viloca["shift"], + OUTDIR="{dataset}/variants/SNVs", + EXTRA=config.viloca["extra"], + VILOCA=config.applications["viloca"], + log: + outfile="{dataset}/variants/SNVs/viloca.out.log", + errfile="{dataset}/variants/SNVs/viloca.err.log", + conda: + config.viloca["conda"] + benchmark: + "{dataset}/variants/SNVs/viloca.benchmark" + threads: config.viloca["threads"] + resources: + disk_mb=2000, + 
mem_mb=config.viloca["mem"], + runtime=config.viloca["time"], + shell: + """ + let "WINDOW_SHIFTS=({params.READ_LEN} * 4/5 + {params.SHIFT}) / {params.SHIFT}" + let "WINDOW_LEN=WINDOW_SHIFTS * {params.SHIFT}" + + # Run VILOCA + echo "Running VILOCA" >> {log.outfile} + if [[ "{params.INSERT_FILE}" == "None" ]]; then + {params.VILOCA} {params.EXTRA} -t {threads} --mode {params.MODE} -w ${{WINDOW_LEN}} -s {params.SHIFT} -f {input.REF} -b {input.BAM} >> {log.outfile} 2> >(tee -a {log.errfile} >&2) + else + {params.VILOCA} {params.EXTRA} -t {threads} --mode {params.MODE} -z {params.INSERT_FILE} -f {input.REF} -b {input.BAM} >> {log.outfile} 2> >(tee -a {log.errfile} >&2) + fi + + # rename viloca output snv/SNVs_0.010000_final.vcf --> snvs.vcf + cp "${params.OUTDIR}/snv/SNVs_0.010000_final.vcf" {output.SNVs} + """ + if config.general["snv_caller"] == "shorah": - ruleorder: snv > lofreq + ruleorder: snv > lofreq > viloca elif config.general["snv_caller"] == "lofreq": - ruleorder: lofreq > snv + ruleorder: lofreq > snv > viloca + +elif config.general["snv_caller"] == "viloca": + + ruleorder: viloca > lofreq > snv diff --git a/workflow/schemas/config_schema.json b/workflow/schemas/config_schema.json index 648e5ef6..a0bb2c6d 100644 --- a/workflow/schemas/config_schema.json +++ b/workflow/schemas/config_schema.json @@ -38,8 +38,8 @@ "snv_caller": { "type": "string", "default": "shorah", - "enum": ["shorah","lofreq"], - "description": "There are two options available for calling single nucleotide variants, either using [ShoRAH (`shorah`)](https://github.com/cbg-ethz/shorah) [^5] or [LoFreq (`lofreq`)](https://csb5.github.io/lofreq/) [^6]. ShoRAH is used by default. If you prefer to use LoFreq, then indicate so in the configuration file as in the example\n\n[^5]: Zagordi, O. et al. ShoRAH: estimating the genetic diversity of a mixed sample from next-generation sequencing data. BMC Bioinformatics. 2011.\n[^6]: Wilm, A. et al. 
LoFreq: A sequence-quality aware, ultra-sensitive variant caller for uncovering cell-population heterogeneity from high-throughput sequencing datasets. Nucleic Acids Res. 2012.", + "enum": ["shorah","lofreq", "viloca"], + "description": "There are three options available for calling single nucleotide variants, either using [ShoRAH (`shorah`)](https://github.com/cbg-ethz/shorah) [^5], [LoFreq (`lofreq`)](https://csb5.github.io/lofreq/) [^6] or [VILOCA (`viloca`)](https://github.com/cbg-ethz/viloca) [^7] . ShoRAH is used by default. If you prefer to use LoFreq, then indicate so in the configuration file as in the example\n\n[^5]: Zagordi, O. et al. ShoRAH: estimating the genetic diversity of a mixed sample from next-generation sequencing data. BMC Bioinformatics. 2011.\n[^6]: Wilm, A. et al. LoFreq: A sequence-quality aware, ultra-sensitive variant caller for uncovering cell-population heterogeneity from high-throughput sequencing datasets. Nucleic Acids Res. 2012.", "examples": ["lofreq"] }, "haplotype_reconstruction": { @@ -228,7 +228,7 @@ "local": { "type": "boolean", "default": false, - "description": "This option activates local haplotype reconstruction (only available when using ShoRAH).", + "description": "This option activates local haplotype reconstruction (only available when using ShoRAH or VILOCA).", "examples": [true] }, "global": { @@ -386,6 +386,10 @@ "type": "string", "default": "shorah shotgun" }, + "viloca": { + "type": "string", + "default": "viloca run" + }, "lofreq": { "type": "string", "default": "lofreq" @@ -1185,6 +1189,53 @@ "default": {}, "type": "object" }, + "viloca": { + "properties": { + "mem": { + "type": "integer", + "default": 10000 + }, + "threads": { + "type": "integer" + }, + "time": { + "type": "integer", + "default": 2880 + }, + "conda": { + "type": "string", + "default": "{VPIPE_BASEDIR}/envs/viloca.yaml" + }, + "consensus": { + "type": "boolean", + "default": false, + "description": "Indicate whether to use the cohort-consensus 
sequence from the analyzed samples (output from `minor_variants` rule located in the cohort-wide output `results/cohort_onsensus.fasta`) or the reference sequence by setting this option to False.", + "examples": [false] + }, + "shift": { + "type": "integer", + "default": 3, + "description": "VILOCA performs local haplotype reconstruction on windows of the read alignment. The overlap between these windows is defined by the window shifts. By default, it is set to 3, i.e., apart from flanking regions each position is covered by 3 windows." + }, + "insert_bedfile": { + "type": "string", + "default": "None", + "description": "VILOCA performs local haplotype reconstruction on windows of the read alignment. In a first step the alignment is tiled into local regions. This can be done uniformly then set this value None, otherwise path to an (optional) insert file (primer tiling strategy)" + }, + "mode": { + "type": "string", + "default": "use_quality_scores", + "description": "Mode in which to run VILOCA: shorah, learn_error_params, use_quality_scores. 
If quality scores are available, we recommend this option" + }, + "extra": { + "type": "string", + "default": "", + "description": "Pass additional options to run `lofreq call`" + } + }, + "default": {}, + "type": "object" + }, "lofreq": { "properties": { "mem": { From 13aee8eaff211ef393c60c79b8ed48c51d80e974 Mon Sep 17 00:00:00 2001 From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com> Date: Tue, 6 Feb 2024 16:46:49 +0100 Subject: [PATCH 02/26] fix file paths --- workflow/rules/snv.smk | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/workflow/rules/snv.smk b/workflow/rules/snv.smk index f78a26de..d732a7b9 100644 --- a/workflow/rules/snv.smk +++ b/workflow/rules/snv.smk @@ -290,7 +290,7 @@ rule viloca: input: REF=( cohortdir("cohort_consensus.fasta") - if config.lofreq["consensus"] + if config.viloca["consensus"] else reference_file ), BAM=alignment_wildcard, @@ -305,6 +305,7 @@ rule viloca: OUTDIR="{dataset}/variants/SNVs", EXTRA=config.viloca["extra"], VILOCA=config.applications["viloca"], + WORK_DIR="{dataset}/variants/SNVs", log: outfile="{dataset}/variants/SNVs/viloca.out.log", errfile="{dataset}/variants/SNVs/viloca.err.log", @@ -321,17 +322,38 @@ rule viloca: """ let "WINDOW_SHIFTS=({params.READ_LEN} * 4/5 + {params.SHIFT}) / {params.SHIFT}" let "WINDOW_LEN=WINDOW_SHIFTS * {params.SHIFT}" + echo "Windows are shifted by: ${{WINDOW_SHIFTS}} bp" > {log.outfile} + echo "The window length is: ${{WINDOW_LEN}} bp" >> {log.outfile} + + # Get absolute path for input files + CWD=${{PWD}} + BAM=${{PWD}}/{input.BAM} + REF={input.REF}; [[ ${{REF}} =~ ^/ ]] || REF=${{PWD}}/${{REF}} + OUTFILE=${{PWD}}/{log.outfile} + ERRFILE=${{PWD}}/{log.errfile} + WORK_DIR=${{PWD}}/{params.WORK_DIR} + + # Create directory for running VILOCA + DIR=${{WORK_DIR}} + if [[ ! 
-d "${{DIR}}" ]]; then + echo "Creating directory ${{DIR}}" >> $OUTFILE + mkdir -p ${{DIR}} + fi + # Change to the directory where VILOCA is to be executed + cd ${{DIR}} # Run VILOCA - echo "Running VILOCA" >> {log.outfile} + echo "Running VILOCA" >> $OUTFILE if [[ "{params.INSERT_FILE}" == "None" ]]; then - {params.VILOCA} {params.EXTRA} -t {threads} --mode {params.MODE} -w ${{WINDOW_LEN}} -s {params.SHIFT} -f {input.REF} -b {input.BAM} >> {log.outfile} 2> >(tee -a {log.errfile} >&2) + {params.VILOCA} {params.EXTRA} -t {threads} --mode {params.MODE} -w ${{WINDOW_LEN}} -s {params.SHIFT} -b ${{BAM}} -f ${{REF}} >> $OUTFILE 2> >(tee -a $ERRFILE >&2) else - {params.VILOCA} {params.EXTRA} -t {threads} --mode {params.MODE} -z {params.INSERT_FILE} -f {input.REF} -b {input.BAM} >> {log.outfile} 2> >(tee -a {log.errfile} >&2) + INSERTFILE=${{CWD}}/{params.INSERT_FILE} + echo "Insert file used ${{CWD}}/{params.INSERT_FILE}" >> $OUTFILE + {params.VILOCA} {params.EXTRA} -t {threads} --mode {params.MODE} -z ${{INSERTFILE}} -b ${{BAM}} -f ${{REF}} >> $OUTFILE 2> >(tee -a $ERRFILE >&2) fi # rename viloca output snv/SNVs_0.010000_final.vcf --> snvs.vcf - cp "${params.OUTDIR}/snv/SNVs_0.010000_final.vcf" {output.SNVs} + cp "${{WORK_DIR}}/snv/SNVs_0.010000_final.vcf" "${{WORK_DIR}}/snvs.vcf" """ From 7ddb32031b64c4d39f9df13a7a465b83051b30e5 Mon Sep 17 00:00:00 2001 From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com> Date: Tue, 5 Mar 2024 13:17:26 +0100 Subject: [PATCH 03/26] update conig_schema --- workflow/envs/viloca.yaml | 1 - workflow/schemas/config_schema.json | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/envs/viloca.yaml b/workflow/envs/viloca.yaml index 51aacd4c..26434d02 100644 --- a/workflow/envs/viloca.yaml +++ b/workflow/envs/viloca.yaml @@ -7,5 +7,4 @@ dependencies: - python=3.10.4 - pip - pip: - - pandas - git+https://github.com/cbg-ethz/VILOCA@master diff --git a/workflow/schemas/config_schema.json 
b/workflow/schemas/config_schema.json index a0bb2c6d..8db21ae7 100644 --- a/workflow/schemas/config_schema.json +++ b/workflow/schemas/config_schema.json @@ -1225,12 +1225,13 @@ "mode": { "type": "string", "default": "use_quality_scores", + "enum": ["shorah","learn_error_params","use_quality_scores"], "description": "Mode in which to run VILOCA: shorah, learn_error_params, use_quality_scores. If quality scores are available, we recommend this option" }, "extra": { "type": "string", "default": "", - "description": "Pass additional options to run `lofreq call`" + "description": "Pass additional options to run `viloca`" } }, "default": {}, From 17f779c5bf028c24a3c34551caed71d568ea0c1f Mon Sep 17 00:00:00 2001 From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com> Date: Tue, 4 Jun 2024 18:34:47 +0200 Subject: [PATCH 04/26] adding paired_end_read_merger --- workflow/rules/snv.smk | 57 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/workflow/rules/snv.smk b/workflow/rules/snv.smk index d732a7b9..f9617383 100644 --- a/workflow/rules/snv.smk +++ b/workflow/rules/snv.smk @@ -286,6 +286,55 @@ rule lofreq: {params.LOFREQ} call {params.EXTRA} --call-indels -f {input.REF} -o {output.SNVs} --verbose {output.BAM} >> {log.outfile} 2> >(tee -a {log.errfile} >&2) """ +rule prep_paired_end_read_merger: + conda: + config.sam2bam["conda"] + input: + fname_bam=alignment_wildcard, + fname_ref=( + cohortdir("cohort_consensus.fasta") + if config.viloca["consensus"] + else reference_file + ), + params: + SAMTOOLS=config.applications["samtools"], + output: + fname_sam=temp(f"results/{{sample}}/alignment/REF_aln.sam"), + fname_sam_sort=temp(f"results/{{sample}}/alignment/REF_aln.sort.sam"), + shell: + """ + ## Preparation + fname_reference_idx = "${input.fname_reference_idx}.fai" + {params.SAMTOOLS} view -h -T {input.fname_reference} -t ${fname_reference_idx} {input.fname_bam} > {output.fname_sam} + ## sort accrording to 
QNAME + {params.SAMTOOLS} sort -T tmp -O sam -n {output.fname_sam} > {output.fname_sam_sort} + """ + +rule paired_end_read_merger: + input: + fname_sam_sort=rules.prep_paired_end_read_merger.output.fname_sam_sort, + fname_ref=( + cohortdir("cohort_consensus.fasta") + if config.viloca["consensus"] + else reference_file + ), + output: + fname_sam_merged=f"results/{{sample}}/alignment/REF_aln.merged.sam", + fname_sam_nonmerged=f"results/{{sample}}/alignment/REF_aln.nonmerged.sam", + params: + PAIRED_END_READ_MERGER=config.applications["paired_end_read_merger"], + log: + outfile="{dataset}/alignment/paired_end_read_merger.out.log", + errfile="{dataset}/alignment/paired_end_read_merger.err.log", + conda: + config.paired_end_read_merger["conda"] + shell: + """ + ## run script + {params.PAIRED_END_READ_MERGER} {input.fname_sam_sort} {output.fname_sam_merged} {output.fname_sam_nonmerged} {input.fname_ref} + """ + + rule viloca: input: REF=( @@ -293,7 +342,13 @@ rule viloca: if config.viloca["consensus"] else reference_file ), - BAM=alignment_wildcard, + BAM=( + rules.paired_end_read_merger.output.fname_sam_merged, + if config.viloca["merge_paired_end_reads"] + else alignment_wildcard + ), + + output: SNVs="{dataset}/variants/SNVs/snvs.vcf", CSV="{dataset}/variants/SNVs/snv/cooccurring_mutations.csv", From 15d75b3fc071e17c7daea5dd5501a90bb5c0bcf8 Mon Sep 17 00:00:00 2001 From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com> Date: Wed, 5 Jun 2024 14:26:52 +0200 Subject: [PATCH 05/26] update rule for paired_en_read_merger + add conda env definition --- workflow/envs/paired_end_read_merger.yaml | 8 +++++ workflow/rules/snv.smk | 37 +++++++---------------- 2 files changed, 19 insertions(+), 26 deletions(-) create mode 100644 workflow/envs/paired_end_read_merger.yaml diff --git a/workflow/envs/paired_end_read_merger.yaml b/workflow/envs/paired_end_read_merger.yaml new file mode 100644 index 00000000..115a61d4 --- /dev/null +++ 
b/workflow/envs/paired_end_read_merger.yaml @@ -0,0 +1,8 @@ +channels: + - conda-forge + - bioconda +dependencies: + - samtools=1.10 + - pip + - pip: + - git+https://github.com/cbg-ethz/smallgenomeutilities@master diff --git a/workflow/rules/snv.smk b/workflow/rules/snv.smk index f9617383..68a4e838 100644 --- a/workflow/rules/snv.smk +++ b/workflow/rules/snv.smk @@ -286,33 +286,10 @@ rule lofreq: {params.LOFREQ} call {params.EXTRA} --call-indels -f {input.REF} -o {output.SNVs} --verbose {output.BAM} >> {log.outfile} 2> >(tee -a {log.errfile} >&2) """ -rule prep_paired_end_read_merger: - conda: - config.sam2bam["conda"] - input: - fname_bam=alignment_wildcard, - fname_ref=( - cohortdir("cohort_consensus.fasta") - if config.viloca["consensus"] - else reference_file - ), - params: - SAMTOOLS=config.applications["samtools"], - output: - fname_sam=temp(f"results/{{sample}}/alignment/REF_aln.sam"), - fname_sam_sort=temp(f"results/{{sample}}/alignment/REF_aln.sort.sam"), - shell: - """ - ## Preparation - fname_reference_idx = "${input.fname_reference_idx}.fai" - {params.SAMTOOLS} view -h -T {input.fname_reference} -t ${fname_reference_idx} {input.fname_bam} > {output.fname_sam} - ## sort accrording to QNAME - {params.SAMTOOLS} sort -T tmp -O sam -n {output.fname_sam} > {output.fname_sam_sort} - """ rule paired_end_read_merger: input: - fname_sam_sort=rules.prep_paired_end_read_merger.output.fname_sam_sort, + fname_bam=alignment_wildcard, fname_ref=( cohortdir("cohort_consensus.fasta") if config.viloca["consensus"] @@ -320,8 +297,11 @@ rule paired_end_read_merger: ), output: fname_sam_merged=f"results/{{sample}}/alignment/REF_aln.merged.sam", - fname_sam_nonmerged=f"results/{{sample}}/alignment/REF_aln.nonmerged.sam", params: + fname_sam=temp(f"results/{{sample}}/alignment/REF_aln.sam"), + fname_sam_nonmerged=f"results/{{sample}}/alignment/REF_aln.nonmerged.sam", + fname_sam_sort=temp(f"results/{{sample}}/alignment/REF_aln.sort.sam"), + 
SAMTOOLS=config.applications["samtools"], PAIRED_END_READ_MERGER=config.applications["paired_end_read_merger"], log: outfile="{dataset}/alignment/paired_end_read_merger.out.log", @@ -330,8 +310,13 @@ rule paired_end_read_merger: config.paired_end_read_merger["conda"] shell: """ + ## Preparation + fname_reference_idx = "${input.fname_reference_idx}.fai" + {params.SAMTOOLS} view -h -T {input.fname_reference} -t ${fname_reference_idx} {input.fname_bam} > {params.fname_sam} + ## sort accrording to QNAME + {params.SAMTOOLS} sort -T tmp -O sam -n {params.fname_sam} > {params.fname_sam_sort} ## run script - {params.PAIRED_END_READ_MERGER} {input.fname_sam_sort} {output.fname_sam_merged} {output.fname_sam_nonmerged} {input.fname_ref} + {params.PAIRED_END_READ_MERGER} {input.fname_sam_sort} {output.fname_sam_merged} {params.fname_sam_nonmerged} {input.fname_ref} """ From c2c2ae7673c0b9c3e052181d709f3cfdbc4d8931 Mon Sep 17 00:00:00 2001 From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com> Date: Wed, 5 Jun 2024 14:29:05 +0200 Subject: [PATCH 06/26] update rule for paired_en_read_merger + add conda env definition --- workflow/rules/snv.smk | 2 -- 1 file changed, 2 deletions(-) diff --git a/workflow/rules/snv.smk b/workflow/rules/snv.smk index 68a4e838..5fd4bc73 100644 --- a/workflow/rules/snv.smk +++ b/workflow/rules/snv.smk @@ -332,8 +332,6 @@ rule viloca: if config.viloca["merge_paired_end_reads"] else alignment_wildcard ), - - output: SNVs="{dataset}/variants/SNVs/snvs.vcf", CSV="{dataset}/variants/SNVs/snv/cooccurring_mutations.csv", From 5174e9aec55a4a4aaaf27a6cf3f5386ea3aeed67 Mon Sep 17 00:00:00 2001 From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com> Date: Wed, 5 Jun 2024 15:51:49 +0200 Subject: [PATCH 07/26] update enviroment --- workflow/envs/paired_end_read_merger.yaml | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/workflow/envs/paired_end_read_merger.yaml b/workflow/envs/paired_end_read_merger.yaml 
index 115a61d4..2bccdb22 100644 --- a/workflow/envs/paired_end_read_merger.yaml +++ b/workflow/envs/paired_end_read_merger.yaml @@ -3,6 +3,19 @@ channels: - bioconda dependencies: - samtools=1.10 + - python >=3 + - progress + - numpy + - pandas + - pyyaml + - biopython + - bcbio-gff + - pysam + - pysamstats + - scikit-learn + - scipy + - matplotlib-base + - mafft - pip - pip: - - git+https://github.com/cbg-ethz/smallgenomeutilities@master + - git+https://github.com/cbg-ethz/smallgenomeutilities@dev From 3e79e5e4e3988544a6e6dcaa56a9a77efe18ab5d Mon Sep 17 00:00:00 2001 From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com> Date: Wed, 5 Jun 2024 16:09:55 +0200 Subject: [PATCH 08/26] add paired_end_read_merger --- workflow/schemas/config_schema.json | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/workflow/schemas/config_schema.json b/workflow/schemas/config_schema.json index 8db21ae7..cc25d855 100644 --- a/workflow/schemas/config_schema.json +++ b/workflow/schemas/config_schema.json @@ -338,6 +338,10 @@ "type": "string", "default": "frameshift_deletions_checks" }, + "paired_end_read_merger": { + "type": "string", + "default": "paired_end_read_merger" + }, "mafft": { "type": "string", "default": "mafft" @@ -993,6 +997,27 @@ "default": {}, "type": "object" }, + "paired_end_read_merger": { + "properties": { + "mem": { + "type": "integer", + "default": 1250 + }, + "time": { + "type": "integer", + "default": 30 + }, + "threads": { + "type": "integer" + }, + "conda": { + "type": "string", + "default": "{VPIPE_BASEDIR}/envs/paired_end_read_merger.yaml" + } + }, + "default": {}, + "type": "object" + }, "basecounts": { "properties": { "mem": { From 206e056f2e73d28b3f65cd15abbd69ab65c36d28 Mon Sep 17 00:00:00 2001 From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com> Date: Thu, 6 Jun 2024 14:28:51 +0200 Subject: [PATCH 09/26] update paired_end_read_merger rule --- workflow/envs/paired_end_read_merger.yaml | 1 + 
workflow/rules/snv.smk | 30 ++++++++++++++++------- workflow/schemas/config_schema.json | 6 +++++ 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/workflow/envs/paired_end_read_merger.yaml b/workflow/envs/paired_end_read_merger.yaml index 2bccdb22..8b655505 100644 --- a/workflow/envs/paired_end_read_merger.yaml +++ b/workflow/envs/paired_end_read_merger.yaml @@ -16,6 +16,7 @@ dependencies: - scipy - matplotlib-base - mafft + - git-lfs - pip - pip: - git+https://github.com/cbg-ethz/smallgenomeutilities@dev diff --git a/workflow/rules/snv.smk b/workflow/rules/snv.smk index 5fd4bc73..462e337f 100644 --- a/workflow/rules/snv.smk +++ b/workflow/rules/snv.smk @@ -295,14 +295,21 @@ rule paired_end_read_merger: if config.viloca["consensus"] else reference_file ), + fname_ref_idx=( + cohortdir("cohort_consensus.fasta") + if config.viloca["consensus"] + else reference_file + )+".fai", output: - fname_sam_merged=f"results/{{sample}}/alignment/REF_aln.merged.sam", + fname_bam_merged="{dataset}/alignment/REF_aln.merged.bam", + fname_sam_merged=temp_with_prefix("{dataset}/alignment/REF_aln.merged.sam"), + fname_sam=temp_with_prefix("{dataset}/alignment/REF_aln.sam"), + fname_sam_nonmerged="{dataset}/alignment/REF_aln.nonmerged.sam", + fname_sam_sort=temp_with_prefix("{dataset}/alignment/REF_aln.sort.sam"), params: - fname_sam=temp(f"results/{{sample}}/alignment/REF_aln.sam"), - fname_sam_nonmerged=f"results/{{sample}}/alignment/REF_aln.nonmerged.sam", - fname_sam_sort=temp(f"results/{{sample}}/alignment/REF_aln.sort.sam"), SAMTOOLS=config.applications["samtools"], PAIRED_END_READ_MERGER=config.applications["paired_end_read_merger"], + sort_tmp=temp_prefix("{dataset}.tmp"), log: outfile="{dataset}/alignment/paired_end_read_merger.out.log", errfile="{dataset}/alignment/paired_end_read_merger.err.log", @@ -311,12 +318,17 @@ rule paired_end_read_merger: shell: """ ## Preparation - fname_reference_idx = "${input.fname_reference_idx}.fai" - {params.SAMTOOLS} view -h -T 
{input.fname_reference} -t ${fname_reference_idx} {input.fname_bam} > {params.fname_sam} + {params.SAMTOOLS} view -h -T {input.fname_ref} -t {input.fname_ref_idx} {input.fname_bam} -o {output.fname_sam} > {log.outfile} 2> >(tee {log.errfile} >&2) ## sort accrording to QNAME - {params.SAMTOOLS} sort -T tmp -O sam -n {params.fname_sam} > {params.fname_sam_sort} + rm -f '{params.sort_tmp}'.[0-9]*.bam + {params.SAMTOOLS} sort -T "{params.sort_tmp}" -O sam -n {output.fname_sam} -o {output.fname_sam_sort} >> {log.outfile} 2> >(tee -a {log.errfile} >&2) ## run script - {params.PAIRED_END_READ_MERGER} {input.fname_sam_sort} {output.fname_sam_merged} {params.fname_sam_nonmerged} {input.fname_ref} + {params.PAIRED_END_READ_MERGER} {output.fname_sam_sort} {output.fname_sam_merged} {output.fname_sam_nonmerged} {input.fname_ref} >> {log.outfile} 2> >(tee -a {log.errfile} >&2) + touch {output.fname_sam_nonmerged} + ## sort + rm -f '{params.sort_tmp}'.[0-9]*.bam + {params.SAMTOOLS} sort -T "{params.sort_tmp}" -o "{output.fname_bam_merged}" "{output.fname_sam_merged}" >> {log.outfile} 2> >(tee -a {log.errfile} >&2) + {params.SAMTOOLS} index "{output.fname_bam_merged}" >> {log.outfile} 2> >(tee -a {log.errfile} >&2) """ @@ -328,7 +340,7 @@ rule viloca: else reference_file ), BAM=( - rules.paired_end_read_merger.output.fname_sam_merged, + rules.paired_end_read_merger.output.fname_bam_merged if config.viloca["merge_paired_end_reads"] else alignment_wildcard ), diff --git a/workflow/schemas/config_schema.json b/workflow/schemas/config_schema.json index cc25d855..797c9a3b 100644 --- a/workflow/schemas/config_schema.json +++ b/workflow/schemas/config_schema.json @@ -1237,6 +1237,12 @@ "description": "Indicate whether to use the cohort-consensus sequence from the analyzed samples (output from `minor_variants` rule located in the cohort-wide output `results/cohort_onsensus.fasta`) or the reference sequence by setting this option to False.", "examples": [false] }, + 
"merge_paired_end_reads": { + "type": "boolean", + "default": false, + "description": "Merge paired-end reads in the preprocessing. This is a preprocessing snakemake rule.", + "examples": [false] + }, "shift": { "type": "integer", "default": 3, From 4ccd9d896059b96c9537a82f1c6e9d3949ee0303 Mon Sep 17 00:00:00 2001 From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com> Date: Thu, 6 Jun 2024 15:38:52 +0200 Subject: [PATCH 10/26] add conda package to env --- workflow/envs/viloca.yaml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/workflow/envs/viloca.yaml b/workflow/envs/viloca.yaml index 26434d02..1f556084 100644 --- a/workflow/envs/viloca.yaml +++ b/workflow/envs/viloca.yaml @@ -1,10 +1,7 @@ channels: - conda-forge - bioconda - - defaults dependencies: - libshorah - python=3.10.4 - - pip - - pip: - - git+https://github.com/cbg-ethz/VILOCA@master + - viloca From 2af73f9aaa0c7be767f12230278ac81f3393c12d Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Fri, 7 Jun 2024 17:35:30 +0200 Subject: [PATCH 11/26] Viloca touch ups - snakemake handles `directory()` - VILOCA 1.0.0 on bioconda - use GitLab for smallgenomeutilites due to GitHub LFS limitations --- workflow/envs/paired_end_read_merger.yaml | 2 +- workflow/envs/viloca.yaml | 3 ++- workflow/rules/snv.smk | 17 ++++++++--------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/workflow/envs/paired_end_read_merger.yaml b/workflow/envs/paired_end_read_merger.yaml index 8b655505..5535dbd2 100644 --- a/workflow/envs/paired_end_read_merger.yaml +++ b/workflow/envs/paired_end_read_merger.yaml @@ -19,4 +19,4 @@ dependencies: - git-lfs - pip - pip: - - git+https://github.com/cbg-ethz/smallgenomeutilities@dev + - git+https://git.bsse.ethz.ch/cbg/viruses/smallgenomeutilities@dev diff --git a/workflow/envs/viloca.yaml b/workflow/envs/viloca.yaml index 1f556084..18c92cd9 100644 --- a/workflow/envs/viloca.yaml +++ b/workflow/envs/viloca.yaml @@ -4,4 +4,5 @@ channels: 
dependencies: - libshorah - python=3.10.4 - - viloca + - viloca=1.0.0 + - coreutils # [not linux] diff --git a/workflow/rules/snv.smk b/workflow/rules/snv.smk index 462e337f..9572cad9 100644 --- a/workflow/rules/snv.smk +++ b/workflow/rules/snv.smk @@ -347,15 +347,14 @@ rule viloca: output: SNVs="{dataset}/variants/SNVs/snvs.vcf", CSV="{dataset}/variants/SNVs/snv/cooccurring_mutations.csv", + WORK_DIR=directory("{dataset}/variants/SNVs"), params: READ_LEN=read_len, INSERT_FILE=config.viloca["insert_bedfile"], MODE=config.viloca["mode"], SHIFT=config.viloca["shift"], - OUTDIR="{dataset}/variants/SNVs", EXTRA=config.viloca["extra"], VILOCA=config.applications["viloca"], - WORK_DIR="{dataset}/variants/SNVs", log: outfile="{dataset}/variants/SNVs/viloca.out.log", errfile="{dataset}/variants/SNVs/viloca.err.log", @@ -377,20 +376,20 @@ rule viloca: # Get absolute path for input files CWD=${{PWD}} - BAM=${{PWD}}/{input.BAM} - REF={input.REF}; [[ ${{REF}} =~ ^/ ]] || REF=${{PWD}}/${{REF}} - OUTFILE=${{PWD}}/{log.outfile} - ERRFILE=${{PWD}}/{log.errfile} - WORK_DIR=${{PWD}}/{params.WORK_DIR} + WORK_DIR="$(realpath -m {ouput.WORK_DIR})" + BAM="$(realpath {input.BAM})" + REF="$(realpath {input.REF})" + OUTFILE="$(realpath -m {log.outfile})" + ERRFILE="$(realpath -m {log.errfile})" # Create directory for running VILOCA - DIR=${{WORK_DIR}} + DIR="${{WORK_DIR}}" if [[ ! 
-d "${{DIR}}" ]]; then echo "Creating directory ${{DIR}}" >> $OUTFILE mkdir -p ${{DIR}} fi # Change to the directory where VILOCA is to be executed - cd ${{DIR}} + cd "${{DIR}}" # Run VILOCA echo "Running VILOCA" >> $OUTFILE From 59386c7762847e01d0cba1be2c47dc7ccc67d783 Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Fri, 7 Jun 2024 17:39:05 +0200 Subject: [PATCH 12/26] Linting --- workflow/rules/snv.smk | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflow/rules/snv.smk b/workflow/rules/snv.smk index 9572cad9..ca00df27 100644 --- a/workflow/rules/snv.smk +++ b/workflow/rules/snv.smk @@ -299,7 +299,8 @@ rule paired_end_read_merger: cohortdir("cohort_consensus.fasta") if config.viloca["consensus"] else reference_file - )+".fai", + ) + + ".fai", output: fname_bam_merged="{dataset}/alignment/REF_aln.merged.bam", fname_sam_merged=temp_with_prefix("{dataset}/alignment/REF_aln.merged.sam"), From c948d2e315da629719369edbaac8217edfe98269 Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Fri, 7 Jun 2024 17:42:02 +0200 Subject: [PATCH 13/26] Generate copnfig manual --- config/config.html | 230 +++++++++++++++++++++++---------------------- 1 file changed, 116 insertions(+), 114 deletions(-) diff --git a/config/config.html b/config/config.html index 10359a1e..d3b31209 100644 --- a/config/config.html +++ b/config/config.html @@ -1,22 +1,22 @@ - V-pipe configuration

V-pipe configuration

Type: object

The V-pipe workflow can be customized through the configuration file config.yaml or config.json or, for backward compatibility with the legacy INI-style format used in V-pipe v1.x/2.x, vpipe.config. This configuration file is a text file written using a basic structure composed of sections, properties and values. When using YAML or JSON format, use these languages' associative arrays/dictionaries in two levels for sections and properties. When using the older INI format, sections are expected in square brackets, and properties are followed by corresponding values.

Furthermore, it is possible to specify additional options on the command line using Snakemake’s --configfile to pass additional YAML/JSON configuration files, and/or using Snakemake’s --config to pass sections and properties in a YAML Flow style/JSON syntax.

The order of precedence is:
command line options (--config, --configfile) >> default configuration file (config/config.yaml or config.yaml) >> legacy configuration INI (vpipe.config) >> Virus-specific base config (virus_based_config) >> default values

Example: For instance, we suggest providing as input a tabular file specifying sample unique identifiers (e.g., patient identifiers), and dates for different sequencing runs related to the same patient. The name of this file (here, samples.tsv) can be provided by specifying the section as input and the property as samples_file, as follows in the example below.

In this document, we provide a comprehensive list of all user-configurable options stratified by sections.


Example:

input:
-  samples_file: samples.tsv
-

Type: object Default: {}

This section of the configuration provides general options that control the overall behavior of the pipeline.

Type: string Default: ""

We provide virus-specific base configuration files which contain handy defaults for, e.g., HIV and SARS-CoV-2. Check the git repository’s config subdirectory to learn about them.


Examples:

hiv
-
sars-cov-2
-

Type: enum (of string) Default: "prinseq"

By default trimming and clipping of reads is performed by PRINSEQ 1 – a versatile raw read processor for short-reads with many customization options, that we use mostly for Illumina short-read sequencing.
Some other sequencing platforms, e.g., Oxford Nanopore Technologies, are not compatible with this software and usually perform quality control during the fast5 basecalling and demultiplexing anyway, e.g., by Guppy. Use skip to avoid preprocessing such already quality-trimmed fastq files.


  1. Schmieder, R. and Edwards, R. Quality control and preprocessing of metagenomic datasets. Bioinformatics. 2011. 

Must be one of:

  • "prinseq"
  • "skip"
  • ""

Example:

skip
-

Type: enum (of string) Default: "ngshmmalign"

There are three options for mapping reads, either using ngshmmalign, BWA MEM (bwa) 1, Bowtie 2 (bowtie) 2, or minimap2 (minimap)3. To use a different aligner than the default, indicate which aligner you want to use by setting the property aligner.

Note: Some virus-specific base configuration specified in virus_base_config might change this option’s default to a more appropriate aligner for that virus, e.g., depending on its usual diversity and mutation rate.
You are still free to override that default in your configuration shall the need arise.


  1. Li, H. Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. 2013. 

  2. Langmead, B. and Salzberg, S. Fast gapped-read alignment with Bowtie 2. Nature Methods. 2012. 

  3. Li, H. Minimap2: pairwise alignment for nucleotide sequences. Bioinformatics. 2018. 

Must be one of:

  • "ngshmmalign"
  • "bwa"
  • "bowtie"
  • "minimap"

Example:

minimap
-

Type: enum (of string) Default: "ivar"

There are two options available for trimming primers, either using iVar trim (ivar) 1 or Samtools ampliconclip (samtools) 2. iVar trim is used by default. If you prefer to use Samtools ampliconclip, then indicate so in the configuration file as in the example


  1. Grubaugh, N. et al. An amplicon-based sequencing framework for accurately measuring intrahost virus diversity using PrimalSeq and iVar. Genome Biology. 2019. 

  2. Danecek P, et al. Twelve years of SAMtools and BCFtools. GigaScience. 2021 

Must be one of:

  • "ivar"
  • "samtools"

Example:

samtools
-

Type: enum (of string) Default: "shorah"

There are two options available for calling single nucleotide variants, either using ShoRAH (shorah) 1 or LoFreq (lofreq) 2. ShoRAH is used by default. If you prefer to use LoFreq, then indicate so in the configuration file as in the example


  1. Zagordi, O. et al. ShoRAH: estimating the genetic diversity of a mixed sample from next-generation sequencing data. BMC Bioinformatics. 2011. 

  2. Wilm, A. et al. LoFreq: A sequence-quality aware, ultra-sensitive variant caller for uncovering cell-population heterogeneity from high-throughput sequencing datasets. Nucleic Acids Res. 2012. 

Must be one of:

  • "shorah"
  • "lofreq"

Example:

lofreq
-

Type: enum (of string) Default: "savage"

There are three options available for haplotype reconstruction, namely SAVAGE 1, HaploClique 2 or PredictHaplo 3. SAVAGE is used by default. If you wish to use HaploClique, then indicate it in the configuration file as in the example.


  1. Baaijens, J. A. et al., De novo assembly of viral quasispecies using overlap graphs. Genome Res. 2017. 

  2. Töpfer, A. et al. Viral quasispecies assembly via maximal clique finding. PLOS Computational Biology. 2014. 

  3. Prabhakaran, S. et al. HIV haplotype inference using a propagating dirichlet process mixture model. IEEE/ACM transactions on computational biology and bioinformatics 11.1. 2013. 

Must be one of:

  • "savage"
  • "haploclique"
  • "predicthaplo"

Example:

haploclique
-

Type: integer Default: 1

This option should be used to specify the default number of threads for all multi-threaded rules. That is, unless the number of threads is specified for each rule, this value is set as default.

Value must be greater or equal to 1


Example:

4
-

Type: enum (of string) Default: "md5"

Sets the algorithm to be used when computing checksums for uploadable data.

Must be one of:

  • "md5"
  • "sha1"
  • "sha256"
  • "sha224"
  • "sha384"
  • "sha512"
  • "xxh64"
  • "xxh32"
  • "xxh128"

Example:

sha256
-

Type: string Default: ""

Some step of V-pipe produce temporary files such as, e.g., decompressed intermediate — i.e. files which aren’t kept long-term but are deleted after all steps that needed them have finished. By default, these files are written in the output data directory. This option, makes it is possible to write them in a different directory instead. Use this option to, e.g., leverage a faster cluster-local storage or avoid wasting backup space on a snapshotted storage. You might want to consult the documentation provided by your HPC.


Examples:

temp
-
/cluster/scratch
-

Type: enum (of integer) Default: 1

Specify whether TSV files like coverage and base counts should be 1-based (i.e.: the first base pair position is called 1) like standard practice used in biology and most text formats such as VCF and GFF, or should be 0-based (i.e.: the first base pair position is called 0) like in several Python tools such as pysam and the BED format.

By default V-pipe uses 1-based TSV file (position column starts with 1), but this option change the behaviour.

Must be one of:

  • 0
  • 1

Example:

0
+ V-pipe configuration 

V-pipe configuration

Type: object

The V-pipe workflow can be customized through the configuration file config.yaml or config.json or, for backward compatibility with the legacy INI-style format used in V-pipe v1.x/2.x, vpipe.config. This configuration file is a text file written using a basic structure composed of sections, properties and values. When using the YAML or JSON format, use these languages’ associative arrays/dictionaries in two levels for sections and properties. When using the older INI format, sections are expected in square brackets, and properties are followed by corresponding values.

Furthermore, it is possible to specify additional options on the command line using Snakemake’s --configfile to pass additional YAML/JSON configuration files, and/or using Snakemake’s --config to pass sections and properties in a YAML Flow style/JSON syntax.

The order of precedence is:
command line options (--config, --configfile) >> default configuration file (config/config.yaml or config.yaml) >> legacy configuration INI (vpipe.config) >> Virus-specific base config (virus_base_config) >> default values

Example: For instance, we suggest providing as input a tabular file specifying sample unique identifiers (e.g., patient identifiers), and dates for different sequencing runs related to the same patient. The name of this file (here, samples.tsv) can be provided by specifying the section as input and the property as samples_file, as follows in the example below.

In this document, we provide a comprehensive list of all user-configurable options stratified by sections.


Example:

input:
+  samples_file: samples.tsv
+

Type: object Default: {}

This section of the configuration provides general options that control the overall behavior of the pipeline.

Type: string Default: ""

We provide virus-specific base configuration files which contain handy defaults for, e.g., HIV and SARS-CoV-2. Check the git repository’s config subdirectory to learn about them.


Examples:

hiv
+
sars-cov-2
+

Type: enum (of string) Default: "prinseq"

By default trimming and clipping of reads is performed by PRINSEQ 1 – a versatile raw read processor for short-reads with many customization options, that we use mostly for Illumina short-read sequencing.
Some other sequencing platforms, e.g., Oxford Nanopore Technologies, are not compatible with this software and usually perform quality control during the fast5 basecalling and demultiplexing anyway, e.g., by Guppy. Use skip to avoid performing preprocessing on such already quality-trimmed fastq files.


  1. Schmieder, R. and Edwards, R. Quality control and preprocessing of metagenomic datasets. Bioinformatics. 2011. 

Must be one of:

  • "prinseq"
  • "skip"
  • ""

Example:

skip
+

Type: enum (of string) Default: "ngshmmalign"

There are four options for mapping reads, either using ngshmmalign, BWA MEM (bwa) 1, Bowtie 2 (bowtie) 2, or minimap2 (minimap) 3. To use a different aligner than the default, indicate which aligner you want to use by setting the property aligner.

Note: Some virus-specific base configuration specified in virus_base_config might change this option’s default to a more appropriate aligner for that virus, e.g., depending on its usual diversity and mutation rate.
You are still free to override that default in your configuration shall the need arise.


  1. Li, H. Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. 2013. 

  2. Langmead, B. and Salzberg, S. Fast gapped-read alignment with Bowtie 2. Nature Methods. 2012. 

  3. Li, H. Minimap2: pairwise alignment for nucleotide sequences. Bioinformatics. 2018. 

Must be one of:

  • "ngshmmalign"
  • "bwa"
  • "bowtie"
  • "minimap"

Example:

minimap
+

Type: enum (of string) Default: "ivar"

There are two options available for trimming primers, either using iVar trim (ivar) 1 or Samtools ampliconclip (samtools) 2. iVar trim is used by default. If you prefer to use Samtools ampliconclip, then indicate so in the configuration file as in the example


  1. Grubaugh, N. et al. An amplicon-based sequencing framework for accurately measuring intrahost virus diversity using PrimalSeq and iVar. Genome Biology. 2019. 

  2. Danecek P, et al. Twelve years of SAMtools and BCFtools. GigaScience. 2021 

Must be one of:

  • "ivar"
  • "samtools"

Example:

samtools
+

Type: enum (of string) Default: "shorah"

There are three options available for calling single nucleotide variants, either using ShoRAH (shorah) 1, LoFreq (lofreq) 2 or VILOCA (viloca) 3. ShoRAH is used by default. If you prefer to use LoFreq or VILOCA, then indicate so in the configuration file as in the example


  1. Zagordi, O. et al. ShoRAH: estimating the genetic diversity of a mixed sample from next-generation sequencing data. BMC Bioinformatics. 2011. 

  2. Wilm, A. et al. LoFreq: A sequence-quality aware, ultra-sensitive variant caller for uncovering cell-population heterogeneity from high-throughput sequencing datasets. Nucleic Acids Res. 2012. 

  3. Fuhrmann, L. et al. VILOCA: sequencing quality-aware viral haplotype reconstruction and mutation calling for short-read and long-read data. NAR Genomics and Bioinformatics. 2024. 

Must be one of:

  • "shorah"
  • "lofreq"
  • "viloca"

Example:

lofreq
+

Type: enum (of string) Default: "savage"

There are three options available for haplotype reconstruction, namely SAVAGE 1, HaploClique 2 or PredictHaplo 3. SAVAGE is used by default. If you wish to use HaploClique, then indicate it in the configuration file as in the example.


  1. Baaijens, J. A. et al., De novo assembly of viral quasispecies using overlap graphs. Genome Res. 2017. 

  2. Töpfer, A. et al. Viral quasispecies assembly via maximal clique finding. PLOS Computational Biology. 2014. 

  3. Prabhakaran, S. et al. HIV haplotype inference using a propagating dirichlet process mixture model. IEEE/ACM transactions on computational biology and bioinformatics 11.1. 2013. 

Must be one of:

  • "savage"
  • "haploclique"
  • "predicthaplo"

Example:

haploclique
+

Type: integer Default: 1

This option should be used to specify the default number of threads for all multi-threaded rules. That is, unless the number of threads is specified for each rule, this value is set as default.

Value must be greater or equal to 1


Example:

4
+

Type: enum (of string) Default: "md5"

Sets the algorithm to be used when computing checksums for uploadable data.

Must be one of:

  • "md5"
  • "sha1"
  • "sha256"
  • "sha224"
  • "sha384"
  • "sha512"
  • "xxh64"
  • "xxh32"
  • "xxh128"

Example:

sha256
+

Type: string Default: ""

Some steps of V-pipe produce temporary files such as, e.g., decompressed intermediates — i.e., files which aren’t kept long-term but are deleted after all steps that needed them have finished. By default, these files are written in the output data directory. This option makes it possible to write them in a different directory instead. Use this option to, e.g., leverage a faster cluster-local storage or avoid wasting backup space on a snapshotted storage. You might want to consult the documentation provided by your HPC.


Examples:

temp
+
/cluster/scratch
+

Type: enum (of integer) Default: 1

Specify whether TSV files like coverage and base counts should be 1-based (i.e.: the first base pair position is called 1) like standard practice used in biology and most text formats such as VCF and GFF, or should be 0-based (i.e.: the first base pair position is called 0) like in several Python tools such as pysam and the BED format.

By default, V-pipe uses 1-based TSV files (the position column starts with 1), but this option changes the behaviour.

Must be one of:

  • 0
  • 1

Example:

0
 

Type: string Default: "/"

Character to use when assembling the two levels (e.g.: sample and a date), into a column title to be used in a report TSV file

E.g., with this sample file

patient1    20100113
 patient1    20110202
 patient2    20081130
-

the coverage TSV file’s column will be called patient1/20100113, patient1/20110202 and patient2/20081130.


Example:

-
-

Type: object Default: {}

Properties in this section of the configuration control the input of the pipeline.

Type: string Default: "samples/"

The input file for the workflow will be searched in this directory.

V-pipe expects the input samples to be organized in a two-level directory hierarchy.

  • The first level can be, e.g., patient samples or biological replicates of an experiment.
  • The second level can be, e.g., different sampling dates or different sequencing runs of the same sample.
  • Inside that directory, the sub-directory raw_data holds the sequencing data in FASTQ format (optionally compressed with GZip).

For example:

📁samples
+

the coverage TSV file’s column will be called patient1/20100113, patient1/20110202 and patient2/20081130.


Example:

-
+

Type: object Default: {}

Properties in this section of the configuration control the input of the pipeline.

Type: string Default: "samples/"

The input file for the workflow will be searched in this directory.

V-pipe expects the input samples to be organized in a two-level directory hierarchy.

  • The first level can be, e.g., patient samples or biological replicates of an experiment.
  • The second level can be, e.g., different sampling dates or different sequencing runs of the same sample.
  • Inside that directory, the sub-directory raw_data holds the sequencing data in FASTQ format (optionally compressed with GZip).

For example:

📁samples
 ├──📁patient1
 │  ├──📁20100113
 │  │  └──📁raw_data
@@ -31,55 +31,55 @@
       └──📁raw_data
          ├──🧬patient2_20081130_R1.fastq.gz
          └──🧬patient2_20081130_R2.fastq.gz
-

Examples:

tests/data/hiv/
-
tests/data/sars-cov-2/
-

Type: boolean Default: true

Indicate whether the input sequencing reads correspond to paired-end reads.

Paired-ended reads need to be in split files with _R1 and _R2 suffixes:

📁raw_data
+

Examples:

tests/data/hiv/
+
tests/data/sars-cov-2/
+

Type: boolean Default: true

Indicate whether the input sequencing reads correspond to paired-end reads.

Paired-ended reads need to be in split files with _R1 and _R2 suffixes:

📁raw_data
 ├──🧬patient2_20081130_R1.fastq.gz
 └──🧬patient2_20081130_R2.fastq.gz
-

Example:

False
-

Type: string Default: ""

V-pipe expects paired-end reads to be in files that end in _R1 and _R2 exactly right before the file extension, e.g., _R1.fastq.gz, because this is how the workflow finds and recognizes them.

But Illumina’s bcl2fastq demultiplexer might introduce additional strings, e.g., _R2_001.fast.gz or, depending on its mismatches settings, e.g., _R2_001_MM_1.fast.gz. Use this options to specify anything which should go between the _R1 and _R2 endings and the file extension.


Examples:

_001
-
_001_MM_1
+

Example:

False
+

Type: string Default: ""

V-pipe expects paired-end reads to be in files that end in _R1 and _R2 exactly right before the file extension, e.g., _R1.fastq.gz, because this is how the workflow finds and recognizes them.

But Illumina’s bcl2fastq demultiplexer might introduce additional strings, e.g., _R2_001.fastq.gz or, depending on its mismatches settings, e.g., _R2_001_MM_1.fastq.gz. Use this option to specify anything which should go between the _R1 and _R2 endings and the file extension.


Examples:

_001
+
_001_MM_1
 

Type: string Default: "config/samples.tsv"

File containing sample unique identifiers and dates as tab-separated values, e.g.,

patient1    20100113
 patient1    20110202
 patient2    20081130
-

Here, we have two samples from patient 1 and one sample from patient 2. By default, V-pipe searches for a file named samples.tsv, if this file does not exist, a list of samples is built by globbing datadir directory contents.

Optionally, the samples file can contain a third column specifying the read length. This is particularly useful when samples are sequenced using protocols with different read lengths.

Optionally, a fourth column can contain a short name of a protocol (e.g.: v3) that is detailed in the file specified in input => protocols_file. This is useful if protocol details such as primers change over time, e.g. to adapt to new variants with SNV breaking primer binding affinity.

Standardized Snakemake workflows place their tables inside the config/ subdirectory, but using this options you can specify alternate locations, e.g., the current working directory (as done in legacy V-pipe v1.x/2.x).


Example:

samples.tsv
-

Type: string Default: ""

When different samples have been processed with different library protocols, this file specifies a lookup table with per-protocol specific (primers bed and fasta), eg.:

v41:
-  name: SARS-CoV-2 ARTIC V4.1
-  inserts_bedfile: references/primers/v41/SARS-CoV-2.insert.bed
-  primers_bedfile: references/primers/v41/SARS-CoV-2.primer.bed
-  primers_file: references/primers/v41/SARS-CoV-2.tsv
-  primers_fasta: references/primers/v41/SARS-CoV-2.primer.fasta
-v4:
-  name: SARS-CoV-2 ARTIC V4
-  inserts_bedfile: references/primers/v4/SARS-CoV-2.insert.bed
-  primers_bedfile: references/primers/v4/SARS-CoV-2.primer.bed
-  primers_file: references/primers/v4/SARS-CoV-2.tsv
-  primers_fasta: references/primers/v4/ARTIC_v4.fasta
-v3:
-  name: SARS-CoV-2 ARTIC V3
-  inserts_bedfile: references/primers/v3/nCoV-2019.insert.bed
-  primers_bedfile: references/primers/v3/nCoV-2019.primer.bed
-  primers_file: references/primers/v3/nCoV-2019.tsv
-  primers_fasta: references/primers/v3/ARTIC_v3.fasta
-

The short name can then be referenced in the samples TSV table file:

sample_a    20211108    250 v3
+

Here, we have two samples from patient 1 and one sample from patient 2. By default, V-pipe searches for a file named samples.tsv, if this file does not exist, a list of samples is built by globbing datadir directory contents.

Optionally, the samples file can contain a third column specifying the read length. This is particularly useful when samples are sequenced using protocols with different read lengths.

Optionally, a fourth column can contain a short name of a protocol (e.g.: v3) that is detailed in the file specified in input => protocols_file. This is useful if protocol details such as primers change over time, e.g. to adapt to new variants with SNV breaking primer binding affinity.

Standardized Snakemake workflows place their tables inside the config/ subdirectory, but using this option you can specify alternate locations, e.g., the current working directory (as done in legacy V-pipe v1.x/2.x).


Example:

samples.tsv
+

Type: string Default: ""

When different samples have been processed with different library protocols, this file specifies a lookup table with per-protocol specifics (primers bed and fasta), e.g.:

v41:
+  name: SARS-CoV-2 ARTIC V4.1
+  inserts_bedfile: references/primers/v41/SARS-CoV-2.insert.bed
+  primers_bedfile: references/primers/v41/SARS-CoV-2.primer.bed
+  primers_file: references/primers/v41/SARS-CoV-2.tsv
+  primers_fasta: references/primers/v41/SARS-CoV-2.primer.fasta
+v4:
+  name: SARS-CoV-2 ARTIC V4
+  inserts_bedfile: references/primers/v4/SARS-CoV-2.insert.bed
+  primers_bedfile: references/primers/v4/SARS-CoV-2.primer.bed
+  primers_file: references/primers/v4/SARS-CoV-2.tsv
+  primers_fasta: references/primers/v4/ARTIC_v4.fasta
+v3:
+  name: SARS-CoV-2 ARTIC V3
+  inserts_bedfile: references/primers/v3/nCoV-2019.insert.bed
+  primers_bedfile: references/primers/v3/nCoV-2019.primer.bed
+  primers_file: references/primers/v3/nCoV-2019.tsv
+  primers_fasta: references/primers/v3/ARTIC_v3.fasta
+

The short name can then be referenced in the samples TSV table file:

sample_a    20211108    250 v3
 sample_b    20220214    250 v4
-

Note: The virus-specific base configuration specified in general => virus_base_config will most likely change this option’s default.
You are still free to override that default in your configuration shall the need arise.


Example:

resources/sars-cov-2/primers.yaml
-

Type: integer Default: 250

Default for those samples whose read length isn’t specified explicitly in the optional third column of the samples.tsv table.


Example:

100
-

Type: string Default: ""

A bed file with primers position to trim the alignment output

Note: The virus-specific base configuration specified in general => virus_base_config will most likely change this option’s default.
You are still free to override that default in your configuration shall the need arise.

Note: individual sample can override this using the 4th column in the samples TSV table file and the protocols YAML look-up table.


Example:

resources/sars-cov-2/primers/v3/nCoV-2019.primer.bed
-

Type: string Default: ""

A bed file with inserts position of the multiplex PCR output to use with amplicon-based analysis.

Note: The virus-specific base configuration specified in general => virus_base_config will most likely change this option’s default.
You are still free to override that default in your configuration shall the need arise.

Note: individual sample can override this using the 4th column in the samples TSV table file and the protocols YAML look-up table.


Example:

resources/sars-cov-2/primers/v3/nCoV-2019.primer.bed
-

Type: number Default: 0.8

Using this parameter, the user can specify the read-length threshold that should be applied during the quality trimming as a percentage (0 < trim_percent_cutoff < 1).

Value must be greater or equal to 0 and lesser or equal to 1


Example:

0.9
-

Type: string Default: "results/cohort_consensus.fasta"

Reference sequence to use for the alignment step

Note: The virus-specific base configuration specified in general => virus_base_config will most likely change this option’s default to a reference for that virus.
You are still free to override that default in your configuration shall the need arise.


Examples:

resources/hiv/HXB2.fasta
-
resources/sars-cov-2/NC_045512.2.fasta
-

Type: string Default: ""

A directory containing gff files that can be optionally used to annotate the reference genome in the visualization, e.g., with genes, mature products, protein domains, regions of interests, etc.

Note: The virus-specific base configuration specified in general => virus_base_config will most likely change this option’s default.
You are still free to override that default in your configuration shall the need arise.


Examples:

resources/hiv/gffs/
-
resources/sars-cov-2/gffs/
-

Type: string Default: ""

An associative array providing user-friendly name to display for each annotation .gff file in the gff_directory

Note: The virus-specific base configuration specified in general => virus_base_config will most likely change this option’s default.
You are still free to override that default in your configuration shall the need arise.


Examples:

resources/hiv/metainfo.yaml
-
resources/sars-cov-2/metainfo.yaml
-

Type: string Default: ""

The specific annotation GFF file that has provides the genes position along the genome for reports that mention specific genes such frameshift-deletions-check.

Note: The virus-specific base configuration specified in general => virus_base_config will most likely change this option’s default.
You are still free to override that default in your configuration shall the need arise.

Note: if not set, V-pipe will try auto-selecting a .gff file from the gff_directory.


Examples:

resources/hiv/gffs/GCF_000864765.1_ViralProj15476_genomic.gff
-
resources/sars-cov-2/gffs/Genes_NC_045512.2.GFF3
-

Type: string Default: ""

A table with primers to display on the visualization

Note: The virus-specific base configuration specified in general => virus_base_config will most likely change this option’s default.
You are still free to override that default in your configuration shall the need arise.

Note: individual sample can override this using the 4th column in the samples TSV table file and the protocols YAML look-up table.


Example:

resources/sars-cov-2/primers/v3/nCoV-2019.tsv
-

Type: string Default: ""

Directory holding a list of COJAC YAML definitions of variants of concern that will be used for search of variant signatures

Note: The virus-specific base configuration specified in general => virus_base_config will most likely change this option’s default.
You are still free to override that default in your configuration shall the need arise.


Example:

resources/sars-cov-2/voc/
-

Type: string Default: ""

A FASTQ file with sequences of interest

Note: These sequences are used, together with the consensus sequence, to build a phylogenetic tree.


Example:

resources/sars-cov-2/phylogeny/selected_covid_sequences.fasta
-

Type: object Default: {}

Properties in this section of the configuration control the output of the pipeline.

Type: string Default: "results"

The workflow will write its output files into this directory. This will follow the same structure as for the input.

For each sample, V-pipe produces several output files that are located in the corresponding sample-specific directory. First, the alignment file and consensus sequences are located in the alignments and references subdirectories, respectively. Second, output files containing SNVs and viral haplotypes are located in the variants subdirectories.

Using the sample example as in the input section, the output files for the two patient samples will be located in the following subdirectories:

📁results
+

Note: The virus-specific base configuration specified in general => virus_base_config will most likely change this option’s default.
You are still free to override that default in your configuration shall the need arise.


Example:

resources/sars-cov-2/primers.yaml
+

Type: integer Default: 250

Default for those samples whose read length isn’t specified explicitly in the optional third column of the samples.tsv table.


Example:

100
+

Type: string Default: ""

A bed file with primers position to trim the alignment output

Note: The virus-specific base configuration specified in general => virus_base_config will most likely change this option’s default.
You are still free to override that default in your configuration shall the need arise.

Note: individual sample can override this using the 4th column in the samples TSV table file and the protocols YAML look-up table.


Example:

resources/sars-cov-2/primers/v3/nCoV-2019.primer.bed
+

Type: string Default: ""

A bed file with inserts position of the multiplex PCR output to use with amplicon-based analysis.

Note: The virus-specific base configuration specified in general => virus_base_config will most likely change this option’s default.
You are still free to override that default in your configuration shall the need arise.

Note: individual sample can override this using the 4th column in the samples TSV table file and the protocols YAML look-up table.


Example:

resources/sars-cov-2/primers/v3/nCoV-2019.insert.bed
+

Type: number Default: 0.8

Using this parameter, the user can specify the read-length threshold that should be applied during the quality trimming as a percentage (0 < trim_percent_cutoff < 1).

Value must be greater or equal to 0 and lesser or equal to 1


Example:

0.9
+

Type: string Default: "results/cohort_consensus.fasta"

Reference sequence to use for the alignment step

Note: The virus-specific base configuration specified in general => virus_base_config will most likely change this option’s default to a reference for that virus.
You are still free to override that default in your configuration shall the need arise.


Examples:

resources/hiv/HXB2.fasta
+
resources/sars-cov-2/NC_045512.2.fasta
+

Type: string Default: ""

A directory containing gff files that can be optionally used to annotate the reference genome in the visualization, e.g., with genes, mature products, protein domains, regions of interests, etc.

Note: The virus-specific base configuration specified in general => virus_base_config will most likely change this option’s default.
You are still free to override that default in your configuration shall the need arise.


Examples:

resources/hiv/gffs/
+
resources/sars-cov-2/gffs/
+

Type: string Default: ""

An associative array providing user-friendly name to display for each annotation .gff file in the gff_directory

Note: The virus-specific base configuration specified in general => virus_base_config will most likely change this option’s default.
You are still free to override that default in your configuration shall the need arise.


Examples:

resources/hiv/metainfo.yaml
+
resources/sars-cov-2/metainfo.yaml
+

Type: string Default: ""

The specific annotation GFF file that provides the gene positions along the genome for reports that mention specific genes, such as frameshift-deletions-check.

Note: The virus-specific base configuration specified in general => virus_base_config will most likely change this option’s default.
You are still free to override that default in your configuration shall the need arise.

Note: if not set, V-pipe will try auto-selecting a .gff file from the gff_directory.


Examples:

resources/hiv/gffs/GCF_000864765.1_ViralProj15476_genomic.gff
+
resources/sars-cov-2/gffs/Genes_NC_045512.2.GFF3
+

Type: string Default: ""

A table with primers to display on the visualization

Note: The virus-specific base configuration specified in general => virus_base_config will most likely change this option’s default.
You are still free to override that default in your configuration shall the need arise.

Note: individual sample can override this using the 4th column in the samples TSV table file and the protocols YAML look-up table.


Example:

resources/sars-cov-2/primers/v3/nCoV-2019.tsv
+

Type: string Default: ""

Directory holding a list of COJAC YAML definitions of variants of concern that will be used for search of variant signatures

Note: The virus-specific base configuration specified in general => virus_base_config will most likely change this option’s default.
You are still free to override that default in your configuration shall the need arise.


Example:

resources/sars-cov-2/voc/
+

Type: string Default: ""

A FASTA file with sequences of interest

Note: These sequences are used, together with the consensus sequence, to build a phylogenetic tree.


Example:

resources/sars-cov-2/phylogeny/selected_covid_sequences.fasta
+

Type: object Default: {}

Properties in this section of the configuration control the output of the pipeline.

Type: string Default: "results"

The workflow will write its output files into this directory. This will follow the same structure as for the input.

For each sample, V-pipe produces several output files that are located in the corresponding sample-specific directory. First, the alignment file and consensus sequences are located in the alignments and references subdirectories, respectively. Second, output files containing SNVs and viral haplotypes are located in the variants subdirectories.

Using the same example as in the input section, the output files for the two patient samples will be located in the following subdirectories:

📁results
 ├──📁patient1
 │  ├──📁20100113
 │  │  ├──📁alignments
@@ -117,72 +117,74 @@
      |  └──snvs.vcf
      └──📁global
         └──contigs_stage_c.fasta
-
  • Standardized Snakemake workflows place their output in a results subdirectory
  • If you prefer the output written, e.g., to the same samples/ subdirectory as the input (as used to be done in legacy V-pipe v1.x/2.x), you can use this option to specify alternate target locations.

Example:

samples
-

Type: string Default: ""

In addition, V-pipe can optionally generate a few cohort-wide results, such as a current cohort consensus fasta file, or a TSV file containing the frequencies of all minor alleles that differ from the consensus among analyzed samples.
By default, these output files are located at the base of the output datadir, outside of the two-level per sample structure:

results
+
  • Standardized Snakemake workflows place their output in a results subdirectory
  • If you prefer the output written, e.g., to the same samples/ subdirectory as the input (as used to be done in legacy V-pipe v1.x/2.x), you can use this option to specify alternate target locations.

Example:

samples
+

Type: string Default: ""

In addition, V-pipe can optionally generate a few cohort-wide results, such as a current cohort consensus fasta file, or a TSV file containing the frequencies of all minor alleles that differ from the consensus among analyzed samples.
By default, these output files are located at the base of the output datadir, outside of the two-level per sample structure:

results
 ├──minority_variants.tsv
 ├──cohort_consensus.fasta
 ├──patient1
 │  ├──20100113
 │  │  ├──alignments
 
-

If you prefer instead, e.g., such cohort-wide results being written in a subdirectory of the working directory at the same level as the datadirs, you can use this option to specify an alternate subdirectory relative to the datadir property. (Use a .. prefix if you instead want your cohort-wide results to be in a directory at the same level as samples/ and results/. See the example below to recreate the variants/ directory used by legacy V-pipe v1.x/2.x).


Example:

../variants
-

Type: boolean Default: false

V-pipe can produce several outputs to assess the quality of the output of its steps, e.g., checking whether a sample’s consensus sequence generated by bcftools results in frameshifting indels and writing a report in the sample’s …/references/frameshift_deletions_check.tsv. Such reports can be useful when submitting sequences to GISAID.

This option turns on such QA features.


Example:

True
-

Type: boolean Default: false

This option indicates that the samples come from PCR amplification and the primers should be trimmed from amplicons in the alignment file. The trimmed reads are written to each sample’s …/variants/SNVs/REF_aln_trim.bam.

Using this option requires either specifying a primers bed file in input => protocols_file, or using a 4 column input samples TSV file and specify a protocol look-up YAML file in input => protocols_file.


Example:

True
-

Type: boolean Default: false

This option selects whether the SNV caller step should be executed and its output written to each sample’s …/variants/SNVs/snvs.csv.


Example:

True
-

Type: boolean Default: false

This option activates local haplotype reconstruction (only available when using ShoRAH).


Example:

True
-

Type: boolean Default: false

This option turns on global haplotype reconstruction.


Example:

True
-

Type: boolean Default: false

This option selects whether to generate HTML visualization of the SNVs in each sample’s …/visualization/index.html.


Example:

True
-

Type: boolean Default: false

This option turns on the computation of diversity measures in each sample.


Example:

True
-

Type: boolean Default: false

This option turns on dehumanization of the raw reads (i.e. removal of host’s reads) and generates the file dehuman.cram. This is useful to prepare raw reads for upload on public databases such as, e.g. ENA (European Nucleotide Archive).

This only applies to the upload and does not affect the main workflow.


Example:

True
-

Type: boolean Default: false

This option can be used for assistance in incremental upload of data. See section upload for an example.


Example:

True
-

Type: object Default: {}

The path to the different software packages can be specified using this section.

It is especially useful when dependencies are not obtained via conda such as VICUNA, and when the software packages are not in the PATH.

Note we strongly recommend to use conda environments, by adding the --use-conda flag to the V-pipe execution command, e.g. ./vpipe --use-conda. If you prefer to use your own installations, this section allows you to specify the location of the executable files.


Example:

bwa: /path/to/bwa
-haploclique: /path/to/haploclique
-

Type: string Default: "gunzip"

Type: string Default: "zstd"

Type: string Default: "xsv"

Type: string Default: "prinseq-lite.pl"

Type: string Default: "fastqc"

Type: string Default: "vicuna"

Due to a special license, VICUNA is not available from bioconda and must be installed from its original website.
Use this option to specify where you have installed its executable.

Type: string Default: "InDelFixer"

Type: string Default: "ConsensusFixer"

Type: string Default: "picard"

Type: string Default: "bwa"

Type: string Default: "bowtie2-build"

Type: string Default: "bowtie2"

Type: string Default: "minimap2"

Type: string Default: "samtools"

Type: string Default: "extract_consensus"

Type: string Default: "matcher"

Type: string Default: "frameshift_deletions_checks"

Type: string Default: "mafft"

Type: string Default: "ngshmmalign"

Type: string Default: "convert_reference"

Type: string Default: "extract_seq"

Type: string Default: "coverage_stats"

Type: string Default: "remove_gaps_msa"

Type: string Default: "ivar"

Type: string Default: "aln2basecnt"

Type: string Default: "gather_coverage"

Type: string Default: "minority_freq"

Type: string Default: "extract_coverage_intervals"

Type: string Default: "shorah shotgun"

Type: string Default: "lofreq"

Type: string Default: "bcftools"

Type: string Default: "haploclique"

Type: string Default: "compute_mds"

Type: string Default: "savage"

Type: string Default: "predicthaplo"

Type: string Default: "cojac"

Type: string Default: "lollipop"

Type: object Default: {}

Type: integer Default: 32

Type: integer Default: 60

Type: object Default: {}

Type: integer Default: 4096

Type: integer Default: 20

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/preprocessing.yaml"

Type: string Default: "-ns_max_n 4 -min_qual_mean 30 -trim_qual_left 30 -trim_qual_right 30 -trim_qual_window 10"

We use software PRINSEQ 1 for quality control. By default, we use options -ns_max_n 4 -min_qual_mean 30 -trim_qual_left 30 -trim_qual_right 30 -trim_qual_window 10, which indicates to trim reads using a sliding window with size 10 bp, and trim bases if their quality scores are less than 30. Additionally, reads are filtered out if the average quality score is below 30 and if they contain more than 4 N’s. The user can choose to overwrite the default settings or use additional parameters by using the property extra. E.g., if many reads are filtered out in this step, the user can choose to lower the quality threshold as indicated in the example.
Please do not modify PRINSEQ options -out_format, -out_good, nor -min_len. Instead of using -min_len to define threshold on the read length after trimming, use input => trim_percent_cutoff.


  1. Schmieder, R. and Edwards, R. Quality control and preprocessing of metagenomic datasets. Bioinformatics. 2011. 


Example:

-ns_max_n 4 -min_qual_mean 20 -trim_qual_left 20 -trim_qual_right 20 -trim_qual_window 10
-

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/fastqc.yaml"

Type: integer Default: 6

Type: boolean Default: false

Type: object Default: {}

NOTE The conda environment for this rule doesn’t work properly. The package on the bioconda channel, mvicuna, is slightly different from VICUNA and it has different command-line arguments. Moreover, VICUNA and mvicuna are no longer maintained. In the future, this rule will be deprecated.

Type: integer Default: 1000

Type: integer Default: 600

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/initial_vicuna.yaml"

Type: object Default: {}

NOTE Obtaining an initial reference de novo is implemented for more than one sample.

Type: integer Default: 10000

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/initial_vicuna_msa.yaml"

Type: object Default: {}

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 1435

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/hmm_align.yaml"

Type: boolean Default: false

This option is useful for debugging purposes.


Example:

True
-

Type: string Default: ""

Pass additional options to run ngshmmalign

V-pipe uses option -R <path/to/initial_reference>, thus option -r arg is not allowed. Also, instead of passing -l via the property extra, set leave_msa_temp to True. Lastly, please do not modify options -o arg, -w arg, -t arg, and -N arg. These are already managed by V-pipe.

Type: object Default: {}

Type: integer Default: 5000

Type: integer Default: 30

Type: string Default: "{VPIPE_BASEDIR}/envs/sam2bam.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/bwa_QA.yaml"

Type: string Default: ""

Panel of diverse references against which to align reads as a QA step

Note: The virus-specific base configuration specified in general => virus_base_config will most likely change this option’s default.
You are still free to override that default in your configuration shall the need arise.


Example:

resources/hiv/5-Virus-Mix.fasta
-

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: string Default: "HXB2:6614-6812,7109-7217,7376-7478,7601-7634"

Type: object Default: {}

This rule takes all previously aligned reads by hmm_align. Therefore, resources should be allocated accordingly.

Type: integer Default: 10000

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/msa.yaml"

Type: object Default: {}

Type: integer Default: 8000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/bwa_align.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/bwa_align.yaml"

Type: string Default: ""

With property extra, users can pass additional options to run BWA MEM. For more details on BWA MEM configurable options refer to the software documentation.

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/bowtie_align.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/bowtie_align.yaml"

Type: enum (of string) Default: "--phred33"

Indicate if qualities are Phred+33 (default) or Phred+64 (--phred64).

Must be one of:

  • "--phred33"
  • "--phred64"

Example:

--phred64
-

Type: string Default: "--local --sensitive-local"

Specify Bowtie 2 presets.

Type: integer

Type: string Default: ""

Pass additional options to run Bowtie 2. V-pipe handles the input and output files, as well as the reference sequence. Thus, do not modify these options.
For more details on Bowtie 2 configurable options refer to the software documentation.

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/minimap_align.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: integer

Type: enum (of string) Default: "sr"

Specify minimap2 preset options. See minimap2’s documentation for details about each of the presets.

Must be one of:

  • "map-ont"
  • "map-hifi"
  • "map-pb"
  • "asm5"
  • "asm10"
  • "asm20"
  • "splice"
  • "splice:hq"
  • "sr"
  • "ava-pb"
  • "ava-ont"

Example:

map-ont
-

Type: boolean Default: false

By default V-pipe ignores Minimap2’s secondary alignment(s) and only considers the primary one (A secondary alignment occurs when a given read could align reasonably well to more than one place). This flag turns back on Minimap2’s secondary alignments, and also includes their sequences in the output BAM file.


Example:

True
-

Type: string Default: "{VPIPE_BASEDIR}/envs/minimap_align.yaml"

Type: string Default: ""

With property extra, users can pass additional options to run minimap2. For more details on minimap2 configurable options refer to the software documentation.

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/primerstrim.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: integer Default: 50

Minimum read depth for reporting variants per locus.

Type: integer Default: 5

Read count below which ambiguous base ’n’ is reported.

Type: integer Default: 15

Minimum phred quality score for a base to be included.

Type: number Default: 0.05

Minimum frequency for an ambiguous nucleotide.

Value must be greater or equal to 0 and lesser or equal to 1

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/bcftools.yaml"

Type: integer Default: 10000

Type: integer Default: 10

Type: number Default: 0.05

Value must be greater or equal to 0 and lesser or equal to 1

Type: object Default: {}

Type: integer Default: 4096

Type: integer Default: 30

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/consseq_qa.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 30

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 30

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 30

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: object Default: {}

Type: integer Default: 1000

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: integer Default: 100

Minimum read depth for reporting variants per locus.


Example:

50
-

Type: boolean Default: false

Output a numpy array file containing frequencies of all bases, including gaps and also the most abundant base across samples.


Example:

True
-

Type: object Default: {}

Type: integer Default: 1000

Type: integer Default: 30

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: boolean Default: false

Construct intervals based on overlapping windows of the read alignment. By default, regions with high coverage are built based on the position-wise read depth.


Example:

True
-

Type: integer Default: 50

Minimum read depth. A region spanning the reference genome is returned if coverage is set to 0.


Example:

0
-

Type: boolean Default: true

Indicate whether to apply a more liberal shifting on intervals’ right-endpoint.


Example:

False
-

Type: object Default: {}

Type: integer Default: 10000

Type: integer Default: 2880

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/snv.yaml"

Type: boolean Default: true

Indicate whether to use the cohort-consensus sequence from the analyzed samples (output from minor_variants rule located in the cohort-wide output results/cohort_consensus.fasta) or the reference sequence by setting this option to False.


Example:

False
-

Type: number Default: 0.1

Hyperparameter used for instantiating a new cluster.

Type: boolean Default: false

Ignore SNVs adjacent to indels.

Type: number Default: 0.9

Value must be greater or equal to 0 and lesser or equal to 1

Type: integer Default: 0

Omit windows with coverage less than this value.


Example:

50
-

Type: integer Default: 3

ShoRAH performs local haplotype reconstruction on windows of the read alignment. The overlap between these windows is defined by the window shifts. By default, it is set to 3, i.e., apart from flanking regions each position is covered by 3 windows.

Type: boolean Default: false

Indicate whether to move files produced in previous/interrupted runs to subdirectory named old


Example:

True
-

Type: string Default: ""

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 20

Type: string Default: "{VPIPE_BASEDIR}/envs/lofreq.yaml"

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 60

Type: string Default: "{VPIPE_BASEDIR}/envs/lofreq.yaml"

Type: boolean Default: true

Indicate whether to use the cohort-consensus sequence from the analyzed samples (output from minor_variants rule located in the cohort-wide output results/cohort_consensus.fasta) or the reference sequence by setting this option to False.


Example:

False
-

Type: string Default: ""

Pass additional options to run lofreq call

Type: object Default: {}

Type: integer Default: 1000

Type: integer Default: 60

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: integer Default: 5

Type: object Default: {}

Type: string Default: "{VPIPE_BASEDIR}/envs/sam2bam.yaml"

Type: object Default: {}

NOTE This rule only works in Linux.

Type: integer Default: 10000

Type: integer Default: 1435

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/savage.yaml"

Type: integer Default: 20

Size of the batches of reads to be processed by SAVAGE. It is recommended that 500 < coverage/split < 1000.

Type: object Default: {}

Type: integer Default: 10000

Type: integer Default: 1435

Type: string Default: "{VPIPE_BASEDIR}/envs/haploclique.yaml"

Type: boolean Default: true

If set to True (default) a predefined set of parameter values is used for drawing edges between reads in the read graph.

Type: boolean Default: true

Singletons are defined as proposed haplotypes which are supported by a single read. If this property is set to True, singletons are discarded.

Type: boolean Default: true

If set to True (default) probability of the overhangs is ignored.

Type: integer Default: 3

Sets a threshold to limit the size of cliques.

Type: integer Default: 10000

Indicates the maximum number of cliques to be considered in the next iteration.

Type: string Default: ""

Additional parameters to be passed to haploclique.

Warning: this won’t overwrite the other options (e.g. clique_size_limit and max_num_cliques should still be set via their own respective properties; do not pass parameters --limit_clique_size= nor --max_cliques= via this extra property).

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: integer Default: 0

Use to specify a region of interest.

Type: integer Default: -1

Use to specify a region of interest.


Examples:

9719
-
29836
-

Type: string Default: ""

When the ground truth is available (e.g., simulation studies), a multiple sequence alignment of types making up the population can be provided, and additional checks are performed.

Type: object Default: {}

Type: integer Default: 10000

Type: integer Default: 1435

Type: integer

Type: integer Default: 0

Type: string Default: "{VPIPE_BASEDIR}/envs/predicthaplo.yaml"

Type: object Default: {}

Type: integer Default: 256

Type: integer Default: 10

Type: integer Default: 2

Minimal number of co-occurrences to search for in amplicons. Lowering this property to 1 will make COJAC also look for amplicons with singleton mutations.


Example:

1
-

Type: object Default: {}

Type: integer Default: 8192

Type: integer Default: 45

Type: integer Default: 1

Type: string Default: "{VPIPE_BASEDIR}/envs/cojac.yaml"

Type: enum (of string) Default: "lines"

Format of the output CSV.

  • lines(default) - each amplicon a separate entry on a separate line.
  • columns - one column per amplicon

Must be one of:

  • "lines"
  • "columns"

Example:

columns
-

Type: object Default: {}

Type: integer Default: 256

Type: integer Default: 10

Type: object Default: {}

Type: integer Default: 4096

Type: integer Default: 15

Type: object Default: {}

This section is used to set up a timeline of the samples. Some outputs, e.g., deconvolution of quasispecies mixtures using LolliPop, need to have a time component. By default it calls a script that uses regular expressions and look-up tables to extract this information from the samples’ own names. But by using the properties script and options and adapting the environment provided in property conda, it is possible to heavily customize the actions (e.g. it is possible to query an external database instead). For inspiration, see the default script file_parser.py.

Type: integer Default: 1024

Type: integer Default: 15

Type: string Default: "{VPIPE_BASEDIR}/envs/timeline.yaml"

The default environment only provides regular expression functions (python-regex) but depending on your needs you would want to provide a custom environment with additional tools (e.g. drivers to query a database, etc.)

Type: integer Default: 1

Type: boolean Default: true

Don’t dispatch the timeline rule to the cluster for execution, run locally.


Example:

False
+

If you prefer instead, e.g., such cohort-wide results being written in a subdirectory of the working directory at the same level as the datadirs, you can use this option to specify an alternate subdirectory relative to the datadir property. (Use a .. prefix if you instead want your cohort-wide results to be in a directory at the same level as samples/ and results/. See the example below to recreate the variants/ directory used by legacy V-pipe v1.x/2.x).


Example:

../variants
+

Type: boolean Default: false

V-pipe can produce several outputs to assess the quality of the output of its steps, e.g., checking whether a sample’s consensus sequence generated by bcftools results in frameshifting indels and writing a report in the sample’s …/references/frameshift_deletions_check.tsv. Such reports can be useful when submitting sequences to GISAID.

This option turns on such QA features.


Example:

True
+

Type: boolean Default: false

This option indicates that the samples come from PCR amplification and the primers should be trimmed from amplicons in the alignment file. The trimmed reads are written to each sample’s …/variants/SNVs/REF_aln_trim.bam.

Using this option requires either specifying a primers bed file in input => protocols_file, or using a 4 column input samples TSV file and specify a protocol look-up YAML file in input => protocols_file.


Example:

True
+

Type: boolean Default: false

This option selects whether the SNV caller step should be executed and its output written to each sample’s …/variants/SNVs/snvs.csv.


Example:

True
+

Type: boolean Default: false

This option activates local haplotype reconstruction (only available when using ShoRAH or VILOCA).


Example:

True
+

Type: boolean Default: false

This option turns on global haplotype reconstruction.


Example:

True
+

Type: boolean Default: false

This option selects whether to generate HTML visualization of the SNVs in each sample’s …/visualization/index.html.


Example:

True
+

Type: boolean Default: false

This option turns on the computation of diversity measures in each sample.


Example:

True
+

Type: boolean Default: false

This option turns on dehumanization of the raw reads (i.e. removal of host’s reads) and generates the file dehuman.cram. This is useful to prepare raw reads for upload on public databases such as, e.g. ENA (European Nucleotide Archive).

This only applies to the upload and does not affect the main workflow.


Example:

True
+

Type: boolean Default: false

This option can be used for assistance in incremental upload of data. See section upload for an example.


Example:

True
+

Type: object Default: {}

The path to the different software packages can be specified using this section.

It is especially useful when dependencies are not obtained via conda such as VICUNA, and when the software packages are not in the PATH.

Note we strongly recommend to use conda environments, by adding the --use-conda flag to the V-pipe execution command, e.g. ./vpipe --use-conda. If you prefer to use your own installations, this section allows you to specify the location of the executable files.


Example:

bwa: /path/to/bwa
+haploclique: /path/to/haploclique
+

Type: string Default: "gunzip"

Type: string Default: "zstd"

Type: string Default: "xsv"

Type: string Default: "prinseq-lite.pl"

Type: string Default: "fastqc"

Type: string Default: "vicuna"

Due to a special license, VICUNA is not available from bioconda and must be installed from its original website.
Use this option to specify where you have installed its executable.

Type: string Default: "InDelFixer"

Type: string Default: "ConsensusFixer"

Type: string Default: "picard"

Type: string Default: "bwa"

Type: string Default: "bowtie2-build"

Type: string Default: "bowtie2"

Type: string Default: "minimap2"

Type: string Default: "samtools"

Type: string Default: "extract_consensus"

Type: string Default: "matcher"

Type: string Default: "frameshift_deletions_checks"

Type: string Default: "paired_end_read_merger"

Type: string Default: "mafft"

Type: string Default: "ngshmmalign"

Type: string Default: "convert_reference"

Type: string Default: "extract_seq"

Type: string Default: "coverage_stats"

Type: string Default: "remove_gaps_msa"

Type: string Default: "ivar"

Type: string Default: "aln2basecnt"

Type: string Default: "gather_coverage"

Type: string Default: "minority_freq"

Type: string Default: "extract_coverage_intervals"

Type: string Default: "shorah shotgun"

Type: string Default: "viloca run"

Type: string Default: "lofreq"

Type: string Default: "bcftools"

Type: string Default: "haploclique"

Type: string Default: "compute_mds"

Type: string Default: "savage"

Type: string Default: "predicthaplo"

Type: string Default: "cojac"

Type: string Default: "lollipop"

Type: object Default: {}

Type: integer Default: 32

Type: integer Default: 60

Type: object Default: {}

Type: integer Default: 4096

Type: integer Default: 20

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/preprocessing.yaml"

Type: string Default: "-ns_max_n 4 -min_qual_mean 30 -trim_qual_left 30 -trim_qual_right 30 -trim_qual_window 10"

We use software PRINSEQ 1 for quality control. By default, we use options -ns_max_n 4 -min_qual_mean 30 -trim_qual_left 30 -trim_qual_right 30 -trim_qual_window 10, which indicates to trim reads using a sliding window with size 10 bp, and trim bases if their quality scores are less than 30. Additionally, reads are filtered out if the average quality score is below 30 and if they contain more than 4 N’s. The user can choose to overwrite the default settings or use additional parameters by using the property extra. E.g., if many reads are filtered out in this step, the user can choose to lower the quality threshold as indicated in the example.
Please do not modify PRINSEQ options -out_format, -out_good, nor -min_len. Instead of using -min_len to define threshold on the read length after trimming, use input => trim_percent_cutoff.


  1. Schmieder, R. and Edwards, R. Quality control and preprocessing of metagenomic datasets. Bioinformatics. 2011. 


Example:

-ns_max_n 4 -min_qual_mean 20 -trim_qual_left 20 -trim_qual_right 20 -trim_qual_window 10
+

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/fastqc.yaml"

Type: integer Default: 6

Type: boolean Default: false

Type: object Default: {}

NOTE The conda environment for this rule doesn’t work properly. The package on the bioconda channel, mvicuna, is slightly different from VICUNA and it has different command-line arguments. Moreover, VICUNA and mvicuna are no longer maintained. In the future, this rule will be deprecated.

Type: integer Default: 1000

Type: integer Default: 600

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/initial_vicuna.yaml"

Type: object Default: {}

NOTE Obtaining an initial reference de novo is implemented for more than one sample.

Type: integer Default: 10000

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/initial_vicuna_msa.yaml"

Type: object Default: {}

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 1435

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/hmm_align.yaml"

Type: boolean Default: false

This option is useful for debugging purposes.


Example:

True
+

Type: string Default: ""

Pass additional options to run ngshmmalign

V-pipe uses option -R <path/to/initial_reference>, thus option -r arg is not allowed. Also, instead of passing -l via the property extra, set leave_msa_temp to True. Lastly, please do not modify options -o arg, -w arg, -t arg, and -N arg. These are already managed by V-pipe.

Type: object Default: {}

Type: integer Default: 5000

Type: integer Default: 30

Type: string Default: "{VPIPE_BASEDIR}/envs/sam2bam.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/bwa_QA.yaml"

Type: string Default: ""

Panel of diverse references against which to align reads as a QA step

Note: The virus-specific base configuration specified in general => virus_base_config will most likely change this option’s default.
You are still free to override that default in your configuration shall the need arise.


Example:

resources/hiv/5-Virus-Mix.fasta
+

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: string Default: "HXB2:6614-6812,7109-7217,7376-7478,7601-7634"

Type: object Default: {}

This rule takes all previously aligned reads by hmm_align. Therefore, resources should be allocated accordingly.

Type: integer Default: 10000

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/msa.yaml"

Type: object Default: {}

Type: integer Default: 8000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/bwa_align.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/bwa_align.yaml"

Type: string Default: ""

With property extra, users can pass additional options to run BWA MEM. For more details on BWA MEM configurable options refer to the software documentation.

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/bowtie_align.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/bowtie_align.yaml"

Type: enum (of string) Default: "--phred33"

Indicate if qualities are Phred+33 (default) or Phred+64 (--phred64).

Must be one of:

  • "--phred33"
  • "--phred64"

Example:

--phred64
+

Type: string Default: "--local --sensitive-local"

Specify Bowtie 2 presets.

Type: integer

Type: string Default: ""

Pass additional options to run Bowtie 2. V-pipe handles the input and output files, as well as the reference sequence. Thus, do not modify these options.
For more details on Bowtie 2 configurable options refer to the software documentation.

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/minimap_align.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: integer

Type: enum (of string) Default: "sr"

Specify minimap2 preset options. See minimap2’s documentation for details about each of the presets.

Must be one of:

  • "map-ont"
  • "map-hifi"
  • "map-pb"
  • "asm5"
  • "asm10"
  • "asm20"
  • "splice"
  • "splice:hq"
  • "sr"
  • "ava-pb"
  • "ava-ont"

Example:

map-ont
+

Type: boolean Default: false

By default V-pipe ignores Minimap2’s secondary alignment(s) and only considers the primary one (a secondary alignment occurs when a given read could align reasonably well to more than one place). This flag turns Minimap2’s secondary alignments back on, and also includes their sequences in the output BAM file.


Example:

True
+

Type: string Default: "{VPIPE_BASEDIR}/envs/minimap_align.yaml"

Type: string Default: ""

With property extra, users can pass additional options to run minimap2. For more details on minimap2 configurable options refer to the software documentation.

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/primerstrim.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: integer Default: 50

Minimum read depth for reporting variants per locus.

Type: integer Default: 5

Read count below which ambiguous base ’n’ is reported.

Type: integer Default: 15

Minimum phred quality score for a base to be included.

Type: number Default: 0.05

Minimum frequency for an ambiguous nucleotide.

Value must be greater or equal to 0 and lesser or equal to 1

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/bcftools.yaml"

Type: integer Default: 10000

Type: integer Default: 10

Type: number Default: 0.05

Value must be greater or equal to 0 and lesser or equal to 1

Type: object Default: {}

Type: integer Default: 4096

Type: integer Default: 30

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/consseq_qa.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 30

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 30

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/paired_end_read_merger.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 30

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 30

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: object Default: {}

Type: integer Default: 1000

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: integer Default: 100

Minimum read depth for reporting variants per locus.


Example:

50
+

Type: boolean Default: false

Output a numpy array file containing frequencies of all bases, including gaps and also the most abundant base across samples.


Example:

True
+

Type: object Default: {}

Type: integer Default: 1000

Type: integer Default: 30

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: boolean Default: false

Construct intervals based on overlapping windows of the read alignment. By default, regions with high coverage are built based on the position-wise read depth.


Example:

True
+

Type: integer Default: 50

Minimum read depth. A region spanning the reference genome is returned if coverage is set to 0.


Example:

0
+

Type: boolean Default: true

Indicate whether to apply a more liberal shifting on intervals’ right-endpoint.


Example:

False
+

Type: object Default: {}

Type: integer Default: 10000

Type: integer Default: 2880

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/snv.yaml"

Type: boolean Default: true

Indicate whether to use the cohort-consensus sequence from the analyzed samples (output from minor_variants rule located in the cohort-wide output results/cohort_consensus.fasta) or the reference sequence by setting this option to False.


Example:

False
+

Type: number Default: 0.1

Hyperparameter used for instantiating a new cluster.

Type: boolean Default: false

Ignore SNVs adjacent to indels.

Type: number Default: 0.9

Value must be greater or equal to 0 and lesser or equal to 1

Type: integer Default: 0

Omit windows with coverage less than this value.


Example:

50
+

Type: integer Default: 3

ShoRAH performs local haplotype reconstruction on windows of the read alignment. The overlap between these windows is defined by the window shifts. By default, it is set to 3, i.e., apart from flanking regions each position is covered by 3 windows.

Type: boolean Default: false

Indicate whether to move files produced in previous/interrupted runs to subdirectory named old


Example:

True
+

Type: string Default: ""

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 20

Type: string Default: "{VPIPE_BASEDIR}/envs/lofreq.yaml"

Type: object Default: {}

Type: integer Default: 10000

Type: integer

Type: integer Default: 2880

Type: string Default: "{VPIPE_BASEDIR}/envs/viloca.yaml"

Type: boolean Default: false

Indicate whether to use the cohort-consensus sequence from the analyzed samples (output from minor_variants rule located in the cohort-wide output results/cohort_consensus.fasta) or the reference sequence by setting this option to False.


Example:

False
+

Type: boolean Default: false

Merge paired-end reads in the preprocessing. This is a preprocessing snakemake rule.


Example:

False
+

Type: integer Default: 3

VILOCA performs local haplotype reconstruction on windows of the read alignment. The overlap between these windows is defined by the window shifts. By default, it is set to 3, i.e., apart from flanking regions each position is covered by 3 windows.

Type: string Default: "None"

VILOCA performs local haplotype reconstruction on windows of the read alignment. In a first step, the alignment is tiled into local regions. To tile uniformly, set this value to None; otherwise, provide the path to an insert file (primer tiling strategy).

Type: enum (of string) Default: "use_quality_scores"

Mode in which to run VILOCA: shorah, learn_error_params, or use_quality_scores. If quality scores are available, we recommend use_quality_scores.

Must be one of:

  • "shorah"
  • "learn_error_params"
  • "use_quality_scores"

Type: string Default: ""

Pass additional options to run viloca

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 60

Type: string Default: "{VPIPE_BASEDIR}/envs/lofreq.yaml"

Type: boolean Default: true

Indicate whether to use the cohort-consensus sequence from the analyzed samples (output from minor_variants rule located in the cohort-wide output results/cohort_consensus.fasta) or the reference sequence by setting this option to False.


Example:

False
+

Type: string Default: ""

Pass additional options to run lofreq call

Type: object Default: {}

Type: integer Default: 1000

Type: integer Default: 60

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: integer Default: 5

Type: object Default: {}

Type: string Default: "{VPIPE_BASEDIR}/envs/sam2bam.yaml"

Type: object Default: {}

NOTE This rule only works in Linux.

Type: integer Default: 10000

Type: integer Default: 1435

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/savage.yaml"

Type: integer Default: 20

Size of the batches of reads to be processed by SAVAGE. It is recommended that 500 < coverage/split < 1000.

Type: object Default: {}

Type: integer Default: 10000

Type: integer Default: 1435

Type: string Default: "{VPIPE_BASEDIR}/envs/haploclique.yaml"

Type: boolean Default: true

If set to True (default) a predefined set of parameter values is used for drawing edges between reads in the read graph.

Type: boolean Default: true

Singletons are defined as proposed haplotypes which are supported by a single read. If this property is set to True, singletons are discarded.

Type: boolean Default: true

If set to True (default) probability of the overhangs is ignored.

Type: integer Default: 3

Sets a threshold to limit the size of cliques.

Type: integer Default: 10000

Indicates the maximum number of cliques to be considered in the next iteration.

Type: string Default: ""

Additional parameters to be passed to haploclique.

Warning: this won’t overwrite the other options (e.g. clique_size_limit and max_num_cliques should still be set via their own respective properties; do not pass parameters --limit_clique_size= nor --max_cliques= via this extra property).

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: integer Default: 0

Use to specify a region of interest.

Type: integer Default: -1

Use to specify a region of interest.


Examples:

9719
+
29836
+

Type: string Default: ""

When the ground truth is available (e.g., simulation studies), a multiple sequence alignment of types making up the population can be provided, and additional checks are performed.

Type: object Default: {}

Type: integer Default: 10000

Type: integer Default: 1435

Type: integer

Type: integer Default: 0

Type: string Default: "{VPIPE_BASEDIR}/envs/predicthaplo.yaml"

Type: object Default: {}

Type: integer Default: 256

Type: integer Default: 10

Type: integer Default: 2

Minimal number of cooccurrences to search for in an amplicon. Lowering this property to 1 will make COJAC also look for amplicons with singleton mutations.


Example:

1
+

Type: object Default: {}

Type: integer Default: 8192

Type: integer Default: 45

Type: integer Default: 1

Type: string Default: "{VPIPE_BASEDIR}/envs/cojac.yaml"

Type: enum (of string) Default: "lines"

Format of the output CSV.

  • lines(default) - each amplicon a separate entry on a separate line.
  • columns - one column per amplicon

Must be one of:

  • "lines"
  • "columns"

Example:

columns
+

Type: object Default: {}

Type: integer Default: 256

Type: integer Default: 10

Type: object Default: {}

Type: integer Default: 4096

Type: integer Default: 15

Type: object Default: {}

This section is used to set up a timeline of the samples. Some outputs, e.g., deconvolution of quasispecies mixture using LolliPop, need to have a time component. By default it calls a script that uses regular expressions and look-up tables to extract this information from the samples’ own names. But by using the properties script and options and adapting the environment provided in property conda, it is possible to heavily customize the actions (e.g. it is possible to query an external database instead). For inspiration, see the default script file_parser.py.

Type: integer Default: 1024

Type: integer Default: 15

Type: string Default: "{VPIPE_BASEDIR}/envs/timeline.yaml"

The default environment only provides regular expression functions (python-regex) but depending on your needs you would want to provide a custom environment with additional tools (e.g. drivers to query a database, etc.)

Type: integer Default: 1

Type: boolean Default: true

Don’t dispatch the timeline rule to the cluster for execution, run locally.


Example:

False
 

Type: string Default: "{VPIPE_BASEDIR}/scripts/file_parser.py"

Script that sets up a timeline of the samples.

Its purpose is to take the V-pipe’s samples TSV file and add two columns:

  • location: location of the samples
  • date: sampling date of the samples

It will receive the following parameters (in addition to what is specified in property options):

  • --output <OUTPUT>: the output TSV file that must be created by the script.
  • <SAMPLE_TSV>: the input samples TSV file

For an example, see the default script file_parser.py: it uses regular expressions (regex) to parse the first two columns (sample and batch names) and extract a date and a location code that is then looked up in a table. It takes two additional parameters:

  • --locations <LOOKUP>: look-up TSV table mapping the code to full location names
  • --regex_yaml <YAML>: YAML file with regular expressions defining how the date and code are extracted.

Type: string Default: " --no-fallback"

Additional options to be passed to the script, e.g. for an extra configuration file with database server information.

By default, passes an option to the default script to force always using the regex (do not fall back to copy-pasting columns).

Type: string Default: ""

Option for the default script: TSV table that maps location codes (e.g. short alphanumeric codes) used in sample names to full names of locations (e.g. city names).

For example:

code    location
 10  Zürich (ZH)
 16  Genève (GE)
 Ba  Basel (BS)
-

Example:

wastewater_plants.tsv
-

Type: string Default: ""

Option for the default script: YAML file that defines how to parse time series information out of the columns of samples.tsv, e.g.:

sample: (?P<location>\d+)_(?P<year>20\d{2})_(?P<month>[01]?\d)_(?P<day>[0-3]?\d)
-datefmt: "%Y%m%d"
-
  • by default, samples.tsv’s first column (sample names) is used as-is for location codes and the second column (sequencing batch dates) is used for timeline’s date point.
  • YAML’s optional entry sample defines a regular expression to be applied on the first column (sample names)
  • YAML’s optional entry batch defines a regular expression to be applied on the second column (sequencing batch dates)
  • regular expression must define the following named capturing groups:
    • location used for the location codes
    • year, month, day used for the dates of the timeline
    • alternatively, if the date doesn’t use a format with year + month + day elements – e.g., date uses week number of the year – group date can be used to capture the whole date string
  • YAML’s optional entry datefmt gives a time format string to parse the date capturing group.

Type: object Default: {}

Type: integer Default: 4096

Type: integer Default: 30

Type: string Default: "{VPIPE_BASEDIR}/envs/xsv.yaml"

Type: string Default: ""

If set, this user-provided TSV file (e.g.: generated with an external tool, prior to running V-pipe) will be used for obtaining locations and dates – as needed by LolliPop – instead of generating results/timeline.tsv with the rule timeline.
This follows the following format (similar to the output of rule timeline):

sample  batch   reads   proto   location_code   date    location
+

Example:

wastewater_plants.tsv
+

Type: string Default: ""

Option for the default script: YAML file that defines how to parse time series information out of the columns of samples.tsv, e.g.:

sample: (?P<location>\d+)_(?P<year>20\d{2})_(?P<month>[01]?\d)_(?P<day>[0-3]?\d)
+datefmt: "%Y%m%d"
+
  • by default, samples.tsv’s first column (sample names) is used as-is for location codes and the second column (sequencing batch dates) is used for timeline’s date point.
  • YAML’s optional entry sample defines a regular expression to be applied on the first column (sample names)
  • YAML’s optional entry batch defines a regular expression to be applied on the second column (sequencing batch dates)
  • regular expression must define the following named capturing groups:

    • location used for the location codes

    • year, month, day used for the dates of the timeline

    • alternatively, if the date doesn’t use a format with year + month + day elements – e.g., date uses week number of the year – group date can be used to capture the whole date string

  • YAML’s optional entry datefmt gives a time format string to parse the date capturing group.

Type: object Default: {}

Type: integer Default: 4096

Type: integer Default: 30

Type: string Default: "{VPIPE_BASEDIR}/envs/xsv.yaml"

Type: string Default: ""

If set, this user-provided TSV file (e.g.: generated with an external tool, prior to running V-pipe) will be used for obtaining locations and dates – as needed by LolliPop – instead of generating results/timeline.tsv with the rule timeline.
This follows the following format (similar to the output of rule timeline):

sample  batch   reads   proto   location_code   date    location
 A1_05_2023_04_12    20230428_HNG5MDRX2  250 v41 5   2023-04-12  Lugano (TI)
 A2_10_2023_04_13    20230428_HNG5MDRX2  250 v41 10  2023-04-13  Zürich (ZH)
 A3_16_2023_04_14    20230428_HNG5MDRX2  250 v41 16  2023-04-14  Genève (GE)
 …
-
  • The extra columns location and date are necessary for LolliPop.
  • Columns sample, batch, reads and proto are simply the first four columns of samples.tsv
    • V-pipe only needs column sample and batch for now.

Type: object Default: {}

Type: integer Default: 4096

Type: integer Default: 240

Type: integer Default: 4

Type: string Default: "{VPIPE_BASEDIR}/envs/lollipop.yaml"

Type: string Default: "{VPIPE_BASEDIR}/../resources/cowwid/deconv_linear_logit_quasi_strat.yaml"

Configuration file with parameters for kernel deconvolution


Examples:

/git/lollipop/deconv_linear_logit_quasi_strat.yaml
-
/git/lollipop/deconv_linear_wald.yaml
-
/git/lollipop/deconv_bootstrap
-

Type: string Default: ""

Variants configuration used during deconvolution


Example:

var_conf.yaml
-

Type: string Default: ""

Variants to scan per periods (as determined with COJAC by leveraging the output of the cooc rule)


Example:

var_dates.yaml
-

Type: enum (of string) Default: "lines"

Format of the output CSV.

  • lines (default) - each variant a separate entry on a separate line.
  • columns - one column per variant

Must be one of:

  • "lines"
  • "columns"

Example:

columns
-

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/visualization.yaml"

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/diversity_measures.yaml"

Type: object Default: {}

Type: string Default: "{VPIPE_BASEDIR}/envs/dehuman.yaml"

Type: integer Default: 4096

Type: integer Default: 235

Type: integer Default: 4

Type: string Default: "references/human.fa.gz"

Host’s genome used to remove reads (e.g. human genome)

Note: if this file is absent, it is possible to fetch it from a remote server, see property ref_host_url below.


Example:

/cluster/project/igenomes/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/genome.fa
-

Type: string Default: "http://ftp.ensembl.org/pub/current_fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"

If the host’s genome specified in property ref_host isn’t present, fetch it from a remote server.

Note: remember to set aside enough memory for the indexing rule, see section ref_bwa_index property mem.


Examples:

http://ftp.ensembl.org/pub/release-105/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
-
https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.29_GRCh38.p14/GCA_000001405.29_GRCh38.p14_genomic.fna.gz
-

Type: boolean Default: false

Indicate whether to store the host-aligned reads in a CRAM file …/alignments/host_aln.cram.


Example:

True
-

Type: boolean Default: false

Use this option when generating dehumanized raw reads (dehuman.cram) on old samples that have already been processed in the past — a catch up.

Normally, removing host-mapping reads requires analyzing reads which were rejected by V-pipe’s main processing (as specified in section general, property aligner). But this output is considered temporary and will get deleted by Snakemake once the processing of a sample has finished. To generate dehuman.cram V-pipe would need to run the aligner again, which will both regenerate the data necessary for this output but also generate a new alignment which will trigger the whole workflow again.
Use this property catchup to only generate the input necessary for dehuman.cram, leaving untouched the alignment and everything else that has already been processed.


Example:

True
-

Type: object Default: {}

Type: integer Default: 256

Type: integer Default: 60

Type: string Default: "{VPIPE_BASEDIR}/envs/upload.yaml"

Type: object Default: {}

This section is used to assist and prepare uploads of the data, e.g. to European Nucleotide Archive. By default it calls a script that creates symlinks making it easy to identify new/updated samples between calls of V-pipe. But by using the properties script and options and adapting the environment provided in property conda, it is possible to heavily customize the actions (e.g. it is possible to upload to an SFTP server by calling sftp from a modified script). For inspiration, see the default script prepare_upload_symlinks.sh.

Type: integer Default: 256

Type: integer Default: 60

Type: string Default: "{VPIPE_BASEDIR}/envs/upload.yaml"

The default environment only provides hashing functions (xxhash, linux coreutils’ sha{nnn}sum collection, etc.) but depending on your needs you would want to provide a custom environment with additional tools (e.g. sftp, rsync, curl, lftp, custom specialized cloud uploaders, etc.)

Type: integer Default: 1

Type: boolean Default: true

Don’t dispatch the rule to the cluster for execution, run locally.


Example:

False
-

Type: enum (of string) Default: "ambig"

When preparing data for upload, specifies which consensus sequence should be uploaded.

Must be one of:

  • "ambig"
  • "majority"

Example:

majority
-

Type: boolean Default: false

Generate checksum for each individual consensus sequence (if a consensus is regenerated, it will help determine whether the new file has changed content or is virtually the same as the previous).


Example:

True
-

Type: boolean Default: false

Also include the original .fastq.gz sequencing reads files from raw_data/ in the list of files to be uploaded. See property orig_cram below for a compressed version and see output dehumanized_raw_reads and section dehuman for depleting reads from the host.


Example:

True
-

Type: boolean Default: false

Also include a compressed version of the original sequencing raw reads files from raw_data/. Similar to property orig_fastq above, but with reference-based compression.


Example:

True
-

Type: string Default: "{VPIPE_BASEDIR}/scripts/prepare_upload_symlinks.sh"

Custom script that assists and prepares uploads.

It will receive the following positional parameters:

  • <OUTPUT>: the output file that must be created by the script.
  • <SAMPLE_ID>: a string (with no path separator slashes) that can be used as a name, uniquely identifying the sample and the date.
  • <SAMPLE_DIR>: the base directory of the sample.
  • <UPLOAD_FILES>…: a list of files to consider for upload

For an example, see the default script prepare_upload_symlinks.sh, it generates symlinks that help tracking which samples are new and/or updated between runs of V-pipe and thus should be considered for upload.

Type: string Default: ""

Named options to be passed to the script, before the positional parameters. E.g. for an extra configuration file with SFTP server information.

\ No newline at end of file +
  • The extra columns location and date are necessary for LolliPop.
  • Columns sample, batch, reads and proto are simply the first four columns of samples.tsv

    • V-pipe only needs column sample and batch for now.

Type: object Default: {}

Type: integer Default: 4096

Type: integer Default: 240

Type: integer Default: 4

Type: string Default: "{VPIPE_BASEDIR}/envs/lollipop.yaml"

Type: string Default: "{VPIPE_BASEDIR}/../resources/cowwid/deconv_linear_logit_quasi_strat.yaml"

Configuration file with parameters for kernel deconvolution


Examples:

/git/lollipop/deconv_linear_logit_quasi_strat.yaml
+
/git/lollipop/deconv_linear_wald.yaml
+
/git/lollipop/deconv_bootstrap
+

Type: string Default: ""

Variants configuration used during deconvolution


Example:

var_conf.yaml
+

Type: string Default: ""

Variants to scan per periods (as determined with COJAC by leveraging the output of the cooc rule)


Example:

var_dates.yaml
+

Type: enum (of string) Default: "lines"

Format of the output CSV.

  • lines (default) - each variant a separate entry on a separate line.
  • columns - one column per variant

Must be one of:

  • "lines"
  • "columns"

Example:

columns
+

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/visualization.yaml"

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/diversity_measures.yaml"

Type: object Default: {}

Type: string Default: "{VPIPE_BASEDIR}/envs/dehuman.yaml"

Type: integer Default: 4096

Type: integer Default: 235

Type: integer Default: 4

Type: string Default: "references/human.fa.gz"

Host’s genome used to remove reads (e.g. human genome)

Note: if this file is absent, it is possible to fetch it from a remote server, see property ref_host_url below.


Example:

/cluster/project/igenomes/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/genome.fa
+

Type: string Default: "http://ftp.ensembl.org/pub/current_fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"

If the host’s genome specified in property ref_host isn’t present, fetch it from a remote server.

Note: remember to set aside enough memory for the indexing rule, see section ref_bwa_index property mem.


Examples:

http://ftp.ensembl.org/pub/release-105/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
+
https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.29_GRCh38.p14/GCA_000001405.29_GRCh38.p14_genomic.fna.gz
+

Type: boolean Default: false

Indicate whether to store the host-aligned reads in a CRAM file …/alignments/host_aln.cram.


Example:

True
+

Type: boolean Default: false

Use this option when generating dehumanized raw reads (dehuman.cram) on old samples that have already been processed in the past — a catch up.

Normally, removing host-mapping reads requires analyzing reads which were rejected by V-pipe’s main processing (as specified in section general, property aligner). But this output is considered temporary and will get deleted by Snakemake once the processing of a sample has finished. To generate dehuman.cram V-pipe would need to run the aligner again, which will both regenerate the data necessary for this output but also generate a new alignment which will trigger the whole workflow again.
Use this property catchup to only generate the input necessary for dehuman.cram, leaving untouched the alignment and everything else that has already been processed.


Example:

True
+

Type: object Default: {}

Type: integer Default: 256

Type: integer Default: 60

Type: string Default: "{VPIPE_BASEDIR}/envs/upload.yaml"

Type: object Default: {}

This section is used to assist and prepare uploads of the data, e.g. to European Nucleotide Archive. By default it calls a script that creates symlinks making it easy to identify new/updated samples between calls of V-pipe. But by using the properties script and options and adapting the environment provided in property conda, it is possible to heavily customize the actions (e.g. it is possible to upload to an SFTP server by calling sftp from a modified script). For inspiration, see the default script prepare_upload_symlinks.sh.

Type: integer Default: 256

Type: integer Default: 60

Type: string Default: "{VPIPE_BASEDIR}/envs/upload.yaml"

The default environment only provides hashing functions (xxhash, linux coreutils’ sha{nnn}sum collection, etc.) but depending on your needs you would want to provide a custom environment with additional tools (e.g. sftp, rsync, curl, lftp, custom specialized cloud uploaders, etc.)

Type: integer Default: 1

Type: boolean Default: true

Don’t dispatch the rule to the cluster for execution, run locally.


Example:

False
+

Type: enum (of string) Default: "ambig"

When preparing data for upload, specifies which consensus sequence should be uploaded.

Must be one of:

  • "ambig"
  • "majority"

Example:

majority
+

Type: boolean Default: false

Generate checksum for each individual consensus sequence (if a consensus is regenerated, it will help determine whether the new file has changed content or is virtually the same as the previous).


Example:

True
+

Type: boolean Default: false

Also include the original .fastq.gz sequencing reads files from raw_data/ in the list of files to be uploaded. See property orig_cram below for a compressed version and see output dehumanized_raw_reads and section dehuman for depleting reads from the host.


Example:

True
+

Type: boolean Default: false

Also include a compressed version of the original sequencing raw reads files from raw_data/. Similar to property orig_fastq above, but with reference-based compression.


Example:

True
+

Type: string Default: "{VPIPE_BASEDIR}/scripts/prepare_upload_symlinks.sh"

Custom script that assists and prepares uploads.

It will receive the following positional parameters:

  • <OUTPUT>: the output file that must be created by the script.
  • <SAMPLE_ID>: a string (with no path separator slashes) that can be used as a name, uniquely identifying the sample and the date.
  • <SAMPLE_DIR>: the base directory of the sample.
  • <UPLOAD_FILES>…: a list of files to consider for upload

For an example, see the default script prepare_upload_symlinks.sh, it generates symlinks that help tracking which samples are new and/or updated between runs of V-pipe and thus should be considered for upload.

Type: string Default: ""

Named options to be passed to the script, before the positional parameters. E.g. for an extra configuration file with SFTP server information.

\ No newline at end of file From 24e1afeaaf4eeb73d2376286257d0645e676bc7b Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Fri, 7 Jun 2024 19:30:51 +0200 Subject: [PATCH 14/26] Documentation touch ups - update paper: V-pipe 3.0 pre-print - WorkflowHub's required documentation.md points to our documentation - Update Tutorial intro page: point to installer --- README.md | 6 +++--- docs/README.md | 14 +++++++++++--- workflow/documentation.md | 20 ++++++++++++++++++++ 3 files changed, 34 insertions(+), 6 deletions(-) create mode 100644 workflow/documentation.md diff --git a/README.md b/README.md index 4eb3cbac..e961012c 100644 --- a/README.md +++ b/README.md @@ -162,9 +162,9 @@ Other dependencies are managed by using isolated conda environments per rule, an If you use this software in your research, please cite: -Posada-Céspedes S., Seifert D., Topolsky I., Jablonski K.P., Metzner K.J., and Beerenwinkel N. 2021. -"V-pipe: a computational pipeline for assessing viral genetic diversity from high-throughput sequencing data." -_Bioinformatics_, January. doi:[10.1093/bioinformatics/btab015](https://doi.org/10.1093/bioinformatics/btab015). +Fuhrmann, L., Jablonski, K. P., Topolsky, I., Batavia, A. A., Borgsmueller, N., Icer Baykal, P., Carrara, M. ... & Beerenwinkel, (2023). +"V-Pipe 3.0: A Sustainable Pipeline for Within-Sample Viral Genetic Diversity Estimation." +_bioRxiv_, doi:[10.1101/2023.10.16.562462](https://doi.org/10.1101/2023.10.16.562462). 
## Contributions diff --git a/docs/README.md b/docs/README.md index bc4d9196..e8a82d43 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,9 +1,17 @@ # Tutorials -You can find two tutorials in this directory: +We strongly advise our users to start discovering V-pipe by looking at the tutorials -- [tutorial_hiv.md](tutorial_hiv.md): uses HIV test data -- [tutorial_sarscov2.md](tutorial_sarscov2.md): uses SARS-CoV-2 data from a publication +You can find several tutorials in this directory: + +## Getting V-pipe installed + +- [V-pipe Installation](https://github.com/cbg-ethz/V-pipe/blob/master/docs/tutorial_0_install.md) + +## Viruses + +- [V-Pipe HIV Tutorial](https://github.com/cbg-ethz/V-pipe/blob/master/docs/tutorial_hiv.md): uses HIV test data +- [SARS-CoV-2 Tutorial](https://github.com/cbg-ethz/V-pipe/blob/master/docs/tutorial_sarscov2.md): uses SARS-CoV-2 data from a publication ## Note about the tutorials diff --git a/workflow/documentation.md b/workflow/documentation.md new file mode 100644 index 00000000..fbf4523a --- /dev/null +++ b/workflow/documentation.md @@ -0,0 +1,20 @@ +# Documentation + +As specified in the [usage section](../README.md#usage) of the main README file, these are the steps you need to perform to use V-pipe. + +To configure V-pipe refer to the documentation present in [config/README.md](../config/README.md). + +V-pipe expects the input samples to be organized in a [two-level](../config/README.md#samples) directory hierarchy, and the sequencing reads must be provided in a sub-folder named `raw_data`. +Check the utils subdirectory for [mass-importers tools](../utils/README.md#samples-mass-importers) that can assist you in generating this hierarchy. + +We provide [virus-specific base configuration files](../config/README.md#virus-base-config) which contain handy defaults for, e.g., HIV and SARS-CoV-2. 
Set the virus in the general section of the configuration file: +```yaml +general: + virus_base_config: hiv +``` + +## Tutorials + +If you want to test your new V-pipe installation, we strongly encourage you to check our tutorials which provide real example data. + +Tutorials for your first steps with V-pipe for different scenarios are available in the [docs/](../docs/README.md) subdirectory. From bb4af162ca2e100b03068e2b266f8df4ff85ce5a Mon Sep 17 00:00:00 2001 From: Prajwal Kulkarni Date: Mon, 29 Apr 2024 10:35:20 +0000 Subject: [PATCH 15/26] improve tutorial_0_install.md --- docs/README.md | 6 +++--- docs/tutorial_0_install.md | 14 +++++++------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/README.md b/docs/README.md index e8a82d43..bed52a0d 100644 --- a/docs/README.md +++ b/docs/README.md @@ -15,16 +15,16 @@ You can find several tutorials in this directory: ## Note about the tutorials -Due to automated texting, each copy-pastable block begins with a command entering the directory and ends with on leaving the directory: +Due to automated testing, each copy-pastable block begins with a command entering the directory and ends with one leaving the directory: ```bash cd tutorial/work/ # do something cd ../.. ``` -Of course you don't necessarily need to do that. You can simply remain in the working directory. +Of course, you don't necessarily need to do that. You can simply remain in the working directory. -When editing files like `config.yaml`, you can use your favorite editor (`vim`, `emacs`, `nano`, [butterflies](https://xkcd.com/378/), etc.). By default our tutorials use a [_heredoc_](https://en.wikipedia.org/wiki/Here_document) to make it easier to copy-paste the blocks into bash: +When editing files like `config.yaml`, you can use your favorite editor (`vim`, `emacs`, `nano`, [butterflies](https://xkcd.com/378/), etc.). 
By default, our tutorials use a [_heredoc_](https://en.wikipedia.org/wiki/Here_document) to make it easier to copy-paste the blocks into bash: ```bash cat > config.yaml < # V-Pipe Installation -V-pipe is a workflow designed for the analysis of next generation sequencing (NGS) data from viral pathogens. It produces a number of results in a curated format (e.g., consensus sequences, SNV calls, local/global haplotypes). V-pipe is written using the Snakemake workflow management system. +V-pipe is a workflow designed for the analysis of next-generation sequencing (NGS) data from viral pathogens. It produces a number of results in a curated format (e.g., consensus sequences, SNV calls, local/global haplotypes). V-pipe is written using the Snakemake workflow management system. -The present tutorial will show you how to install V-pipe and the dependencies required to start using it - bioconda, conda-froge mamba and snakemake - before continuing with other tutorials and analyse virus data. +The present tutorial will show you how to install V-pipe and the dependencies required to start using it - bioconda, conda-froge mamba and snakemake - before continuing with other tutorials and analysing virus data. ## Requirements @@ -42,8 +42,8 @@ We will organise our software in the following tree structure, which will be reu - `vp-analysis` is the main directory where we will store everything. - `Miniforge3` is the directory where conda will be installed including the dependencies to start using V-pipe. -- `V-pipe` is the directory where V-pipe's own code will be downloaded from GitHub -- finally, each analysis of virus data will be performed into directory like `work…`, which holds the configuration and the sequencing data for that particular analysis. 
+- `V-pipe` is the directory where V-pipe's code will be downloaded from GitHub +- finally, each analysis of virus data will be performed in a directory like `work…`, which holds the configuration and the sequencing data for that particular analysis. ## Install V-pipe and conda from scratch @@ -52,7 +52,7 @@ V-pipe uses the [Bioconda](https://bioconda.github.io/) bioinformatics software For advanced users: If your are fluent with these tools, see [below](#fluent-users) -In this present short tutorial you will learn how to setup a workflow for the various examples in the analysis tutorials. +In this short tutorial, you will learn how to setup a workflow for the various examples in the analysis tutorials. To deploy V-pipe, you can use the installation script with the following parameters: @@ -69,7 +69,7 @@ bash quick_install.sh -p vp-analysis -w work If you get `zsh: permission denied: ./quick_install.sh`, run `chmod +x quick_install.sh` this gives the necessary permissions. -**Tip:** To create and populate other new working directories, you can call init_project.sh from within the new directory: +**Tip:** To create and populate other new working directories, you can call `init_project.sh` from within the new directory: ```bash cd vp-analysis/ @@ -85,7 +85,7 @@ cd - ### Analyse data -Now that you have setup the software necessary to start using V-pipe, you can follow with one of the tutorials showing you analysis of viral sequencing data: +Now that you have setup the software necessary to start using V-pipe, you can follow with one of the tutorials showing you the analysis of viral sequencing data: - [tutorial_hiv.md](tutorial_hiv.md): uses HIV test data - [tutorial_sarscov2.md](tutorial_sarscov2.md): uses SARS-CoV-2 data from a publication From d840e801a2477922aa7e3382928fff1f67671c3a Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Fri, 7 Jun 2024 19:50:59 +0200 Subject: [PATCH 16/26] Links to bioconda in installation tutorial --- 
docs/tutorial_0_install.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorial_0_install.md b/docs/tutorial_0_install.md index db564a8a..f62061f4 100644 --- a/docs/tutorial_0_install.md +++ b/docs/tutorial_0_install.md @@ -95,7 +95,7 @@ Now that you have setup the software necessary to start using V-pipe, you can fo For advanced users: If your are fluent with these tools, you can: -* directly download and install [bioconda](https://bioconda.github.io/user/install.html) and [snakemake](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html#installation-via-conda), +* directly download and install [Miniforge3](https://github.com/conda-forge/miniforge#Download), setup [bioconda](https://bioconda.github.io/index.html#usage) and install [snakemake](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html#installation-via-conda), * specifiy your V-pipe configuration, and start using V-pipe Use `--use-conda` to [automatically download and install](https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#integrated-package-management) any further pipeline dependencies. Please refer to the documentation for additional instructions. 
From b349a5157581fcd6026d45dd2b511eafd7a87e71 Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Fri, 7 Jun 2024 21:27:03 +0200 Subject: [PATCH 17/26] Singularity+conda support --- .snakemake-workflow-catalog.yml | 2 +- workflow/Snakefile | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.snakemake-workflow-catalog.yml b/.snakemake-workflow-catalog.yml index 0b5f012b..acc7d8e1 100644 --- a/.snakemake-workflow-catalog.yml +++ b/.snakemake-workflow-catalog.yml @@ -2,5 +2,5 @@ usage: software-stack-deployment: conda: true singularity: false - singularity+conda: false + singularity+conda: true report: false diff --git a/workflow/Snakefile b/workflow/Snakefile index 119ee07a..2bd8af3d 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -1,3 +1,6 @@ +container: "docker://snakemake/snakemake:v7.32.4" + + import logging LOGGER = logging.getLogger("snakemake.logging") From 3f3292e0a8ea81fa606d006def615687e460e1ca Mon Sep 17 00:00:00 2001 From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com> Date: Mon, 10 Jun 2024 14:42:01 +0200 Subject: [PATCH 18/26] Update README.md with viloca reference --- .../benchmark/resources/local_haplotype_setup/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/README.md b/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/README.md index 01461f08..85ae7ec1 100644 --- a/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/README.md +++ b/resources/auxiliary_workflows/benchmark/resources/local_haplotype_setup/README.md @@ -1,6 +1,6 @@ # Local haplotype reconstruction benchmark -This repository stores the scripts and notebooks used to conduct the benchmark study presented in the manuscript: XXX +This repository stores the scripts and notebooks used to conduct the benchmark study presented in the manuscript: https://www.biorxiv.org/content/10.1101/2024.06.06.597712v1 
To reproduce the benchmark study and create the figures presented in the manuscript, use the following instructions: 1. Clone the repository of V-pipe 3.0 into your working directory: From e8360b83e912f10cae562e0a371d1acd5a3caa7b Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Mon, 10 Jun 2024 14:53:08 +0200 Subject: [PATCH 19/26] Add issue templates --- .github/ISSUE_TEMPLATE/01_report_bug.md | 28 ++++++++++++++++++++ .github/ISSUE_TEMPLATE/02_request_feature.md | 17 ++++++++++++ .github/ISSUE_TEMPLATE/03_ask.md | 5 ++++ .github/ISSUE_TEMPLATE/04_discuss.md | 5 ++++ .github/ISSUE_TEMPLATE/config.yml | 1 + 5 files changed, 56 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/01_report_bug.md create mode 100644 .github/ISSUE_TEMPLATE/02_request_feature.md create mode 100644 .github/ISSUE_TEMPLATE/03_ask.md create mode 100644 .github/ISSUE_TEMPLATE/04_discuss.md create mode 100644 .github/ISSUE_TEMPLATE/config.yml diff --git a/.github/ISSUE_TEMPLATE/01_report_bug.md b/.github/ISSUE_TEMPLATE/01_report_bug.md new file mode 100644 index 00000000..3e9e9522 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/01_report_bug.md @@ -0,0 +1,28 @@ +--- +name: 🐛 Report a bug +about: Tell us if something is broken or needs attention +labels: t:bug, help wanted, good first issue, needs triage +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior, e.g.: +1. V-pipe configuration file used '...' +2. Samples TSV file used '....' +3. Commands executed '.vpipe --core 4 ...' +4. See error + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Screenshots** +If applicable, add screenshots to help explain your problem. + +**Desktop (please complete the following information):** + - OS: [e.g. Linux, Mac OS] + - Version [e.g. v3.0.0, master branch] + +**Additional context** +Add any other context about the problem here. 
diff --git a/.github/ISSUE_TEMPLATE/02_request_feature.md b/.github/ISSUE_TEMPLATE/02_request_feature.md new file mode 100644 index 00000000..2bcedb8f --- /dev/null +++ b/.github/ISSUE_TEMPLATE/02_request_feature.md @@ -0,0 +1,17 @@ +--- +name: 🙋 Request a feature +about: Suggest an improvement, tell about your idea +labels: t:feat, help wanted, good first issue, needs triage +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when ... + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. diff --git a/.github/ISSUE_TEMPLATE/03_ask.md b/.github/ISSUE_TEMPLATE/03_ask.md new file mode 100644 index 00000000..3dceabc5 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/03_ask.md @@ -0,0 +1,5 @@ +--- +name: ❓ Ask +about: Ask a question about this project +labels: t:ask, needs triage +--- diff --git a/.github/ISSUE_TEMPLATE/04_discuss.md b/.github/ISSUE_TEMPLATE/04_discuss.md new file mode 100644 index 00000000..ec714d61 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/04_discuss.md @@ -0,0 +1,5 @@ +--- +name: 💬 Discuss +about: Talk to the team +labels: t:talk, needs triage +--- diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 00000000..3ba13e0c --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1 @@ +blank_issues_enabled: false From 1f589adeae83a8ef4ced753bd7803aaae4556e5d Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Mon, 10 Jun 2024 15:11:58 +0200 Subject: [PATCH 20/26] Contributing guidelines --- CONTRIBUTING.md | 57 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 
CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..367aad01 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,57 @@ + +# Contributing to V-pipe + +A big welcome and thank you for considering contributing to V-pipe! It’s people like you that make it a reality for users in our community. + +Reading and following these guidelines will help us make the contribution process easy and effective for everyone involved. It also communicates that you agree to respect the time of the developers managing and developing these open source projects. In return, we will reciprocate that respect by addressing your issue, assessing changes, and helping you finalize your pull requests. + +## Quicklinks + +* [Getting Started](#getting-started) + * [Issues](#issues) + * [Pull Requests](#pull-requests) +* [Getting Help](#getting-help) + +## Getting Started + +Contributions are made to this repo via Issues and Pull Requests (PRs). A few general guidelines that cover both: + +- Search for existing Issues and PRs before creating your own. +- We work hard to makes sure issues are handled in a timely manner but, depending on the impact, it could take a while to investigate the root cause. A friendly ping in the comment thread to the submitter or a contributor can help draw attention if your issue is blocking. + +### Issues + +Issues should be used to report problems with the V-pipe workflow, request a new feature, or to discuss potential changes before a PR is created. When you create a new Issue, a template will be loaded that will guide you through collecting and providing the information we need to investigate. + +If you find an Issue that addresses the problem you're having, please add your own reproduction information to the existing issue rather than creating a new one. 
Adding a [reaction](https://github.blog/2016-03-10-add-reactions-to-pull-requests-issues-and-comments/) can also help by indicating to our maintainers that a particular problem is affecting more than just the reporter. + +### Pull Requests + +PRs to our workflow are always welcome and can be a quick way to get your fix or improvement slated for the next release. In general, PRs should: + +- Target our staging branch: [rubicon](https://github.com/cbg-ethz/V-pipe/tree/rubicon) +- Only fix/add the functionality in question **OR** address wide-spread whitespace/style issues, not both. +- Add unit or integration tests for fixed or changed functionality (if a test suite already exists). + - Or at least provide a minimalist example dataset +- Address a single concern in as few changed lines as possible. +- Include documentation in the repo or on our `docs/` directory. + +For changes that address core functionality or would require breaking changes (e.g. a major release), it's best to open an Issue to discuss your proposal first. This is not required but can save time creating and reviewing changes. + +In general, we follow the ["fork-and-pull" Git workflow](https://github.com/susam/gitpr) + +1. Fork the repository to your own Github account +2. Clone the project to your machine +3. Create a branch locally with a succinct but descriptive name +4. Commit changes to the branch +5. Follow any formatting and testing guidelines specific to this repo + - We rely on [snakefmt](https://github.com/snakemake/snakefmt) for Snakemake files + - We use [Mega-Linter](https://megalinter.io) for the remaining files (Python (Black), Jupyter (Jupyfmt), Markdown (Markdownlint), Bash (Shellcheck), Perl (Perlcritic), Docker (Hadolint)) + - Ask us for help if you have trouble linting your code +6. Push changes to your fork +7. Open a PR in our repository and follow the PR template so that we can efficiently review the changes. 
+ +## Getting Help + +Join us in the [V-pipe Gitter channel](https://gitter.im/V-pipe/community) (also [accessible over matrix](https://matrix.to/#/#V-pipe_community:gitter.im?utm_source=gitter) from your favorite client) and post your question there to reach out the devs. +For further inquiries, you can also contact the V-pipe Dev Team by opening a ticket at [v-pipe@bsse.ethz.ch](mailto:v-pipe@bsse.ethz.ch). From 7013890976e7e8a719ec369bfffef8ddb2963e14 Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Mon, 10 Jun 2024 15:59:52 +0200 Subject: [PATCH 21/26] Disable linter for templates --- .mega-linter.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.mega-linter.yml b/.mega-linter.yml index f76eb615..1de43e4e 100644 --- a/.mega-linter.yml +++ b/.mega-linter.yml @@ -20,3 +20,5 @@ SHOW_ELAPSED_TIME: true DEFAULT_BRANCH: master # any further fix to LICENSE.md will break licensee's detection scheme MARKDOWN_MARKDOWNLINT_FILTER_REGEX_EXCLUDE: LICENSE\.md +# GitHub passes HTML comments verbatime from template so we can't use +FILTER_REGEX_EXCLUDE: (\.github/ISSUE_TEMPLATE/.*\.md) From 9be30daea00e72abab447b2dafdfea224388cab3 Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Fri, 14 Jun 2024 12:14:13 +0200 Subject: [PATCH 22/26] bugfix: viloca - typo fix - all: depends on correct CSV --- workflow/rules/common.smk | 11 ++++++++++- workflow/rules/snv.smk | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index d0b34108..c6724d88 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -685,11 +685,20 @@ for srec in sample_list: # in adition to standard VCF files, ShoRAH2 also produces CSV tables if config.general["snv_caller"] == "shorah": results.append(os.path.join(sdir, "variants/SNVs/snvs.csv")) + elif config.general["snv_caller"] == "viloca": + results.append( + os.path.join(sdir, "variants/SNVs/snv/cooccurring_mutations.csv") + ) # all snv callers 
('shorah', 'lofreq') produce standard VCF files results.append(os.path.join(sdir, "variants/SNVs/snvs.vcf")) # local haplotypes if config.output["local"]: - results.append(os.path.join(sdir, "variants/SNVs/snvs.csv")) + if config.general["snv_caller"] == "shorah": + results.append(os.path.join(sdir, "variants/SNVs/snvs.csv")) + elif config.general["snv_caller"] == "viloca": + results.append( + os.path.join(sdir, "variants/SNVs/snv/cooccurring_mutations.csv") + ) # global haplotypes if config.output["global"]: if config.general["haplotype_reconstruction"] == "savage": diff --git a/workflow/rules/snv.smk b/workflow/rules/snv.smk index ca00df27..2c94be8f 100644 --- a/workflow/rules/snv.smk +++ b/workflow/rules/snv.smk @@ -377,7 +377,7 @@ rule viloca: # Get absolute path for input files CWD=${{PWD}} - WORK_DIR="$(realpath -m {ouput.WORK_DIR})" + WORK_DIR="$(realpath -m {output.WORK_DIR})" BAM="$(realpath {input.BAM})" REF="$(realpath {input.REF})" OUTFILE="$(realpath -m {log.outfile})" From 5befca46b0a8a5f57260731f163e84f2d4ce4f63 Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Fri, 28 Jun 2024 23:05:03 +0200 Subject: [PATCH 23/26] bugfix: bump versions of bcftools and cyvcf2 - recent change of numpy in bioconda breaks older cyvcf --- workflow/envs/bcftools.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/envs/bcftools.yaml b/workflow/envs/bcftools.yaml index 4883ab8d..10c494c2 100644 --- a/workflow/envs/bcftools.yaml +++ b/workflow/envs/bcftools.yaml @@ -2,5 +2,5 @@ channels: - conda-forge - bioconda dependencies: - - bcftools = 1.13 - - cyvcf2 = 0.30.11 + - bcftools = 1.20 + - cyvcf2 = 0.31.0 From b29c0a5464bf452377259959139dd843629d77c2 Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Fri, 5 Jul 2024 19:11:54 +0200 Subject: [PATCH 24/26] [bump] LolliPop 0.4.0 - support new `filters` option --- workflow/envs/lollipop.yaml | 2 +- workflow/rules/signatures.smk | 5 ++++- workflow/schemas/config_schema.json | 6 
++++++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/workflow/envs/lollipop.yaml b/workflow/envs/lollipop.yaml index 7f42ad1b..3de87525 100644 --- a/workflow/envs/lollipop.yaml +++ b/workflow/envs/lollipop.yaml @@ -2,4 +2,4 @@ channels: - conda-forge - bioconda dependencies: - - lollipop =0.3.0 + - lollipop =0.4.0 diff --git a/workflow/rules/signatures.smk b/workflow/rules/signatures.smk index 78b03a16..98ad814e 100644 --- a/workflow/rules/signatures.smk +++ b/workflow/rules/signatures.smk @@ -320,6 +320,9 @@ rule deconvolution: if config.deconvolution["variants_dates"] else [] ), + filters=( + config.deconvolution["filters"] if config.deconvolution["filters"] else [] + ), output: deconvoluted=cohortdir("deconvoluted.tsv.zst"), deconv_json=cohortdir("deconvoluted_upload.json"), @@ -343,7 +346,7 @@ rule deconvolution: threads: config.deconvolution["threads"] shell: """ - {params.LOLLIPOP} deconvolute "--output={output.deconvoluted}" "--out-json={output.deconv_json}" "--var={input.var_conf}" "--vd={input.var_dates}" "--dec={input.deconv_conf}" {params.out_format} {params.seed} "{input.tallymut}" 2> >(tee -a {log.errfile} >&2) > >(tee -a {log.outfile}) + {params.LOLLIPOP} deconvolute "--output={output.deconvoluted}" "--out-json={output.deconv_json}" "--var={input.var_conf}" "--vd={input.var_dates}" "--dec={input.deconv_conf}" "--filters={input.filters}" {params.out_format} {params.seed} "{input.tallymut}" 2> >(tee -a {log.errfile} >&2) > >(tee -a {log.outfile}) """ diff --git a/workflow/schemas/config_schema.json b/workflow/schemas/config_schema.json index 797c9a3b..fed024ea 100644 --- a/workflow/schemas/config_schema.json +++ b/workflow/schemas/config_schema.json @@ -1661,6 +1661,12 @@ "default": "lines", "description": "Format of the output CSV.\n- `lines`(default) - each variants a separate entry on a separate line.\n- `columns` - one column per variant", "examples": ["columns"] + }, + "filters": { + "type": "string", + "default": "", + 
"description": "List of filters for removing problematic mutations from tally. Some mutations might be problematic and need to be taken out -- e.g. due to drop-outs in the multiplex PCR amplification, they do not show up in the data and this could be misinterpreted by LolliPop as proof of absence of a variant.", + "examples": ["filters_preprint.yaml"] } }, "default": {}, From 62df0408e4a88b45447c8799a3de3d8b0de3e04a Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Fri, 5 Jul 2024 19:33:29 +0200 Subject: [PATCH 25/26] Update of conf manual --- config/config.html | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/config/config.html b/config/config.html index d3b31209..e0dc6e21 100644 --- a/config/config.html +++ b/config/config.html @@ -177,6 +177,7 @@

Type: string Default: ""

Variants configuration used during deconvolution


Example:

var_conf.yaml
 

Type: string Default: ""

Variants to scan per period (as determined with COJAC by leveraging the output of the cooc rule)


Example:

var_dates.yaml
 

Type: enum (of string) Default: "lines"

Format of the output CSV.

  • lines (default) - each variant is a separate entry on a separate line.
  • columns - one column per variant

Must be one of:

  • "lines"
  • "columns"

Example:

columns
+

Type: string Default: ""

List of filters for removing problematic mutations from tally. Some mutations might be problematic and need to be taken out – e.g. due to drop-outs in the multiplex PCR amplification, they do not show up in the data and this could be misinterpreted by LolliPop as proof of absence of a variant.


Example:

filters_preprint.yaml
 

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/visualization.yaml"

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/diversity_measures.yaml"

Type: object Default: {}

Type: string Default: "{VPIPE_BASEDIR}/envs/dehuman.yaml"

Type: integer Default: 4096

Type: integer Default: 235

Type: integer Default: 4

Type: string Default: "references/human.fa.gz"

Host’s genome used to remove reads (e.g. human genome)

Note: if this file is absent, it is possible to fetch it from a remote server, see property ref_host_url below.


Example:

/cluster/project/igenomes/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/genome.fa
 

Type: string Default: "http://ftp.ensembl.org/pub/current_fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"

If the host’s genome specified in property ref_host isn’t present, fetch it from a remote server.

Note: remember to set aside enough memory for the indexing rule, see section ref_bwa_index property mem.


Examples:

http://ftp.ensembl.org/pub/release-105/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
 
https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.29_GRCh38.p14/GCA_000001405.29_GRCh38.p14_genomic.fna.gz
@@ -187,4 +188,4 @@
 

Type: boolean Default: false

Generate checksum for each individual consensus sequence (if a consensus is regenerated, it will help determine whether the new file has changed content or is virtually the same as the previous).


Example:

True
 

Type: boolean Default: false

Also include the original .fastq.gz sequencing reads files from raw_data/ in the list of files to be uploaded. See property orig_cram below for a compressed version and see output dehumanized_raw_reads and section dehuman for depleting reads from the host.


Example:

True
 

Type: boolean Default: false

Also include a compressed version of the original sequencing raw reads files from raw_data/. Similar to property orig_fastq above, but with reference-based compression.


Example:

True
-

Type: string Default: "{VPIPE_BASEDIR}/scripts/prepare_upload_symlinks.sh"

Custom script that assists and prepares uploads.

It will receive the following positional parameters:

  • <OUTPUT>: the output file that must be created by the script.
  • <SAMPLE_ID>: a string (with no path separator slashes) that can be used as a name, uniquely identifying the sample and the date.
  • <SAMPLE_DIR>: the base directory of the sample.
  • <UPLOAD_FILES>…: a list of files to consider for upload

For an example, see the default script prepare_upload_symlinks.sh: it generates symlinks that help track which samples are new and/or updated between runs of V-pipe and thus should be considered for upload.

Type: string Default: ""

Named options to be passed to the script, before the positional parameters. E.g. for an extra configuration file with SFTP server information.

\ No newline at end of file +

Type: string Default: "{VPIPE_BASEDIR}/scripts/prepare_upload_symlinks.sh"

Custom script that assists and prepares uploads.

It will receive the following positional parameters:

  • <OUTPUT>: the output file that must be created by the script.
  • <SAMPLE_ID>: a string (with no path separator slashes) that can be used as a name, uniquely identifying the sample and the date.
  • <SAMPLE_DIR>: the base directory of the sample.
  • <UPLOAD_FILES>…: a list of files to consider for upload

For an example, see the default script prepare_upload_symlinks.sh: it generates symlinks that help track which samples are new and/or updated between runs of V-pipe and thus should be considered for upload.

Type: string Default: ""

Named options to be passed to the script, before the positional parameters. E.g. for an extra configuration file with SFTP server information.

\ No newline at end of file From 70376758b506122d7c476925e196e982c32d735f Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Sat, 6 Jul 2024 14:30:23 +0200 Subject: [PATCH 26/26] [bump] LolliPop v0.4.1 --- workflow/envs/lollipop.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/envs/lollipop.yaml b/workflow/envs/lollipop.yaml index 3de87525..a42daa39 100644 --- a/workflow/envs/lollipop.yaml +++ b/workflow/envs/lollipop.yaml @@ -2,4 +2,4 @@ channels: - conda-forge - bioconda dependencies: - - lollipop =0.4.0 + - lollipop =0.4.1