diff --git a/.gitignore b/.gitignore
index bc5835048..79bdf212d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,6 +10,7 @@ __pycache__/
 .Python
 env/
 build/
+vendor/
 develop-eggs/
 dist/
 downloads/
diff --git a/.mega-linter.yml b/.mega-linter.yml
index 8959909ee..edf1fbaf9 100644
--- a/.mega-linter.yml
+++ b/.mega-linter.yml
@@ -9,6 +9,8 @@ ENABLE_LINTERS:
   - SNAKEMAKE_SNAKEFMT
   - MARKDOWN_MARKDOWNLINT
   - JUPYTER_JUPYFMT
+  - PERL_PERLCRITIC
+  - DOCKERFILE_HADOLINT
 VALIDATE_ALL_CODEBASE: true
 FORMATTERS_DISABLE_ERRORS: false
diff --git a/Dockerfile b/Dockerfile
index 7b382cf29..c6cdaac56 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -24,6 +24,7 @@ ARG vpipe_path
 ARG envs_path
 ARG test_data

+# hadolint ignore=DL3008
 RUN apt-get update && apt-get install -y --no-install-recommends \
     jdupes

@@ -42,11 +43,11 @@ COPY tests/data ${test_data}
 WORKDIR /work

 # configuration: activate all steps
-RUN mkdir config
-RUN echo 'output:\n snv: true\n local: true\n global: true\n visualization: true\n diversity: true\n QA: true' > config/config.yaml
+RUN mkdir config \
+ && printf 'output:\n snv: true\n local: true\n global: true\n visualization: true\n diversity: true\n QA: true\n upload: true\nupload:\n orig_cram: true' > config/config.yaml

 # TODO harmonize list with CI tests and Docker tests
-RUN for virus in ${virus_download_list:-$(ls ${test_data}/)}; do echo "\n\n\e[36;1mvirus: ${virus}\e[0m\n" \
+RUN for virus in ${virus_download_list:-$(ls ${test_data}/)}; do printf '\n\n\e[36;1mvirus: %s\e[0m\n' "${virus}" \
  && ln -sf "${test_data}/${virus}/" ./samples \
  && if test -e samples/samples.tsv; then cp -f samples/samples.tsv ./config/samples.tsv; fi \
  && PYTHONUNBUFFERED=1 snakemake -s ${vpipe_path}/workflow/Snakefile -j 1 --conda-create-envs-only --use-conda --conda-prefix ${envs_path} --config "general={virus_base_config: ${virus}}" \
@@ -62,6 +63,7 @@ FROM snakemake/snakemake:${snaketag} AS vpipe-tests-base
 ARG install_path

 # NOTE rsync only used with local scratch
+# hadolint ignore=DL3008
 RUN apt-get update && apt-get install -y --no-install-recommends \
     rsync \
  && apt-get clean \
@@ -83,8 +85,8 @@ ARG test_data
 ENV virus=hiv

 WORKDIR /work
-RUN mkdir config
-RUN echo 'output:\n snv: true\n local: true\n global: false\n visualization: true\n diversity: true\n QA: true' > config/config.yaml
+RUN mkdir config \
+ && printf 'output:\n snv: true\n local: true\n global: false\n visualization: true\n diversity: true\n QA: true\n upload: true\nupload:\n orig_cram: true' > config/config.yaml
 COPY --from=create-envs ${test_data}/${virus} ./samples
 RUN if test -e samples/samples.tsv; then cp -f samples/samples.tsv ./config/samples.tsv; fi
 # NOTE see top comment if `--network=none` breaks build process
@@ -105,8 +107,8 @@ ARG test_data
 ENV virus=sars-cov-2

 WORKDIR /work
-RUN mkdir config
-RUN echo 'output:\n snv: true\n local: true\n global: false\n visualization: true\n diversity: true\n QA: true' > config/config.yaml
+RUN mkdir config \
+ && printf 'output:\n snv: true\n local: true\n global: false\n visualization: true\n diversity: true\n QA: true\n upload: true\nupload:\n orig_cram: true' > config/config.yaml
 COPY --from=create-envs ${test_data}/${virus} ./samples
 RUN if test -e samples/samples.tsv; then cp -f samples/samples.tsv ./config/samples.tsv; fi
 # NOTE see top comment if `--network=none` breaks build process
@@ -145,6 +147,7 @@ ARG install_path
 ARG vpipe_path
 ARG envs_path

+# hadolint ignore=DL3008
 RUN apt-get update && apt-get install -y --no-install-recommends \
     rsync \
  && apt-get clean \
@@ -153,7 +156,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \

 COPY --from=vpipe-final-base ${install_path} ${install_path}

 # =============================================
-MAINTAINER V-pipe Dev Team
+LABEL maintainer="V-pipe Dev Team "

 VOLUME /work
 WORKDIR /work
diff --git a/config/config.html b/config/config.html
index 0f98b4d57..ddb2e4864 100644
--- a/config/config.html
+++ b/config/config.html
@@ -1,10 +1,10 @@ V-pipe configuration

V-pipe configuration

Type: object

The V-pipe workflow can be customized through the configuration file config.yaml or config.json or, for backward compatibility with the legacy INI-style format used in V-pipe v1.x/2.x, vpipe.config. This configuration file is a text file written using a basic structure composed of sections, properties, and values. When using the YAML or JSON format, use these languages' associative arrays/dictionaries in two levels for sections and properties. When using the older INI format, sections are expected in square brackets, and properties are followed by their corresponding values.

Furthermore, it is possible to specify additional options on the command line, using Snakemake's --configfile to pass additional YAML/JSON configuration files and/or Snakemake's --config to pass sections and properties in YAML flow style/JSON syntax.
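
For instance (a sketch mirroring the Docker test setup in this same changeset), passing --config "general={virus_base_config: hiv}" on the command line in YAML flow style is equivalent to writing the following two-level block in config.yaml:

general:
  virus_base_config: hiv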

The order of precedence is:
command line options (--config, --configfile) >> default configuration file (config/config.yaml or config.yaml) >> legacy INI-style configuration (vpipe.config) >> virus-specific base config (virus_base_config) >> default values

For example, we suggest providing as input a tabular file specifying unique sample identifiers (e.g., patient identifiers) and dates for the different sequencing runs related to the same patient. The name of this file (here, samples.tsv) can be provided by setting the section to input and the property to samples_file, as shown in the example below.

In this document, we provide a comprehensive list of all user-configurable options, stratified by section.


Example:

input:
   samples_file: samples.tsv
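
For comparison, a sketch of the same setting in the legacy INI-style vpipe.config, with the section in square brackets as described above:

[input]
samples_file = samples.tsv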
-

Type: object Default: {}

This section of the configuration provides general options that control the overall behavior of the pipeline

Type: string Default: ""

We provide virus-specific base configuration files which contain handy defaults for, e.g., HIV and SARS-CoV-2. Check the git repository’s config subdirectory to learn about them.


Examples:

hiv
+

Type: object Default: {}

This section of the configuration provides general options that control the overall behavior of the pipeline.

Type: string Default: ""

We provide virus-specific base configuration files which contain handy defaults for, e.g., HIV and SARS-CoV-2. Check the git repository’s config subdirectory to learn about them.


Examples:

hiv
 ...
 
sars-cov-2
 ...
-

Type: enum (of string) Default: "ngshmmalign"

There are three options for mapping reads, either using ngshmmalign, BWA MEM (bwa) 1, or Bowtie 2 (bowtie) 2. To use a different aligner than the default, indicate which aligner you want to use by setting the property aligner.

Note: Some virus-specific base configuration specified in virus_base_config might change this option’s default to a more appropriate aligner for that virus, e.g, depending on its usual diversity and mutation rate.
You are still free to override that default in your configuration shall the need arise.


  1. Li, H. Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. 2013. 

  2. Langmead, B. and Salzberg, S. Fast gapped-read alignment with Bowtie 2. Nature Methods. 2012. 

Must be one of:

  • "ngshmmalign"
  • "bwa"
  • "bowtie"

Example:

bowtie
+

Type: enum (of string) Default: "ngshmmalign"

There are three options for mapping reads, either using ngshmmalign, BWA MEM (bwa) 1, or Bowtie 2 (bowtie) 2. To use a different aligner than the default, indicate which aligner you want to use by setting the property aligner.

Note: Some virus-specific base configuration specified in virus_base_config might change this option’s default to a more appropriate aligner for that virus, e.g., depending on its usual diversity and mutation rate.
You are still free to override that default in your configuration should the need arise.


  1. Li, H. Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. 2013. 

  2. Langmead, B. and Salzberg, S. Fast gapped-read alignment with Bowtie 2. Nature Methods. 2012. 

Must be one of:

  • "ngshmmalign"
  • "bwa"
  • "bowtie"

Example:

bowtie
 ...
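
Spelled out as a full two-level configuration, the example above corresponds to:

general:
  aligner: bowtie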
 

Type: enum (of string) Default: "shorah"

There are two options available for calling single nucleotide variants, either using ShoRAH (shorah) 1 or LoFreq (lofreq) 2. ShoRAH is used by default. If you prefer to use LoFreq, then indicate so in the configuration file, as in the example below.


  1. Zagordi, O. et al. ShoRAH: estimating the genetic diversity of a mixed sample from next-generation sequencing data. BMC Bioinformatics. 2011. 

  2. Wilm, A. et al. LoFreq: A sequence-quality aware, ultra-sensitive variant caller for uncovering cell-population heterogeneity from high-throughput sequencing datasets. Nucleic Acids Res. 2012. 

Must be one of:

  • "shorah"
  • "lofreq"

Example:

lofreq
 ...
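
The property name is not visible in this rendering; assuming it is snv_caller under the general section (as in V-pipe's configuration schema), a sketch of switching to LoFreq reads:

general:
  snv_caller: lofreq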
@@ -12,11 +12,13 @@
 ...
 

Type: integer Default: 1

This option should be used to specify the default number of threads for all multi-threaded rules. That is, unless a number of threads is specified for a given rule, this value is used as the default.


Example:

4
 ...
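
Assuming this property is the general section's threads (the name is not shown in this rendering), a sketch:

general:
  threads: 4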
+

Type: enum (of string) Default: "md5"

Sets the algorithm to be used when computing checksums for uploadable data.

Must be one of:

  • "md5"
  • "sha1"
  • "sha256"
  • "sha224"
  • "sha384"
  • "sha512"
  • "xxh64"
  • "xxh32"
  • "xxh128"

Example:

sha256
+...
 

Type: string Default: ""

Some steps of V-pipe produce temporary files (e.g., decompressed intermediates), i.e., files which aren't kept long-term but are deleted once all steps that needed them have finished. By default, these files are written in the output data directory. This option makes it possible to write them to a different directory instead. Use this option to, e.g., leverage faster cluster-local storage or avoid wasting backup space on snapshotted storage. You might want to consult the documentation provided by your HPC.


Examples:

temp
 ...
 
/cluster/scratch
 ...
-

Type: object Default: {}

Properties in this section of the configuration control the input of the pipeline

Type: string Default: "samples/"

The input file for the workflow will be searched in this directory.

V-pipe expects the input samples to be organized in a two-level directory hierarchy.

  • The first level can be, e.g., patient samples or biological replicates of an experiment.
  • The second level can be, e.g., different sampling dates or different sequencing runs of the same sample.
  • Inside that directory, the sub-directory raw_data holds the sequencing data in FASTQ format (optionally compressed with GZip).

For example:

samples
+

Type: object Default: {}

Properties in this section of the configuration control the input of the pipeline.

Type: string Default: "samples/"

The input files for the workflow will be searched for in this directory.

V-pipe expects the input samples to be organized in a two-level directory hierarchy.

  • The first level can be, e.g., patient samples or biological replicates of an experiment.
  • The second level can be, e.g., different sampling dates or different sequencing runs of the same sample.
  • Inside that directory, the sub-directory raw_data holds the sequencing data in FASTQ format (optionally compressed with GZip).

For example:

samples
 ├── patient1
 │   ├── 20100113
 │   │   └──raw_data
@@ -47,7 +49,7 @@
 

Type: string Default: "config/samples.tsv"

File containing sample unique identifiers and dates as tab-separated values, e.g.,

patient1    20100113
 patient1    20110202
 patient2    20081130
-

Here, we have two samples from patient 1 and one sample from patient 2. By default, V-pipe searches for a file named samples.tsv, if this file does not exist, a list of samples is built by globbing datadir directory contents.

Optionally, the samples file can contain a third column specifying the read length. This is particularly useful when samples are sequenced using protocols with different read lengths.

Standardized Snakemake workflows place their tables inside the config/ subdirectory, but using this options you can specify alternate locations, e.g., the current working directory (as done in legacy V-pipe v1.x/2.x)


Example:

samples.tsv
+

Here, we have two samples from patient 1 and one sample from patient 2. By default, V-pipe searches for a file named samples.tsv; if this file does not exist, a list of samples is built by globbing the datadir directory contents.

Optionally, the samples file can contain a third column specifying the read length. This is particularly useful when samples are sequenced using protocols with different read lengths.
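
For instance, a samples file carrying the optional third column might look as follows (a sketch; identifiers and read lengths are illustrative):

patient1    20100113    250
patient1    20110202    250
patient2    20081130    100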

Standardized Snakemake workflows place their tables inside the config/ subdirectory, but using this option you can specify an alternate location, e.g., the current working directory (as done in legacy V-pipe v1.x/2.x).


Example:

samples.tsv
 ...
 

Type: integer Default: 250

Default for those samples whose read length isn’t specified explicitly in the optional third column of the samples.tsv table.


Example:

'100'
 

Type: number Default: 0.8

Using this parameter, the user can specify the read-length threshold that should be applied during the quality trimming as a percentage (0 < trim_percent_cutoff < 1).


Example:

0.9
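
Using the input => trim_percent_cutoff naming given later in this document, the example above reads:

input:
  trim_percent_cutoff: 0.9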
@@ -68,7 +70,7 @@
 ...
 

Type: string Default: ""

A FASTQ file with sequences of interest.

Note: These sequences are used, together with the consensus sequence, to build a phylogenetic tree.


Example:

resources/sars-cov-2/phylogeny/selected_covid_sequences.fasta
 ...
-

Type: object Default: {}

Properties in this section of the configuration control the output of the pipeline

Type: string Default: "results"

The workflow will write its output files into this directory. This will follow the same structure as for the input.

For each sample, V-pipe produces several output files that are located in the corresponding sample-specific directory. First, the alignment file and consensus sequences are located in the alignments and references subdirectories, respectively. Second, output files containing SNVs and viral haplotypes are located in the variants subdirectories.

Using the sample example as in the input section, the output files for the two patient samples will be located in the following subdirectories:

results
+

Type: object Default: {}

Properties in this section of the configuration control the output of the pipeline.

Type: string Default: "results"

The workflow will write its output files into this directory. This will follow the same structure as for the input.

For each sample, V-pipe produces several output files that are located in the corresponding sample-specific directory. First, the alignment file and consensus sequences are located in the alignments and references subdirectories, respectively. Second, output files containing SNVs and viral haplotypes are located in the variants subdirectories.

Using the sample example as in the input section, the output files for the two patient samples will be located in the following subdirectories:

results
 ├──patient1
 │  ├──20100113
 │  │  ├──alignments
@@ -115,51 +117,73 @@
 │  ├──20100113
 │  │  ├──alignments
 
-

If you prefer instead, e.g., such cohort-wide results behind written in a subdirectory of the working directory at the same level as the datadirs, you can use this options you can specify alternate subdirectory relative to the datadir property. (Use .. prefix if you want instead your cohort-wide results to be in a directory at the sample level as samples/ and results/. See the example below to recreate the variants/ directory used by legacy V-pipe v1.x/2.x)


Example:

../variants
+

If you prefer instead, e.g., to have such cohort-wide results written in a subdirectory of the working directory at the same level as the datadirs, you can use this option to specify an alternate subdirectory relative to the datadir property. (Use a .. prefix if you want your cohort-wide results in a directory at the same level as samples/ and results/. See the example below to recreate the variants/ directory used by legacy V-pipe v1.x/2.x).


Example:

../variants
 ...
-

Type: boolean Default: false

V-pipe can produce several outputs to assess the quality of the output of its steps, e.g., checking whether a sample’s consensus sequence generated by bctfools does result in frameshifting indels and writing a report in sample’s …/references/frameshift_deletions_check.tsv. Such reports can be useful when submitting sequences to GISAID.

This option turns on such QA features


Example:

true
+

Type: boolean Default: false

V-pipe can produce several outputs to assess the quality of its steps' output, e.g., checking whether a sample's consensus sequence generated by bcftools results in frameshifting indels, and writing a report in the sample's …/references/frameshift_deletions_check.tsv. Such reports can be useful when submitting sequences to GISAID.

This option turns on such QA features.


Example:

true
 ...
-

Type: boolean Default: false

This option selects whether the SNV caller step should be executed and its output written to each sample’s …/variants/SNVs/snvs.csv


Example:

true
+

Type: boolean Default: false

This option selects whether the SNV caller step should be executed and its output written to each sample’s …/variants/SNVs/snvs.csv.


Example:

true
 ...
-

Type: boolean Default: false

This option activates local haplotype reconstruction (only available when using ShoRAH)


Example:

true
+

Type: boolean Default: false

This option activates local haplotype reconstruction (only available when using ShoRAH).


Example:

true
 ...
-

Type: boolean Default: false

This option turns on global haplotype reconstruction


Example:

true
+

Type: boolean Default: false

This option turns on global haplotype reconstruction.


Example:

true
 ...
-

Type: boolean Default: false

This option selects whether to generate HTML visualization of the SNVs in each sample’s …/visualization/index.html


Example:

true
+

Type: boolean Default: false

This option selects whether to generate HTML visualization of the SNVs in each sample’s …/visualization/index.html.


Example:

true
 ...
 

Type: boolean Default: false

This option turns on the computation of diversity measures in each sample.


Example:

true
 ...
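
Taken together, these boolean switches all live in the output section; a configuration activating a typical set of steps (mirroring the test setups in this changeset) looks like:

output:
  snv: true
  local: true
  global: false
  visualization: true
  diversity: true
  QA: true
  upload: true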
-

Type: object Default: {}

The path to the different software packages can be specified using this section.

It is especially useful when dependencies are not obtained via conda such as VICUNA, and when the software packages are not in the PATH.

Note we strongly recommend to use conda environments, by adding the --use-conda flag to the V-pipe execution command, e.g. ./vpipe --use-conda. If you prefer to use your own installations, this section allows you to specify the location of the executables


Example:

bwa: /path/to/bwa
+

Type: boolean Default: false

This option turns on dehumanization of the raw reads (i.e., removal of the host's reads) and generates the file dehuman.cram. This is useful to prepare raw reads for upload to public databases such as ENA (the European Nucleotide Archive).

This only applies to the upload and does not affect the main workflow.


Example:

true
+...
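
The property name is not visible in this rendering; judging from the cross-reference to output dehumanized_raw_reads later in this document, a sketch of enabling it might read:

output:
  dehumanized_raw_reads: true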
+

Type: boolean Default: false

This option can be used to assist with incremental uploads of data. See section upload for an example.


Example:

true
+...
+

Type: object Default: {}

The path to the different software packages can be specified using this section.

It is especially useful when dependencies are not obtained via conda (such as VICUNA), and when the software packages are not in the PATH.

Note: we strongly recommend using conda environments, by adding the --use-conda flag to the V-pipe execution command, e.g., ./vpipe --use-conda. If you prefer to use your own installations, this section allows you to specify the location of the executables.


Example:

bwa: /path/to/bwa
 haploclique: /path/to/haploclique
-

Type: string Default: "gunzip"

Type: string Default: "prinseq-lite.pl"

Type: string Default: "fastqc"

Type: string Default: "vicuna"

Due to a special license, VICUNA is not available from bioconda and must be installed from its original website.
Use this option to specify where you have installed its executable

Type: string Default: "InDelFixer"

Type: string Default: "ConsensusFixer"

Type: string Default: "picard"

Type: string Default: "bwa"

Type: string Default: "bowtie2-build"

Type: string Default: "bowtie2"

Type: string Default: "samtools"

Type: string Default: "extract_consensus"

Type: string Default: "matcher"

Type: string Default: "frameshift_deletions_checks"

Type: string Default: "mafft"

Type: string Default: "ngshmmalign"

Type: string Default: "convert_reference"

Type: string Default: "extract_seq"

Type: string Default: "coverage_stats"

Type: string Default: "remove_gaps_msa"

Type: string Default: "aln2basecnt"

Type: string Default: "gather_coverage"

Type: string Default: "minority_freq"

Type: string Default: "extract_coverage_intervals"

Type: string Default: "shorah shotgun"

Type: string Default: "lofreq"

Type: string Default: "bcftools"

Type: string Default: "haploclique"

Type: string Default: "compute_mds"

Type: string Default: "savage"

Type: string Default: "predicthaplo"

Type: object Default: {}

Type: integer Default: 32

Type: integer Default: 60

Type: object Default: {}

Type: integer Default: 4096

Type: integer Default: 20

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/preprocessing.yaml"

Type: string Default: "-ns_max_n 4 -min_qual_mean 30 -trim_qual_left 30 -trim_qual_right 30 -trim_qual_window 10"

We use software PRINSEQ 1 for quality control. By default, we use options -ns_max_n 4 -min_qual_mean 30 -trim_qual_left 30 -trim_qual_right 30 -trim_qual_window 10, which indicates to trim reads using a sliding window with size 10 bp, and trim bases if their quality scores are less than 30. Additionally, reads are filtered out if the average quality score is below 30 and if they contain more than 4 N’s. The user can choose to overwrite the default settings or use additional parameters by using the property extra. E.g., if many reads are filtered out in this step, the user can choose to lower the quality threshold as indicated in the example.
Please do not modify PRINSEQ options -out_format, -out_good, nor -min_len. Instead of using -min_len to define threshold on the read length after trimming, use input => trim_percent_cutoff.


  1. Schmieder, R. and Edwards, R. Quality control and preprocessing of metagenomic datasets. Bioinformatics. 2011. 


Example:

-ns_max_n 4 -min_qual_mean 20 -trim_qual_left 20 -trim_qual_right 20 -trim_qual_window
+

Type: string Default: "gunzip"

Type: string Default: "prinseq-lite.pl"

Type: string Default: "fastqc"

Type: string Default: "vicuna"

Due to a special license, VICUNA is not available from bioconda and must be installed from its original website.
Use this option to specify where you have installed its executable.

Type: string Default: "InDelFixer"

Type: string Default: "ConsensusFixer"

Type: string Default: "picard"

Type: string Default: "bwa"

Type: string Default: "bowtie2-build"

Type: string Default: "bowtie2"

Type: string Default: "samtools"

Type: string Default: "extract_consensus"

Type: string Default: "matcher"

Type: string Default: "frameshift_deletions_checks"

Type: string Default: "mafft"

Type: string Default: "ngshmmalign"

Type: string Default: "convert_reference"

Type: string Default: "extract_seq"

Type: string Default: "coverage_stats"

Type: string Default: "remove_gaps_msa"

Type: string Default: "aln2basecnt"

Type: string Default: "gather_coverage"

Type: string Default: "minority_freq"

Type: string Default: "extract_coverage_intervals"

Type: string Default: "shorah shotgun"

Type: string Default: "lofreq"

Type: string Default: "bcftools"

Type: string Default: "haploclique"

Type: string Default: "compute_mds"

Type: string Default: "savage"

Type: string Default: "predicthaplo"

Type: object Default: {}

Type: integer Default: 32

Type: integer Default: 60

Type: object Default: {}

Type: integer Default: 4096

Type: integer Default: 20

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/preprocessing.yaml"

Type: string Default: "-ns_max_n 4 -min_qual_mean 30 -trim_qual_left 30 -trim_qual_right 30 -trim_qual_window 10"

We use the software PRINSEQ 1 for quality control. By default, we use the options -ns_max_n 4 -min_qual_mean 30 -trim_qual_left 30 -trim_qual_right 30 -trim_qual_window 10, which trim reads using a sliding window of size 10 bp, removing bases whose quality scores are less than 30. Additionally, reads are filtered out if their average quality score is below 30 or if they contain more than 4 N's. The user can choose to override the default settings or pass additional parameters via the property extra. E.g., if many reads are filtered out in this step, the user can choose to lower the quality threshold as indicated in the example.
Please do not modify the PRINSEQ options -out_format, -out_good, nor -min_len. Instead of using -min_len to define the threshold on read length after trimming, use input => trim_percent_cutoff.


  1. Schmieder, R. and Edwards, R. Quality control and preprocessing of metagenomic datasets. Bioinformatics. 2011. 


Example:

-ns_max_n 4 -min_qual_mean 20 -trim_qual_left 20 -trim_qual_right 20 -trim_qual_window
   10
 ...
-

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/fastqc.yaml"

Type: integer Default: 6

Type: boolean Default: false

Type: object Default: {}

NOTE The conda environment for this rule doesn’t work properly. The package on the bioconda channel, mvicuna, is slightly different from VICUNA and it has different command-line arguments. Moreover, VICUNA and mvicuna are no longer maintained. In the future, this rule will be deprecated.

Type: integer Default: 1000

Type: integer Default: 600

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/initial_vicuna.yaml"

Type: object Default: {}

NOTE Obtaining a initial reference de novo is implemented for more than one sample.

Type: integer Default: 10000

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/initial_vicuna_msa.yaml"

Type: object Default: {}

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 1435

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/hmm_align.yaml"

Type: boolean Default: false

this option is useful for debugging purposes


Example:

true
+

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/fastqc.yaml"

Type: integer Default: 6

Type: boolean Default: false

Type: object Default: {}

NOTE The conda environment for this rule doesn’t work properly. The package on the bioconda channel, mvicuna, is slightly different from VICUNA and it has different command-line arguments. Moreover, VICUNA and mvicuna are no longer maintained. In the future, this rule will be deprecated.

Type: integer Default: 1000

Type: integer Default: 600

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/initial_vicuna.yaml"

Type: object Default: {}

NOTE Obtaining an initial reference de novo is implemented for more than one sample.

Type: integer Default: 10000

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/initial_vicuna_msa.yaml"

Type: object Default: {}

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 1435

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/hmm_align.yaml"

Type: boolean Default: false

This option is useful for debugging purposes.


Example:

true
 ...
-

Type: string Default: ""

pass additional options to run ngshmmalign

V-pipe uses option -R <path/to/initial_reference>, thus option -r arg is not allowed. Also, instead of passing -l via the property extra, set leave_msa_temp to True. Lastly, please do not modify options -o arg, -w arg, -t arg, and -N arg. These are already managed by V-pipe.

Type: object Default: {}

Type: integer Default: 5000

Type: integer Default: 30

Type: string Default: "{VPIPE_BASEDIR}/envs/sam2bam.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/bwa_QA.yaml"

Type: string Default: ""

Panel of diverse references against which to align reads as a QA step

Note: The virus-specific base configuration specified in general => virus_base_config will most likely change this option’s default.
You are still free to override that default in your configuration shall the need arise.


Example:

resources/hiv/5-Virus-Mix.fasta
+

Type: string Default: ""

Pass additional options to run ngshmmalign.

V-pipe uses option -R <path/to/initial_reference>, thus option -r arg is not allowed. Also, instead of passing -l via the property extra, set leave_msa_temp to True. Lastly, please do not modify options -o arg, -w arg, -t arg, and -N arg. These are already managed by V-pipe.

Type: object Default: {}

Type: integer Default: 5000

Type: integer Default: 30

Type: string Default: "{VPIPE_BASEDIR}/envs/sam2bam.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/bwa_QA.yaml"

Type: string Default: ""

Panel of diverse references against which to align reads as a QA step.

Note: The virus-specific base configuration specified in general => virus_base_config will most likely change this option’s default.
You are still free to override that default in your configuration should the need arise.


Example:

resources/hiv/5-Virus-Mix.fasta
 ...
-

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: string Default: "HXB2:6614-6812,7109-7217,7376-7478,7601-7634"

Type: object Default: {}

This rule takes all previously aligned reads by hmm_align. Therefore, resources should be allocated accordingly.

Type: integer Default: 10000

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/msa.yaml"

Type: object Default: {}

Type: integer Default: 8000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/bwa_align.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/bwa_align.yaml"

Type: string Default: ""

With property extra, users can pass additional options to run BWA MEM. For more details on BWA MEM configurable options refer to the software documentation.

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/bowtie_align.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/bowtie_align.yaml"

Type: enum (of string) Default: "--phred33"

indicate if qualities are Phred+33 (default) or Phred+64 (--phred64)

Must be one of:

  • "--phred33"
  • "--phred64"

Example:

--phred64
+

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: string Default: "HXB2:6614-6812,7109-7217,7376-7478,7601-7634"

Type: object Default: {}

This rule takes all reads previously aligned by hmm_align. Therefore, resources should be allocated accordingly.

Type: integer Default: 10000

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/msa.yaml"

Type: object Default: {}

Type: integer Default: 8000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/bwa_align.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/bwa_align.yaml"

Type: string Default: ""

With the property extra, users can pass additional options to BWA MEM. For more details on BWA MEM's configurable options, refer to the software documentation.

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/bowtie_align.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/bowtie_align.yaml"

Type: enum (of string) Default: "--phred33"

Indicate if qualities are Phred+33 (default) or Phred+64 (--phred64).

Must be one of:

  • "--phred33"
  • "--phred64"

Example:

--phred64
 ...
-

Type: string Default: "--local --sensitive-local"

specify Bowtie 2 presets

Type: integer

Type: string Default: ""

pass additional options to run Bowtie 2. V-pipe handles the input and output files, as well as the reference sequence. Thus, do not modify these options
For more details on Bowtie 2 configurable options refer to the software documentation.

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: integer Default: 50

minimum read depth for reporting variants per locus

Type: integer Default: 5

read count below which ambiguous base ’n’ is reported

Type: integer Default: 15

minimum phred quality score for a base to be included

Type: number Default: 0.05

minimum frequency for an ambiguous nucleotide

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/bcftools.yaml"

Type: integer Default: 10000

Type: integer Default: 10

Type: number Default: 0.05

Type: object Default: {}

Type: integer Default: 4096

Type: integer Default: 30

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/consseq_qa.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 30

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: string Default: ""

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 30

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 30

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: object Default: {}

Type: integer Default: 1000

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: integer Default: 100

minimum read depth for reporting variants per locus


Example:

50
+

Type: string Default: "--local --sensitive-local"

Specify Bowtie 2 presets.

Type: integer

Type: string Default: ""

Pass additional options to run Bowtie 2. V-pipe handles the input and output files, as well as the reference sequence; thus, do not modify these options.
For more details on Bowtie 2's configurable options, refer to the software documentation.

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: integer Default: 50

Minimum read depth for reporting variants per locus.

Type: integer Default: 5

Read count below which ambiguous base ’n’ is reported.

Type: integer Default: 15

Minimum phred quality score for a base to be included.

Type: number Default: 0.05

Minimum frequency for an ambiguous nucleotide.

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/bcftools.yaml"

Type: integer Default: 10000

Type: integer Default: 10

Type: number Default: 0.05

Type: object Default: {}

Type: integer Default: 4096

Type: integer Default: 30

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/consseq_qa.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 30

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: string Default: ""

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 30

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: object Default: {}

Type: integer Default: 1250

Type: integer Default: 30

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: object Default: {}

Type: integer Default: 1000

Type: integer Default: 235

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: integer Default: 100

Minimum read depth for reporting variants per locus.


Example:

50
 ...
-

Type: boolean Default: false

output a numpy array file containing frequencies of all bases, including gaps and also the most abundant base across samples


Example:

true
+

Type: boolean Default: false

Output a numpy array file containing frequencies of all bases, including gaps and also the most abundant base across samples.


Example:

true
 ...
-

Type: object Default: {}

Type: integer Default: 1000

Type: integer Default: 30

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: boolean Default: false

construct intervals based on overlapping windows of the read alignment. By default, regions with high coverage are built based on the position-wise read depth


Example:

true
+

Type: object Default: {}

Type: integer Default: 1000

Type: integer Default: 30

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: boolean Default: false

Construct intervals based on overlapping windows of the read alignment. By default, regions with high coverage are built based on the position-wise read depth.


Example:

true
 ...
-

Type: integer Default: 50

minimum read depth. A region spanning the reference genome is returned if coverage is set to 0


Example:

0
+

Type: integer Default: 50

Minimum read depth. A region spanning the reference genome is returned if coverage is set to 0.


Example:

0
 ...
-

Type: boolean Default: true

indicate whether to apply a more liberal shifting on intervals’ right-endpoint


Example:

false
+

Type: boolean Default: true

Indicate whether to apply a more liberal shifting on intervals’ right-endpoint.


Example:

false
 ...
-

Type: object Default: {}

Type: integer Default: 10000

Type: integer Default: 2880

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/snv.yaml"

Type: boolean Default: true

indicate whether to use the cohort-consensus sequence from the analyzed samples (output from minor_variants rule located in the cohort-wide output results/cohort_onsensus.fasta) or the reference sequence by setting this option to False


Example:

false
+

Type: object Default: {}

Type: integer Default: 10000

Type: integer Default: 2880

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/snv.yaml"

Type: boolean Default: true

Indicate whether to use the cohort-consensus sequence built from the analyzed samples (output of the minor_variants rule, located in the cohort-wide output results/cohort_consensus.fasta), or the reference sequence by setting this option to False.


Example:

false
 ...
-

Type: number Default: 0.1

hyperparameter used for instantiating a new cluster

Type: boolean Default: false

ignore SNVs adjacent to indels

Type: number Default: 0.9

Type: integer Default: 0

omit windows with coverage less than this value


Example:

50
+

Type: number Default: 0.1

Hyperparameter used for instantiating a new cluster.

Type: boolean Default: false

Ignore SNVs adjacent to indels.

Type: number Default: 0.9

Type: integer Default: 0

Omit windows with coverage less than this value.


Example:

50
 ...
-

Type: integer Default: 3

ShoRAH performs local haplotype reconstruction on windows of the read alignment. The overlap between these windows is defined by the window shifts. By default, it is set to 3, i.e., apart from flanking regions each position is covered by 3 windows

Type: boolean Default: false

indicate whether to move files produced in previous/interrupted runs to subdirectory named old


Example:

true
+

Type: integer Default: 3

ShoRAH performs local haplotype reconstruction on windows of the read alignment. The overlap between these windows is defined by the window shifts. By default, it is set to 3, i.e., apart from flanking regions each position is covered by 3 windows.

Type: boolean Default: false

Indicate whether to move files produced in previous/interrupted runs to a subdirectory named old.


Example:

true
 ...
-

Type: string Default: ""

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 20

Type: string Default: "{VPIPE_BASEDIR}/envs/lofreq.yaml"

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 60

Type: string Default: "{VPIPE_BASEDIR}/envs/lofreq.yaml"

Type: boolean Default: true

indicate whether to use the cohort-consensus sequence from the analyzed samples (output from minor_variants rule located in the cohort-wide output results/cohort_onsensus.fasta) or the reference sequence by setting this option to False


Example:

false
+

Type: string Default: ""

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 20

Type: string Default: "{VPIPE_BASEDIR}/envs/lofreq.yaml"

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 60

Type: string Default: "{VPIPE_BASEDIR}/envs/lofreq.yaml"

Type: boolean Default: true

Indicate whether to use the cohort-consensus sequence built from the analyzed samples (output of the minor_variants rule, located in the cohort-wide output results/cohort_consensus.fasta), or the reference sequence by setting this option to False.


Example:

false
 ...
-

Type: string Default: ""

pass additional options to run lofreq call

Type: object Default: {}

Type: integer Default: 1000

Type: integer Default: 60

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: integer Default: 5

Type: object Default: {}

Type: string Default: "{VPIPE_BASEDIR}/envs/sam2bam.yaml"

Type: object Default: {}

NOTE This rule only works in Linux.

Type: integer Default: 10000

Type: integer Default: 1435

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/savage.yaml"

Type: integer Default: 20

size of the batches of reads to be processed by SAVAGE. It is recommended that 500 < coverage/split < 1000

Type: object Default: {}

Type: integer Default: 10000

Type: integer Default: 1435

Type: string Default: "{VPIPE_BASEDIR}/envs/haploclique.yaml"

Type: boolean Default: true

if set to True (default) a predefined set of parameter values is used for drawing edges between reads in the read graph

Type: boolean Default: true

singletons are defined as proposed haplotypes which are supported by a single read. If this property is set to True, singletons are discarded

Type: boolean Default: true

if set to True (default) probability of the overhangs is ignored

Type: integer Default: 3

sets a threshold to limit the size of cliques

Type: integer Default: 10000

indicates the maximum number of clique to be considered in the next iteration

Type: string Default: ""

additional parameters to be passed to haploclique.

Warning: this won’t overwrite the other options (e.g. clique_size_limi and max_num_cliques should still be set via their own respective properties, do not pass parameters --limit_clique_size= nor --max_cliques= via this extra property).

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: integer Default: 0

use to specify a region of interest

Type: integer Default: -1

use to specify a region of interest


Examples:

9719
+

Type: string Default: ""

Pass additional options to run lofreq call.

Type: object Default: {}

Type: integer Default: 1000

Type: integer Default: 60

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: integer Default: 5

Type: object Default: {}

Type: string Default: "{VPIPE_BASEDIR}/envs/sam2bam.yaml"

Type: object Default: {}

NOTE This rule only works in Linux.

Type: integer Default: 10000

Type: integer Default: 1435

Type: integer

Type: string Default: "{VPIPE_BASEDIR}/envs/savage.yaml"

Type: integer Default: 20

Size of the batches of reads to be processed by SAVAGE. It is recommended that 500 < coverage/split < 1000; for example, at roughly 15,000× coverage, the default split of 20 yields a per-batch coverage of 750.

Type: object Default: {}

Type: integer Default: 10000

Type: integer Default: 1435

Type: string Default: "{VPIPE_BASEDIR}/envs/haploclique.yaml"

Type: boolean Default: true

If set to True (default), a predefined set of parameter values is used for drawing edges between reads in the read graph.

Type: boolean Default: true

Singletons are defined as proposed haplotypes which are supported by a single read. If this property is set to True, singletons are discarded.

Type: boolean Default: true

If set to True (default), the probability of the overhangs is ignored.

Type: integer Default: 3

Sets a threshold to limit the size of cliques.

Type: integer Default: 10000

Indicates the maximum number of cliques to be considered in the next iteration.

Type: string Default: ""

Additional parameters to be passed to haploclique.

Warning: this won’t overwrite the other options (e.g. clique_size_limi and max_num_cliques should still be set via their own respective properties, do not pass parameters --limit_clique_size= nor --max_cliques= via this extra property).

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/smallgenomeutilities.yaml"

Type: integer Default: 0

Use to specify a region of interest.

Type: integer Default: -1

Use to specify a region of interest.


Examples:

9719
 ...
 
29836
 ...
-

Type: string Default: ""

when the ground truth is available (e.g., simulation studies), a multiple sequence alignment of types making up the population can be provided, and additional checks are performed

Type: object Default: {}

Type: integer Default: 10000

Type: integer Default: 1435

Type: integer

Type: integer Default: 0

Type: string Default: "{VPIPE_BASEDIR}/envs/predicthaplo.yaml"

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/visualization.yaml"

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/diversity_measures.yaml"
\ No newline at end of file
+

Type: string Default: ""

When the ground truth is available (e.g., simulation studies), a multiple sequence alignment of types making up the population can be provided, and additional checks are performed.

Type: object Default: {}

Type: integer Default: 10000

Type: integer Default: 1435

Type: integer

Type: integer Default: 0

Type: string Default: "{VPIPE_BASEDIR}/envs/predicthaplo.yaml"

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/visualization.yaml"

Type: object Default: {}

Type: integer Default: 2000

Type: integer Default: 235

Type: string Default: "{VPIPE_BASEDIR}/envs/diversity_measures.yaml"

Type: object Default: {}

Type: string Default: "{VPIPE_BASEDIR}/envs/dehuman.yaml"

Type: integer Default: 4096

Type: integer Default: 235

Type: integer Default: 4

Type: string Default: "references/human.fa.gz"

Host genome used to remove reads (e.g., the human genome).

Note: if this file is absent, it is possible to fetch it from a remote server, see property ref_host_url below.


Example:

/cluster/project/igenomes/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/genome.fa
+...
+

Type: string Default: "http://ftp.ensembl.org/pub/release-105/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"

If the host’s genome specified in property ref_host isn’t present, fetch it from a remote server.

Note: remember to set aside enough memory for the indexing rule; see section ref_bwa_index, property mem.


Example:

https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.29_GRCh38.p14/GCA_000001405.29_GRCh38.p14_genomic.fna.gz
+...
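
Combining the two properties, a dehuman section pointing at a locally available host genome (paths illustrative) could read:

dehuman:
  ref_host: references/human.fa.gz
  ref_host_url: http://ftp.ensembl.org/pub/release-105/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz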
+

Type: boolean Default: false

Indicate whether to store the host-aligned reads in a CRAM file …/alignments/host_aln.cram.


Example:

true
+...
+

Type: boolean Default: false

Use this option when generating dehumanized raw reads (dehuman.cram) for old samples that have already been processed in the past, i.e., as a catch-up.

Normally, removing host-mapped reads requires analyzing the reads which were rejected by V-pipe's main processing (as specified in section general, property aligner). But this output is considered temporary and is deleted by Snakemake once the processing of a sample has finished. To generate dehuman.cram, V-pipe would therefore need to run the aligner again, which would both regenerate the data necessary for this output and produce a new alignment, which in turn would trigger the whole workflow again.
Use this property catchup to only generate the input necessary for dehuman.cram, leaving untouched the alignment and everything else that has already been processed.


Example:

true
+...
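
Assuming catchup sits in the dehuman section shown here, a catch-up run would be configured as:

dehuman:
  catchup: true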
+

Type: object Default: {}

Type: integer Default: 256

Type: integer Default: 60

Type: string Default: "{VPIPE_BASEDIR}/envs/upload.yaml"

Type: object Default: {}

This section is used to assist with and prepare uploads of the data, e.g., to the European Nucleotide Archive. By default, it calls a script that creates symlinks, making it easy to identify new/updated samples between calls of V-pipe. But by using the properties script and options, and by adapting the environment provided in property conda, it is possible to heavily customize the actions (e.g., it is possible to upload to an SFTP server by calling sftp from a modified script). For inspiration, see the default script prepare_upload_symlinks.sh.
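
A sketch of an upload section combining the properties described below; the custom script path and its options are hypothetical placeholders:

upload:
  orig_cram: true
  script: workflow/scripts/my_sftp_upload.sh  # hypothetical custom script
  options: --config sftp_credentials.yaml  # hypothetical; e.g., SFTP server information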

Type: integer Default: 256

Type: integer Default: 60

Type: string Default: "{VPIPE_BASEDIR}/envs/upload.yaml"

The default environment only provides hashing functions (xxhash, the Linux coreutils sha{nnn}sum collection, etc.), but depending on your needs you may want to provide a custom environment with additional tools (e.g., sftp, rsync, curl, lftp, or custom specialized cloud uploaders).

Type: integer Default: 1

Type: boolean Default: true

Don't dispatch the rule to the cluster for execution; run it locally instead.


Example:

false
+...
+

Type: enum (of string) Default: "ambig"

When preparing data for upload, specifies which consensus sequence should be uploaded.

Must be one of:

  • "ambig"
  • "majority"

Example:

majority
+...
+

Type: boolean Default: false

Generate a checksum for each individual consensus sequence (if a consensus is regenerated, this helps determine whether the new file has changed content or is virtually the same as the previous one).


Example:

true
+...
+

Type: boolean Default: false

Also include the original .fastq.gz sequencing read files from raw_data/ in the list of files to be uploaded. See property orig_cram below for a compressed version, and see output dehumanized_raw_reads and section dehuman for depleting reads from the host.


Example:

true
+...
+

Type: boolean Default: false

Also include a compressed version of the original raw sequencing read files from raw_data/. Similar to property orig_fastq above, but with reference-based compression.


Example:

true
+...
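
In the configuration, this mirrors the setting exercised by the regression tests in this changeset:

upload:
  orig_cram: true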
+

Type: string Default: "{VPIPE_BASEDIR}/scripts/prepare_upload_symlinks.sh"

Custom script that assists with and prepares uploads.

It will receive the following positional parameters:

  • : the output file that must be created by the script.
  • : a string (with no path separator slashes) that can be used as a name, uniquely identifying the sample and the date.
  • : the base directory of the sample.
  • …: a list of files to consider for upload

For an example, see the default script prepare_upload_symlinks.sh; it generates symlinks that help track which samples are new and/or updated between runs of V-pipe and thus should be considered for upload.

Type: string Default: ""

Named options to be passed to the script, before the positional parameters, e.g., for an extra configuration file with SFTP server information.

\ No newline at end of file
diff --git a/tests/regression_tests.sh b/tests/regression_tests.sh
index 160427b02..67fa88da3 100755
--- a/tests/regression_tests.sh
+++ b/tests/regression_tests.sh
@@ -56,6 +56,10 @@ output:
     visualization: true
     diversity: true
     QA: true
+    upload: true
+
+upload:
+    orig_cram: true

 snv:
     threads: ${THREADS}
diff --git a/workflow/Snakefile b/workflow/Snakefile
index f44861368..4ac54a790 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -31,12 +31,14 @@ rule allfastqc:
         fastqc_files,


-include: "rules/clean.smk"
 include: "rules/quality_assurance.smk"
 include: "rules/align.smk"
 include: "rules/consensus.smk"
 include: "rules/mafs.smk"
 include: "rules/stats.smk"
+include: "rules/publish.smk"
+include: "rules/dehuman.smk"
+include: "rules/clean.smk"


 if config.output["snv"] or config.output["local"]:
diff --git a/workflow/envs/dehuman.yaml b/workflow/envs/dehuman.yaml
new file mode 100644
index 000000000..5d9ef3120
--- /dev/null
+++ b/workflow/envs/dehuman.yaml
@@ -0,0 +1,9 @@
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bwa=0.7.17
+  - samtools>1.11
+  - xxhash
+  - coreutils # [not linux]
diff --git a/workflow/envs/upload.yaml b/workflow/envs/upload.yaml
new file mode 100644
index 000000000..d3b4db650
--- /dev/null
+++ b/workflow/envs/upload.yaml
@@ -0,0 +1,7 @@
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - xxhash
+  - coreutils # [not linux]
diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk
index 3242ff88c..65a829c2f 100644
--- a/workflow/rules/align.smk
+++ b/workflow/rules/align.smk
@@ -267,12 +267,7 @@ if config.general["aligner"] == "ngshmmalign":
         input:
             "{dataset}/preprocessed_data/{file}.fastq.gz",
         output:
-            temp(
-                os.path.join(
-                    config.general["temp_prefix"],
-                    "{dataset}/preprocessed_data/{file}.fastq",
-                )
-            ),
+            temp_with_prefix("{dataset}/preprocessed_data/{file}.fastq"),
         params:
             GUNZIP=config.applications["gunzip"],
         log:
@@ -295,16 +290,8 @@ if config.general["aligner"] == "ngshmmalign":
             initial_ref="{dataset}/references/initial_consensus.fasta",
             FASTQ=input_align,
         output:
-            good_aln=temp(
-                os.path.join(
-                    config.general["temp_prefix"], "{dataset}/alignments/full_aln.sam"
-                )
-            ),
-            reject_aln=temp(
-                os.path.join(
-                    config.general["temp_prefix"], "{dataset}/alignments/rejects.sam"
-                )
-            ),
+            good_aln=temp_with_prefix("{dataset}/alignments/full_aln.sam"),
+            reject_aln=temp_with_prefix("{dataset}/alignments/rejects.sam"),
             REF_ambig="{dataset}/references/ref_ambig.fasta",
             REF_majority="{dataset}/references/ref_majority.fasta",
         params:
@@ -344,9 +331,10 @@ if config.general["aligner"] == "ngshmmalign":
            mv {wildcards.dataset}/{{alignments,references}}/ref_ambig.fasta
            mv {wildcards.dataset}/{{alignments,references}}/ref_majority.fasta
            """

+    # 3. construct MSA from all patient files
+

-    # 3. construct MSA from all patient files
     def construct_msa_input_files(wildcards):
         output_list = ["{}{}.fasta".format(s, wildcards.kind) for s in references]
         output_list.append(reference_file)
@@ -419,14 +407,13 @@ if config.general["aligner"] == "ngshmmalign":
         """
         {params.CONVERT_REFERENCE} -t {params.REF_NAME} -m {input.REF_ambig} -i {input.BAM} -o {output} > {log.outfile} 2> >(tee {log.errfile} >&2)
         """
-
-
 # 2-4. Alternative: align reads using bwa or bowtie
+

 rule sam2bam:
     input:
-        os.path.join(config.general["temp_prefix"], "{file}.sam"),
+        temp_prefix("{file}.sam"),
     output:
         # TODO support cram here
         BAM="{file}.bam",
@@ -434,6 +421,7 @@ rule sam2bam:
     params:
         SAMTOOLS=config.applications["samtools"],
         FUNCTIONS=functions,
+        sort_tmp=temp_prefix("{file}.tmp"),
     log:
         outfile="{file}_sam2bam.out.log",
         errfile="{file}_sam2bam.err.log",
@@ -451,67 +439,63 @@ rule sam2bam:
     shell:
         """
         echo "Writing BAM file"
-        {params.SAMTOOLS} sort -o "{output.BAM}" "{input}"
+        {params.SAMTOOLS} sort -T "{params.sort_tmp}" -o "{output.BAM}" "{input}"
         {params.SAMTOOLS} index "{output.BAM}"
         """


-if config.general["aligner"] == "bwa":
+bwa_idx_ext = [".bwt", ".amb", ".ann", ".pac", ".sa"]
+bowtie_idx_ext = [".1.bt2", ".2.bt2", ".3.bt2", ".4.bt2", ".rev.1.bt2", ".rev.2.bt2"]
+
+
+rule ref_bwa_index:
+    input:
+        "{file}",
+    output:
+        multiext("{file}", *bwa_idx_ext),
+    params:
+        BWA=config.applications["bwa"],
+    log:
+        outfile="{file}_bwa_index.out.log",
+        errfile="{file}_bwa_index.err.log",
+    conda:
+        config.ref_bwa_index["conda"]
+    benchmark:
+        "{file}_bwa_index.benchmark"
+    group:
+        "align"
+    resources:
+        disk_mb=1250,
+        mem_mb=config.ref_bwa_index["mem"],
+        time_min=config.ref_bwa_index["time"],
+    threads: 1
+    shell:
+        """
+        {params.BWA} index {input} 2> >(tee {log.errfile} >&2)
+        """
+

-    def input_align_gz(wildcards):
-        list_output = []
+def input_align_gz(wildcards):
+    list_output = []
+    list_output.append(os.path.join(wildcards.dataset, "preprocessed_data/R1.fastq.gz"))
+    if config.input["paired"]:
         list_output.append(
-            os.path.join(wildcards.dataset, "preprocessed_data/R1.fastq.gz")
+            os.path.join(wildcards.dataset, "preprocessed_data/R2.fastq.gz")
         )
-        if config.input["paired"]:
-            list_output.append(
-                os.path.join(wildcards.dataset, "preprocessed_data/R2.fastq.gz")
-            )
-        return list_output
+    return list_output


-    rule ref_bwa_index:
-        input:
-            reference_file,
-        output:
-            "{}.bwt".format(reference_file),
-        params:
-            BWA=config.applications["bwa"],
-        log:
-            outfile="references/bwa_index.out.log",
-            errfile="references/bwa_index.err.log",
-        conda:
-            config.ref_bwa_index["conda"]
-        benchmark:
-            "references/ref_bwa_index.benchmark"
-        group:
-            "align"
-        resources:
-            disk_mb=1250,
-            mem_mb=config.ref_bwa_index["mem"],
-            time_min=config.ref_bwa_index["time"],
-        threads: 1
-        shell:
-            """
-            {params.BWA} index {input} 2> >(tee {log.errfile} >&2)
-            """
+if config.general["aligner"] == "bwa":

+    # HACK way too many indexing files (see ref_bwa_index), not using shadow: minimal
     rule bwa_align:
-        # all indexing files: .amb .ann .bwt .fai .pac .sa
         input:
             FASTQ=input_align_gz,
             REF=reference_file,
-            INDEX="{}.bwt".format(reference_file),
+            INDEX=multiext(reference_file, *bwa_idx_ext),
         output:
-            REF=temp(
-                os.path.join(
-                    config.general["temp_prefix"], "{dataset}/alignments/REF_aln.sam"
-                )
-            ),
-            TMP_SAM=temp(
-                os.path.join(
-                    config.general["temp_prefix"], "{dataset}/alignments/tmp_aln.sam"
-                )
-            ),
+            REF=temp_with_prefix("{dataset}/alignments/REF_aln.sam"),
+            TMP_SAM=temp_with_prefix("{dataset}/alignments/tmp_aln.sam"),
         params:
             EXTRA=config.bwa_align["extra"],
             FILTER="-f 2" if config.input["paired"] else "-F 4",
@@ -522,7 +506,6 @@ if config.general["aligner"] == "bwa":
             errfile="{dataset}/alignments/bwa_align.err.log",
         conda:
             config.bwa_align["conda"]
-        # shadow: "minimal" # HACK way too many indexing files, using explicit OUT instead
         benchmark:
             "{dataset}/alignments/bwa_align.benchmark"
         group:
@@ -546,12 +529,7 @@ elif config.general["aligner"] == "bowtie":
         input:
             reference_file,
         output:
-            INDEX1="{}.1.bt2".format(reference_file),
-            INDEX2="{}.2.bt2".format(reference_file),
-            INDEX3="{}.3.bt2".format(reference_file),
-            INDEX4="{}.4.bt2".format(reference_file),
-            INDEX5="{}.rev.1.bt2".format(reference_file),
-            INDEX6="{}.rev.2.bt2".format(reference_file),
+            multiext(reference_file, *bowtie_idx_ext),
         params:
             BOWTIE=config.applications["bowtie_idx"],
         log:
@@ -581,25 +559,10 @@ elif config.general["aligner"] == "bowtie":
             R1="{dataset}/preprocessed_data/R1.fastq.gz",
             R2="{dataset}/preprocessed_data/R2.fastq.gz",
             REF=reference_file,
-            INDEX1="{}.1.bt2".format(reference_file),
-            INDEX2="{}.2.bt2".format(reference_file),
-            INDEX3="{}.3.bt2".format(reference_file),
-            INDEX4="{}.4.bt2".format(reference_file),
-            INDEX5="{}.rev.1.bt2".format(reference_file),
-            INDEX6="{}.rev.2.bt2".format(reference_file),
+            INDEX=multiext(reference_file, *bowtie_idx_ext),
         output:
-            REF=temp(
-                os.path.join(
-                    config.general["temp_prefix"],
-                    "{dataset}/alignments/REF_aln.sam",
-                )
-            ),
-            TMP_SAM=temp(
-                os.path.join(
-                    config.general["temp_prefix"],
-                    "{dataset}/alignments/tmp_aln.sam",
-                )
-            ),
+            REF=temp_with_prefix("{dataset}/alignments/REF_aln.sam"),
+            TMP_SAM=temp_with_prefix("{dataset}/alignments/tmp_aln.sam"),
         params:
             PHRED=config.bowtie_align["phred"],
             PRESET=config.bowtie_align["preset"],
@@ -626,7 +589,6 @@ elif config.general["aligner"] == "bowtie":
             {params.BOWTIE} -x {input.REF} -1 {input.R1} -2 {input.R2} {params.PHRED} {params.PRESET} -X {params.MAXINS} {params.EXTRA} -p {threads} -S {output.TMP_SAM} 2> >(tee {log.errfile} >&2)
             # Filter alignments: (1) keep only reads mapped in proper pairs, and (2) remove supplementary aligments
             {params.SAMTOOLS} view -h -f 2 -F 2048 -o "{output.REF}" "{output.TMP_SAM}" 2> >(tee -a {log.errfile} >&2)
-            rm {params.TMP_SAM}
             """
@@ -636,25 +598,10 @@ elif config.general["aligner"] == "bowtie":
         input:
             R1="{dataset}/preprocessed_data/R1.fastq.gz",
             REF=reference_file,
-            INDEX1="{}.1.bt2".format(reference_file),
-            INDEX2="{}.2.bt2".format(reference_file),
-            INDEX3="{}.3.bt2".format(reference_file),
-            INDEX4="{}.4.bt2".format(reference_file),
-            INDEX5="{}.rev.1.bt2".format(reference_file),
-            INDEX6="{}.rev.2.bt2".format(reference_file),
+            INDEX=multiext(reference_file, *bowtie_idx_ext),
         output:
-            REF=temp(
-                os.path.join(
-                    config.general["temp_prefix"],
-                    "{dataset}/alignments/REF_aln.sam",
-                )
-            ),
-            TMP_SAM=temp(
-                os.path.join(
-                    config.general["temp_prefix"],
-                    "{dataset}/alignments/tmp_aln.sam",
-                )
-            ),
+            REF=temp_with_prefix("{dataset}/alignments/REF_aln.sam"),
+            TMP_SAM=temp_with_prefix("{dataset}/alignments/tmp_aln.sam"),
         params:
             PHRED=config.bowtie_align["phred"],
             PRESET=config.bowtie_align["preset"],
@@ -680,11 +627,11 @@ elif config.general["aligner"] == "bowtie":
             {params.BOWTIE} -x {input.REF} -U {input.R1} {params.PHRED} {params.PRESET} {params.EXTRA} -p {threads} -S {output.TMP_SAM} 2> >(tee {log.errfile} >&2)
             # Filter alignments: (1) remove unmapped reads, and (2) remove supplementary aligments
             {params.SAMTOOLS} view -h -F 4 -F 2048 -o "{output.REF}" "{output.TMP_SAM} 2> >(tee -a {log.errfile} >&2)
-            rm {params.TMP_SAM}
             """

+# NOTE ngshmmalignb also generate consensus so check there too.
+

-# NOTE ngshmmalignb also generate consensus so check there too.
 # if config.general["aligner"] == "ngshmmalign":
 #
 #     ruleorder: convert_to_ref > sam2bam
diff --git a/workflow/rules/clean.smk b/workflow/rules/clean.smk
index 7d0287df1..0931903f7 100644
--- a/workflow/rules/clean.smk
+++ b/workflow/rules/clean.smk
@@ -60,7 +60,7 @@ rule alignclean:

 rule bwaclean:
     input:
-        "{}.bwt".format(reference_file),
+        INDEX=multiext(reference_file, *bwa_idx_ext),
     params:
         DIR=config.output["datadir"],
     shell:
@@ -74,12 +74,7 @@

 rule bowtieclean:
     input:
-        INDEX1="{}.1.bt2".format(reference_file),
-        INDEX2="{}.2.bt2".format(reference_file),
-        INDEX3="{}.3.bt2".format(reference_file),
-        INDEX4="{}.4.bt2".format(reference_file),
-        INDEX5="{}.rev.1.bt2".format(reference_file),
-        INDEX6="{}.rev.2.bt2".format(reference_file),
+        INDEX=multiext(reference_file, *bowtie_idx_ext),
     params:
         DIR=config.output["datadir"],
     shell:
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 29956ee2a..5eac4171b 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -6,9 +6,12 @@ __email__ = "v-pipe@bsse.ethz.ch"

 import csv
 import os
+import pathlib
 import re
 import typing

+from datetime import datetime, timezone
+
 # for legacy
 import configparser
 import json

 # NOTE jsonschema is only part of the full snakemake, not snakemake-minimal
@@ -245,15 +248,20 @@ def process_config(config):
                     section[entry] = value.format(VPIPE_BASEDIR=VPIPE_BASEDIR)

     """
-    The following hack supports `.` access to dictionary entries, e.g.
-    config.input instead of config["input"] so we don't have to change all
-    existing rules.
-    This works only on the upper-level of config and not for the nested
-    dictionaries.
+    The following hack supports `.` access to recursive dictionary entries,
+    e.g. config.input instead of config["input"] so we don't have to change
+    all existing rules.
""" - config = UserDict(config) - config.__dict__.update(config) - return config + + def wrap(dd): + if isinstance(dd, dict): + dd = UserDict(dd) + for k, v in dd.items(): + dd[k] = wrap(v) + dd.__dict__.update(dd) + return dd + + return wrap(config) config = process_config(config) @@ -357,6 +365,9 @@ visualizations = [] diversity_measures = [] datasets = [] IDs = [] +dehumanized_raw_reads = [] +upload_markers = [] + for p in patient_list: # WARNING the following makes sure to gracefully handle trailing slashes in the user-provided paths in datadir sdir = os.path.join(config.output["datadir"], p.patient_id, p.date) @@ -370,7 +381,9 @@ for p in patient_list: references.append(os.path.join(sdir, "references/ref_")) consensus.append(os.path.join(sdir, "references/ref_ambig.fasta")) + consensus.append(os.path.join(sdir, "references/ref_ambig_dels.fasta")) consensus.append(os.path.join(sdir, "references/ref_majority.fasta")) + consensus.append(os.path.join(sdir, "references/ref_majority_dels.fasta")) consensus.append(os.path.join(sdir, "references/consensus.bcftools.fasta")) @@ -427,8 +440,22 @@ for p in patient_list: visualizations.append(os.path.join(sdir, "visualization/snv_calling.html")) visualizations.append(os.path.join(sdir, "visualization/alignment.html")) + # upload related stuff + if config.output["dehumanized_raw_reads"]: + dehumanized_raw_reads.append(os.path.join(sdir, "raw_uploads", "dehuman.cram")) + + if config.output["upload"]: + upload_markers.append(os.path.join(sdir, "upload_prepared.touch")) + # merge lists containing expected output - all_files = alignments + consensus + results + visualizations + all_files = ( + alignments + + consensus + + results + + visualizations + + dehumanized_raw_reads + + upload_markers + ) # diversity measures if not config.output["snv"] and config.output["diversity"]: @@ -474,6 +501,13 @@ if not VPIPE_BENCH: else: reference_name = get_reference_name(reference_file) + if config.output["dehumanized_raw_reads"] and not os.path.isfile( + config.dehuman["ref_host"] + ): + LOGGER.warning( + f"WARNING: Host organism reference file {config.dehuman['ref_host']} not found, it will be downloaded from {config.dehuman['ref_host_url']}." 
+ ) + # Auxiliary functions @@ -505,7 +539,7 @@ def shifts(wildcards): def get_maxins(wildcards): - if config.bowtie_align["maxins"]: + if "maxins" in config["bowtie_align"]: return config.bowtie_align["maxins"] else: patient_ID, date = os.path.normpath(wildcards.dataset).split(os.path.sep)[-2:] @@ -518,7 +552,33 @@ def rebase_datadir(base, dataset): return os.path.join(base, *os.path.normpath(dataset).split(os.path.sep)[-2:]) +def raw_data_file(wildcards, pair): + indir = os.path.join( + rebase_datadir(config.input["datadir"], wildcards.dataset), "raw_data" + ) + list_output = [] + rx_fq = re.compile( + r"[^/]*{}\.(fastq\.gz|fastq|fq|fq\.gz)$".format( + ("R%u%s" % (pair, config.input["fastq_suffix"])) if pair else "" + ) + ) + for p in os.listdir(indir): + if rx_fq.search(p): + list_output.append(os.path.join(indir, p)) + + if len(list_output) == 0: + raise ValueError( + "Missing input files for sample in: {} - Unexpected file name?".format( + indir + ) + ) + + return list_output + + +# TODO replace with raw_data_file def construct_input_fastq(wildcards): + indir = os.path.join( rebase_datadir(config.input["datadir"], wildcards.dataset), "raw_data" ) @@ -564,3 +624,11 @@ def construct_input_fastq(wildcards): ) return list_output + + +def temp_prefix(p): + return os.path.join(config.general["temp_prefix"], p) + + +def temp_with_prefix(p): + return temp(temp_prefix(p)) diff --git a/workflow/rules/consensus.smk b/workflow/rules/consensus.smk index bdacd0c6e..bc23d70e4 100644 --- a/workflow/rules/consensus.smk +++ b/workflow/rules/consensus.smk @@ -101,6 +101,7 @@ rule consensus_sequences: REF=reference_file, output: REF_amb="{dataset}/references/ref_ambig.fasta", + REF_amb_dels="{dataset}/references/ref_ambig_dels.fasta", REF_majority="{dataset}/references/ref_majority.fasta", REF_majority_dels="{dataset}/references/ref_majority_dels.fasta", params: diff --git a/workflow/rules/dehuman.smk b/workflow/rules/dehuman.smk new file mode 100644 index 000000000..2c2f0dbc6 --- /dev/null +++ b/workflow/rules/dehuman.smk @@ -0,0 +1,314 @@ +from functools import partial + + +localrules: + download_host_ref, + + +rule dh_reuse_alignreject: + # "rely on aligner's output". + # this rule re-uses the rejected reads in align.smk (e.g.
ngshmmalign's /alignments/rejects.sam) + # (useful when in parallel with the main processing) + input: + reject_aln=rules.hmm_align.output.reject_aln + if config["general"]["aligner"] == "ngshmmalign" + else temp_prefix("{dataset}/alignments/tmp_aln.sam"), + output: + reject_1=temp_with_prefix("{dataset}/alignments/reject_R1.fastq.gz"), + reject_2=temp_with_prefix("{dataset}/alignments/reject_R2.fastq.gz"), + params: + SAMTOOLS=config.applications["samtools"], + conda: + config.dehuman["conda"] + group: + "align" + resources: + disk_mb=1250, + mem_mb=config.bwa_align["mem"], + time_min=config.bwa_align["time"], + threads: config.bwa_align["threads"] + shell: + """ + echo "Keep reject -----------------------------------------------------" + echo + + {params.SAMTOOLS} bam2fq -@ {threads} \ + -F 2 \ + -1 {output.reject_1} \ + -2 {output.reject_2} \ + {input.reject_aln} + echo + """ + + +rule dh_redo_alignreject: + # "redo rejects" + # this rule re-does an alignment to get the rejects + # (useful if aligner's (temp) rejects have been deleted by now) + input: + global_ref=reference_file, + ref_index=multiext(reference_file, *bwa_idx_ext), + fastq=input_align_gz, + output: + tmp_aln=temp_with_prefix("{dataset}/alignments/dh_aln.sam"), + reject_1=temp_with_prefix("{dataset}/alignments/reject_R1.fastq.gz"), + reject_2=temp_with_prefix("{dataset}/alignments/reject_R2.fastq.gz"), + params: + BWA=config.applications["bwa"], + SAMTOOLS=config.applications["samtools"], + conda: + config.dehuman["conda"] + group: + "dehuman" + resources: + disk_mb=1250, + mem_mb=config.bwa_align["mem"], + time_min=config.bwa_align["time"], + threads: config.bwa_align["threads"] + shell: + """ + echo "Filter out virus' reads -----------------------------------------" + echo + + {params.BWA} mem -t {threads} \ + -o {output.tmp_aln} \ + {input.global_ref} {input.fastq} + + echo + echo "Keep reject -----------------------------------------------------" + echo + + {params.SAMTOOLS} bam2fq -@ {threads} \ + -F 2 \ + -1 {output.reject_1} \ + -2 {output.reject_2} \ + {output.tmp_aln} + echo + """ + + +rule download_host_ref: + output: + host_ref=config.dehuman["ref_host"], + params: + host_ref_url=config.dehuman["ref_host_url"], + shell: + """ + curl --output "{output.host_ref}" "{params.host_ref_url}" + """ + + +rule dh_hostalign: + input: + host_ref=config.dehuman["ref_host"], + ref_index=multiext(config.dehuman["ref_host"], *bwa_idx_ext), + reject_1=rules.dh_redo_alignreject.output.reject_1 + if config["dehuman"]["catchup"] + else rules.dh_reuse_alignreject.output.reject_1, + reject_2=rules.dh_redo_alignreject.output.reject_2 + if config["dehuman"]["catchup"] + else rules.dh_reuse_alignreject.output.reject_2, + output: + host_aln=temp_with_prefix("{dataset}/alignments/host_aln.sam"), + params: + BWA=config.applications["bwa"], + conda: + config.dehuman["conda"] + group: + "dehuman" + resources: + disk_mb=1250, + mem_mb=config.dehuman["mem"], + time_min=config.dehuman["time"], + threads: config.dehuman["threads"] + shell: + # create index if not exists: + # test -f {input.ref_index} || {params.BWA} index {input.host_ref} + """ + echo "Checking rejects against host's genome --------------------------" + + {params.BWA} mem -t {threads} \ + -o {output.host_aln} \ + {input.host_ref} {input.reject_1} {input.reject_2} + + echo + """ + + +rule dh_filter: + input: + host_aln=temp_prefix("{dataset}/alignments/host_aln.sam"), + R1=partial(raw_data_file, pair=1) + if config["dehuman"]["catchup"] + else
temp_prefix("{dataset}/extracted_data/R1.fastq"), + R2=partial(raw_data_file, pair=2) + if config["dehuman"]["catchup"] + else temp_prefix("{dataset}/extracted_data/R2.fastq"), + output: + filter_count="{dataset}/alignments/dehuman.count", + filter_list=temp_with_prefix("{dataset}/alignments/dehuman.filter"), + # TODO shift to pipe + filtered_1=temp_with_prefix("{dataset}/raw_uploads/filtered_1.fastq.gz"), + filtered_2=temp_with_prefix("{dataset}/raw_uploads/filtered_2.fastq.gz"), + params: + SAMTOOLS=config.applications["samtools"], + remove_reads_script=cachepath( + "../scripts/remove_reads_list.pl", executable=True, localsource=True + ), + # TODO shoo out the cats + keep_host=int(config.dehuman["keep_host"]), + sort_tmp=temp_prefix("{dataset}/alignments/host_sort.tmp"), + host_aln_cram="{dataset}/alignments/host_aln.cram", + # set to 1 to trigger matches with human genome (used for testing): + F=2, + conda: + config.dehuman["conda"] + group: + "dehuman" + resources: + disk_mb=1250, + mem_mb=config.dehuman["mem"], + time_min=config.dehuman["time"], + threads: config.dehuman["threads"] + shell: + """ + # using zcat FILENAME.gz causes issues on Mac, see + # https://serverfault.com/questions/570024/ + # redirection fixes this: + unpack_rawreads() {{ + for fq in "${{@}}"; do + zcat -f < "${{fq}}" + done + }} + + echo + echo "Count aligned reads ---------------------------------------------" + echo + + count=$({params.SAMTOOLS} view -@ {threads} -c -f {params.F} {input.host_aln} | tee {output.filter_count}) + + if (( count > 0 )); then + echo + echo "-----------------------------------------------------------------" + echo "Needs special care: ${{count}} potential human reads found" + echo "-----------------------------------------------------------------" + echo + echo "Removing identified host reads from raw reads -------------------" + echo + + # get list + {params.SAMTOOLS} view -@ {threads} \ + -f {params.F} \ + {input.host_aln} \ + | cut -f 1 > {output.filter_list} + + unpack_rawreads {input.R1} \ + | {params.remove_reads_script} {output.filter_list} \ + | gzip \ + > {output.filtered_1} & + + unpack_rawreads {input.R2} \ + | {params.remove_reads_script} {output.filter_list} \ + | gzip \ + > {output.filtered_2} & + + wait + + if (( {params.keep_host} )); then + # keep the rejects for further analysis + + echo + echo "Keeping host-aligned virus' rejects ------------------------------" + echo + + + # (we compress reference-less, because the reference size is larger + # than the contaminant reads) + FMT=cram,no_ref,use_bzip2,use_lzma,level=9,seqs_per_slice=1000000 + {params.SAMTOOLS} view -@ {threads} \ + -h -f 2 \ + {input.host_aln} \ + | {params.SAMTOOLS} sort -@ {threads} \ + -T {params.sort_tmp} \ + --output-fmt ${{FMT}} \ + -o {params.host_aln_cram} + + echo + echo "Compressing host-depleted raw reads ------------------------------" + echo + + {params.SAMTOOLS} index -@ {threads} {params.host_aln_cram} + fi + else + echo + echo "No potential human reads found -----------------------------------" + echo "Copy raw reads file" + echo + unpack_rawreads {input.R1} | gzip > {output.filtered_1} & + unpack_rawreads {input.R2} | gzip > {output.filtered_2} & + wait + touch {output.filter_list} + fi + echo + """ + + +# TODO move cram compression into align.smk + + +rule dehuman: + input: + global_ref=reference_file, + ref_index=multiext(reference_file, *bwa_idx_ext), + filtered_1=rules.dh_filter.output.filtered_1, # =temp_prefix("{dataset}/raw_uploads/filtered_1.fastq.gz"), + 
filtered_2=rules.dh_filter.output.filtered_2, # =temp_prefix("{dataset}/raw_uploads/filtered_2.fastq.gz"), + output: + cram_sam=temp_with_prefix("{dataset}/raw_uploads/dehuman.sam"), + final_cram="{dataset}/raw_uploads/dehuman.cram", + checksum="{dataset}/raw_uploads/dehuman.cram.%s" % config.general["checksum"], + params: + BWA=config.applications["bwa"], + SAMTOOLS=config.applications["samtools"], + checksum_type=config.general["checksum"], + sort_tmp=temp_prefix("{dataset}/raw_uploads/dehuman.tmp"), + conda: + config.dehuman["conda"] + group: + "dehuman" + resources: + disk_mb=1250, + mem_mb=config.dehuman["mem"], + time_min=config.dehuman["time"], + threads: config.dehuman["threads"] + shell: + """ + echo "Compress filtered sequences --------------------------------------" + echo + + {params.BWA} mem -t {threads} \ + -C \ + -o {output.cram_sam} \ + {input.global_ref} {input.filtered_1} {input.filtered_2} + + # HACK handle incompatibilities between: + # - Illumina's 'bcl2fastq', which writes arbitrary strings + # - 'bwa mem' which keeps comments in the SAM file verbatim as in the FASTQ file + # - 'samtools' which expects comments to be properly marked as 'BC:Z:' + # as per SAM format specs + REGEXP=\'s{{(?<=\\t)([[:digit:]]:[[:upper:]]:[[:digit:]]:([ATCGN]+(\+[ATCGN]+)?|[[:digit:]]+))$}}{{BC:Z:\\1}}\' + FMT=cram,embed_ref,use_bzip2,use_lzma,level=9,seqs_per_slice=1000000 + + perl -p -e ${{REGEXP}} {output.cram_sam} \ + | {params.SAMTOOLS} sort -@ {threads} \ + -T {params.sort_tmp} \ + -M \ + --reference {input.global_ref} \ + --output-fmt ${{FMT}} \ + -o {output.final_cram} + + {params.checksum_type}sum {output.final_cram} > {output.checksum} + + echo + echo DONE ------------------------------------------------------------- + echo + """ diff --git a/workflow/rules/publish.smk b/workflow/rules/publish.smk new file mode 100644 index 000000000..cbe12e32f --- /dev/null +++ b/workflow/rules/publish.smk @@ -0,0 +1,157 @@ +if config.upload["local"]: + + localrules: + prepare_upload, + + +def bcft_suffix(): + return ( + "" + if config.upload["consensus"] == "majority" + else f'_{config.upload["consensus"]}' + ) + + +rule prepare_upload: + input: + R1=partial(raw_data_file, pair=1) if config.upload["orig_fastq"] else [], + R2=partial(raw_data_file, pair=2) if config.upload["orig_fastq"] else [], + orig_cram=[ + "{dataset}/raw_uploads/raw_reads.cram", + "{dataset}/raw_uploads/raw_reads.cram.%s" % config.general["checksum"], + ] + if config.upload["orig_cram"] + else [], + dehuman=[ + "{dataset}/raw_uploads/dehuman.cram", + "{dataset}/raw_uploads/dehuman.cram.%s" % config.general["checksum"], + ] + if config.output["dehumanized_raw_reads"] + else [], + consensus_indels="{dataset}/references/consensus%s.bcftools.fasta" % bcft_suffix(), + consensus_indels_chain="{dataset}/references/consensus%s.bcftools.chain" % bcft_suffix(), + consensus_aligned="{dataset}/references/ref_%s_dels.fasta" % config.upload["consensus"], + csum=[ + "{dataset}/references/consensus%(suf)s.bcftools.fasta.%(csum)s" % {"suf": bcft_suffix(), "csum": config.general["checksum"]}, + "{dataset}/references/ref_majority_dels.fasta.%(csum)s" % {"suf": bcft_suffix(), "csum": config.general["checksum"]}, + ] + if config.upload["checksum"] + else [], + frameshift_deletions_check="{dataset}/references/frameshift_deletions_check.tsv" + if config.output["QA"] + else [], + output: + upload_prepared_touch="{dataset}/upload_prepared.touch", + params: + sample_id=ID, + script=cachepath(config.upload["script"], executable=True), +
options=config.upload["options"], + conda: + # NOTE realpath is a gnu coreutils executable and not available out of the box. We need a conda environment anyway + config.upload["conda"] + resources: + disk_mb=1000, + mem_mb=config.upload["mem"], + time_min=config.upload["time"], + threads: config.upload["threads"] + shell: + """ + {params.script} {params.options} "{output.upload_prepared_touch}" "{params.sample_id}" "{wildcards.dataset}" {input} + """ + + +rule unfiltered_cram: + input: + global_ref=reference_file, + ref_index=multiext(reference_file, *bwa_idx_ext), + # NOTE not relying on rule extract - if done as a catchup these might not exist + R1=partial(raw_data_file, pair=1), + R2=partial(raw_data_file, pair=2), + output: + cram_sam=temp_with_prefix("{dataset}/raw_uploads/raw_reads.sam"), + final_cram="{dataset}/raw_uploads/raw_reads.cram", + checksum="{dataset}/raw_uploads/raw_reads.cram.%s" % config.general["checksum"], + params: + BWA=config.applications["bwa"], + SAMTOOLS=config.applications["samtools"], + checksum_type=config.general["checksum"], + sort_tmp=temp_prefix("{dataset}/raw_uploads/raw_reads.tmp"), + conda: + config.dehuman["conda"] + resources: + disk_mb=1250, + mem_mb=config.bwa_align["mem"], + time_min=config.bwa_align["time"], + threads: config.bwa_align["threads"] + shell: + """ + # using zcat FILENAME.gz causes issues on Mac, see + # https://serverfault.com/questions/570024/ + # redirection fixes this: + unpack_rawreads() {{ + for fq in "${{@}}"; do + zcat -f < "${{fq}}" + done + }} + + echo "Compress un-filtered sequences -----------------------------------" + echo + + {params.BWA} mem -t {threads} \ + -C \ + -o {output.cram_sam} \ + {input.global_ref} \ + <(unpack_rawreads {input.R1}) \ + <(unpack_rawreads {input.R2}) + + # HACK handle incompatibilities between: + # - Illumina's 'bcl2fastq', which write arbitrary strings + # - 'bwa mem' which keep comments in the SAM file verbatim as in the FASTQ file + # - 'samtools' which expects comment to be properly marked as 'BC:Z:' + # as per SAM format specs + REGEXP=\'s{{(?<=\\t)([[:digit:]]:[[:upper:]]:[[:digit:]]:([ATCGN]+(\+[ATCGN]+)?|[[:digit:]]+))$}}{{BC:Z:\\1}}\' + FMT=cram,embed_ref,use_bzip2,use_lzma,level=9,seqs_per_slice=1000000 + + perl -p -e ${{REGEXP}} {output.cram_sam} \ + | {params.SAMTOOLS} sort -@ {threads} \ + -T {params.sort_tmp} \ + -M \ + --reference {input.global_ref} \ + --output-fmt ${{FMT}} \ + -o {output.final_cram} + + {params.checksum_type}sum {output.final_cram} > {output.checksum} + + echo + echo DONE ------------------------------------------------------------- + echo + """ + + +rule checksum: + input: + "{file}", + output: + "{file}.%s" % config.general["checksum"], + params: + checksum_type=config.general["checksum"], + conda: + config["upload"]["conda"] + resources: + disk_mb=10, + mem_mb=config.checksum["mem"], + time_min=config.checksum["time"], + threads: 1 + shell: + """ + {params.checksum_type}sum {input} > {output} + """ + + +ruleorder: unfiltered_cram > checksum +ruleorder: dehuman > checksum diff --git a/workflow/rules/quality_assurance.smk b/workflow/rules/quality_assurance.smk index 125419eca..e5e0bdd0a 100644 --- a/workflow/rules/quality_assurance.smk +++ b/workflow/rules/quality_assurance.smk @@ -31,13 +31,10 @@ rule gunzip: rule extract: input: + # TODO replace with raw_data_file construct_input_fastq, output: - temp( - os.path.join( - config.general["temp_prefix"], "{dataset}/extracted_data/R{pair}.fastq" - ) - ), + 
temp_with_prefix("{dataset}/extracted_data/R{pair}.fastq"), log: outfile="{dataset}/extracted_data/extract_R{pair}.out.log", errfile="{dataset}/extracted_data/extract_R{pair}.err.log", @@ -51,6 +48,7 @@ rule extract: time_min=config.extract["time"], threads: 1 shell: + # TODO replace with better dedicated software """ cat {input} | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > {output} 2> >(tee {log.errfile} >&2) """ @@ -71,12 +69,8 @@ if config.input["paired"]: rule preprocessing: input: - R1=os.path.join( - config.general["temp_prefix"], "{dataset}/extracted_data/R1.fastq" - ), - R2=os.path.join( - config.general["temp_prefix"], "{dataset}/extracted_data/R2.fastq" - ), + R1=temp_prefix("{dataset}/extracted_data/R1.fastq"), + R2=temp_prefix("{dataset}/extracted_data/R2.fastq"), output: R1gz="{dataset}/preprocessed_data/R1.fastq.gz", R2gz="{dataset}/preprocessed_data/R2.fastq.gz", @@ -93,7 +87,6 @@ if config.input["paired"]: "minimal" benchmark: "{dataset}/preprocessed_data/prinseq.benchmark" - # group: 'preprocessing' resources: disk_mb=20000, mem_mb=config.preprocessing["mem"], @@ -127,9 +120,7 @@ else: rule preprocessing_se: input: - R1=os.path.join( - config.general["temp_prefix"], "{dataset}/extracted_data/R1.fastq" - ), + R1=temp_prefix("{dataset}/extracted_data/R1.fastq"), output: R1gz="{dataset}/preprocessed_data/R1.fastq.gz", params: @@ -145,7 +136,6 @@ else: "minimal" benchmark: "{dataset}/preprocessed_data/prinseq.benchmark" - # group: 'preprocessing' resources: disk_mb=10000, mem_mb=config.preprocessing["mem"], @@ -170,14 +160,13 @@ else: gzip {wildcards.dataset}/preprocessed_data/R1.fastq """ +# 3. QC reports + -# 3. QC reports rule fastqc: input: - os.path.join( - config.general["temp_prefix"], "{dataset}/extracted_data/R{pair}.fastq" - ), + temp_prefix("{dataset}/extracted_data/R{pair}.fastq"), output: "{dataset}/extracted_data/R{pair}_fastqc.html", params: diff --git a/workflow/schemas/config_schema.json b/workflow/schemas/config_schema.json index 95bfb5da0..d82c4c3de 100644 --- a/workflow/schemas/config_schema.json +++ b/workflow/schemas/config_schema.json @@ -6,7 +6,7 @@ "examples": [{"input":{"samples_file":"samples.tsv"}}], "properties": { "general": { - "description": "This section of the configuration provides general options that control the overall behavior of the pipeline", + "description": "This section of the configuration provides general options that control the overall behavior of the pipeline.", "properties": { "virus_base_config": { "type": "string", @@ -18,7 +18,7 @@ "type": "string", "enum": ["ngshmmalign","bwa","bowtie"], "default": "ngshmmalign", - "description": "There are three options for mapping reads, either using [`ngshmmalign`](https://github.com/cbg-ethz/ngshmmalign), [BWA MEM (`bwa`)](https://github.com/lh3/bwa) [^1], or [Bowtie 2 (`bowtie`)](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) [^2]. To use a different aligner than the default, indicate which aligner you want to use by setting the property aligner.\n\n[^1]: Li, H. Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. 2013.\n[^2]: Langmead, B. and Salzberg, S. Fast gapped-read alignment with Bowtie 2. Nature Methods. 
2012.\n\n**Note**: Some virus-specific base configuration specified in `virus_base_config` might change this option's default to a more appropriate aligner for that virus, e.g, depending on its usual diversity and mutation rate.\nYou are still free to override that default in your configuration shall the need arise.", + "description": "There are three options for mapping reads, either using [`ngshmmalign`](https://github.com/cbg-ethz/ngshmmalign), [BWA MEM (`bwa`)](https://github.com/lh3/bwa) [^1], or [Bowtie 2 (`bowtie`)](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) [^2]. To use a different aligner than the default, indicate which aligner you want to use by setting the property aligner.\n\n[^1]: Li, H. Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. 2013.\n[^2]: Langmead, B. and Salzberg, S. Fast gapped-read alignment with Bowtie 2. Nature Methods. 2012.\n\n**Note**: Some virus-specific base configuration specified in `virus_base_config` might change this option's default to a more appropriate aligner for that virus, e.g., depending on its usual diversity and mutation rate.\nYou are still free to override that default in your configuration should the need arise.", "examples": ["bowtie"] }, "snv_caller": { @@ -41,6 +41,13 @@ "description": "This option should be used to specify the default number of threads for all multi-threaded rules. That is, unless the number of threads is specified for each rule, this value is set as default.", "examples": [4] }, + "checksum": { + "type": "string", + "default": "md5", + "enum": ["md5", "sha1", "sha256", "sha224", "sha384", "sha512", "xxh64", "xxh32", "xxh128"], + "description": "Sets the algorithm to be used when computing checksums for uploadable data.", + "examples": ["sha256"] + }, "temp_prefix": { "type": "string", "default": "", @@ -52,18 +59,18 @@ "type": "object" }, "input": { - "description": "Properties in this section of the configuration control the input of the pipeline", + "description": "Properties in this section of the configuration control the input of the pipeline.", "properties": { "datadir": { "type": "string", "default": "samples/", - "description": "The input file for the workflow will be searched in this directory.\n\nV-pipe expects the input samples to be organized in a two-level directory hierarchy.\n\n - The first level can be, e.g., patient samples or biological replicates of an experiment.\n - The second level can be, e.g., different sampling dates or different sequencing runs of the same sample.\n - Inside that directory, the sub-directory `raw_data` holds the sequencing data in FASTQ format (optionally compressed with GZip).\n\nFor example:\n```console\nsamples\n├── patient1\n│ ├── 20100113\n│ │ └──raw_data\n│ │ ├──patient1_20100113_R1.fastq\n│ │ └──patient1_20100113_R2.fastq\n│ └── 20110202\n│ └──raw_data\n│ ├──patient1_20100202_R1.fastq\n│ └──patient1_20100202_R2.fastq\n└── patient2\n └── 20081130\n └──raw_data\n ├──patient2_20081130_R1.fastq.gz\n └──patient2_20081130_R2.fastq.gz\n```\n\n", + "description": "The input file for the workflow will be searched in this directory.\n\nV-pipe expects the input samples to be organized in a two-level directory hierarchy.\n\n - The first level can be, e.g., patient samples or biological replicates of an experiment.\n - The second level can be, e.g., different sampling dates or different sequencing runs of the same sample.\n - Inside that directory, the sub-directory `raw_data` holds the sequencing data in FASTQ format (optionally compressed
with GZip).\n\nFor example:\n```console\nsamples\n├── patient1\n│ ├── 20100113\n│ │ └──raw_data\n│ │ ├──patient1_20100113_R1.fastq\n│ │ └──patient1_20100113_R2.fastq\n│ └── 20110202\n│ └──raw_data\n│ ├──patient1_20100202_R1.fastq\n│ └──patient1_20100202_R2.fastq\n└── patient2\n └── 20081130\n └──raw_data\n ├──patient2_20081130_R1.fastq.gz\n └──patient2_20081130_R2.fastq.gz\n```", "examples": [ "tests/data/hiv/", "tests/data/sars-cov-2/" ] }, "paired": { "type": "boolean", "default": true, - "description": "Indicate whether the input sequencing reads correspond to paired-end reads.\n\nPaired-ended reads need to be in split files with `_R1` and `_R2` suffixes:\n```console\nraw_data\n├──patient2_20081130_R1.fastq.gz\n└──patient2_20081130_R2.fastq.gz\n```\n\n", + "description": "Indicate whether the input sequencing reads correspond to paired-end reads.\n\nPaired-ended reads need to be in split files with `_R1` and `_R2` suffixes:\n```console\nraw_data\n├──patient2_20081130_R1.fastq.gz\n└──patient2_20081130_R2.fastq.gz\n```", "examples": [ false ] }, "fastq_suffix": { @@ -75,7 +82,7 @@ "samples_file": { "type": "string", "default": "config/samples.tsv", - "description": "File containing sample unique identifiers and dates as tab-separated values, e.g.,\n```tsv\npatient1 20100113\npatient1 20110202\npatient2 20081130\n```\n\nHere, we have two samples from patient 1 and one sample from patient 2. By default, V-pipe searches for a file named samples.tsv, if this file does not exist, a list of samples is built by globbing datadir directory contents.\n\nOptionally, the samples file can contain a third column specifying the read length. This is particularly useful when samples are sequenced using protocols with different read lengths.\n\nStandardized Snakemake workflows place their tables inside the `config/` subdirectory, but using this options you can specify alternate locations, e.g., the current working directory (as done in legacy V-pipe v1.x/2.x)\n", + "description": "File containing sample unique identifiers and dates as tab-separated values, e.g.,\n```tsv\npatient1 20100113\npatient1 20110202\npatient2 20081130\n```\n\nHere, we have two samples from patient 1 and one sample from patient 2. By default, V-pipe searches for a file named samples.tsv, if this file does not exist, a list of samples is built by globbing datadir directory contents.\n\nOptionally, the samples file can contain a third column specifying the read length. 
This is particularly useful when samples are sequenced using protocols with different read lengths.\n\nStandardized Snakemake workflows place their tables inside the `config/` subdirectory, but using this option you can specify alternate locations, e.g., the current working directory (as done in legacy V-pipe v1.x/2.x).", "examples": [ "samples.tsv" ] }, "read_length": { @@ -125,7 +132,7 @@ "type": "object" }, "output": { - "description": "Properties in this section of the configuration control the output of the pipeline", + "description": "Properties in this section of the configuration control the output of the pipeline.", "properties": { "datadir": { "type": "string", @@ -136,37 +143,37 @@ "cohortdir": { "type": "string", "default": "", - "description": "In addition, V-pipe can optionally generate a few cohort-wide results, such as a current cohort consensus fasta file, or a TSV file containing the frequencies of all minor alleles that differ from the consensus among analyzed samples.\nBy default, these output files are located at the base of the [`output` `datadir`](#output_datadir), outside of the two-level per sample structure:\n```console\nresults\n├──minority_variants.tsv\n├──cohort_consensus.fasta\n├──patient1\n│ ├──20100113\n│ │ ├──alignments\n…\n```\n\nIf you prefer instead, e.g., such cohort-wide results behind written in a subdirectory of the working directory at the same level as the `datadir`s, you can use this options you can specify alternate subdirectory relative to the `datadir` property. (Use `..` prefix if you want instead your cohort-wide results to be in a directory at the sample level as `samples/` and `results/`. See the example below to recreate the `variants/` directory used by legacy V-pipe v1.x/2.x)", + "description": "In addition, V-pipe can optionally generate a few cohort-wide results, such as a current cohort consensus fasta file, or a TSV file containing the frequencies of all minor alleles that differ from the consensus among analyzed samples.\nBy default, these output files are located at the base of the [`output` `datadir`](#output_datadir), outside of the two-level per sample structure:\n```console\nresults\n├──minority_variants.tsv\n├──cohort_consensus.fasta\n├──patient1\n│ ├──20100113\n│ │ ├──alignments\n…\n```\n\nIf you prefer instead, e.g., such cohort-wide results being written in a subdirectory of the working directory at the same level as the `datadir`s, you can use this option to specify an alternate subdirectory relative to the `datadir` property. (Use `..` prefix if you want instead your cohort-wide results to be in a directory at the same level as `samples/` and `results/`. See the example below to recreate the `variants/` directory used by legacy V-pipe v1.x/2.x).", "examples": ["../variants"] }, "QA": { "type": "boolean", "default": false, - "description": "V-pipe can produce several outputs to assess the quality of the output of its steps, e.g., checking whether a sample's consensus sequence generated by bctfools does result in frameshifting indels and writing a report in sample's `…/references/frameshift_deletions_check.tsv`. Such reports can be useful when submitting sequences to GISAID.\n\nThis option turns on such QA features", + "description": "V-pipe can produce several outputs to assess the quality of the output of its steps, e.g., checking whether a sample's consensus sequence generated by bcftools does result in frameshifting indels and writing a report in sample's `…/references/frameshift_deletions_check.tsv`.
Such reports can be useful when submitting sequences to GISAID.\n\nThis option turns on such QA features.", "examples": [true] }, "snv": { "type": "boolean", "default": false, - "description": "This option selects whether the SNV caller step should be executed and its output written to each sample's `…/variants/SNVs/snvs.csv`", + "description": "This option selects whether the SNV caller step should be executed and its output written to each sample's `…/variants/SNVs/snvs.csv`.", "examples": [true] }, "local": { "type": "boolean", "default": false, - "description": "This option activates local haplotype reconstruction (only available when using ShoRAH)", + "description": "This option activates local haplotype reconstruction (only available when using ShoRAH).", "examples": [true] }, "global": { "type": "boolean", "default": false, - "description": "This option turns on global haplotype reconstruction", + "description": "This option turns on global haplotype reconstruction.", "examples": [true] }, "visualization": { "type": "boolean", "default": false, - "description": "This option selects whether to generate HTML visualization of the SNVs in each sample's `…/visualization/index.html`", + "description": "This option selects whether to generate HTML visualization of the SNVs in each sample's `…/visualization/index.html`.", "examples": [true] }, "diversity": { @@ -174,13 +181,25 @@ "default": false, "description": "This option turns on the computation of diversity measures in each sample.", "examples": [true] + }, + "dehumanized_raw_reads": { + "type": "boolean", + "default": false, + "description": "This option turns on dehumanization of the raw reads (i.e. removal of host's reads) and generates the file `dehuman.cram`. This is useful to prepare raw reads for upload to public databases such as the ENA (European Nucleotide Archive).\n\nThis only applies to the upload and does not affect the main workflow.", + "examples": [true] + }, + "upload": { + "type": "boolean", + "default": false, + "description": "This option can be used for assistance in incremental upload of data. See section `upload` for an example.", + "examples": [true] + } }, "default": {}, "type": "object" }, "applications": { - "description": "The path to the different software packages can be specified using this section.\n\nIt is especially useful when dependencies are not obtained via conda such as VICUNA, and when the software packages are not in the `PATH`.\n\n**Note** we strongly recommend to use conda environments, by adding the `--use-conda` flag to the V-pipe execution command, e.g. `./vpipe --use-conda`. If you prefer to use your own installations, this section allows you to specify the location of the executables", + "description": "The path to the different software packages can be specified using this section.\n\nIt is especially useful when dependencies are not obtained via conda such as VICUNA, and when the software packages are not in the `PATH`.\n\n**Note** we strongly recommend using conda environments, by adding the `--use-conda` flag to the V-pipe execution command, e.g. `./vpipe --use-conda`.
If you prefer to use your own installations, this section allows you to specify the location of the executables.", "examples":[{"bwa":"/path/to/bwa","haploclique":"/path/to/haploclique"}], "properties": { "gunzip": { @@ -198,7 +217,7 @@ "vicuna": { "type": "string", "default": "vicuna", - "description":"Due to a special license, VICUNA is **not** available from bioconda and must be installed from [its original website](https://www.broadinstitute.org/viral-genomics/vicuna).\nUse this option to specify where you have installed its executable" + "description":"Due to a special license, VICUNA is **not** available from bioconda and must be installed from [its original website](https://www.broadinstitute.org/viral-genomics/vicuna).\nUse this option to specify where you have installed its executable." }, "indelfixer": { "type": "string", @@ -464,13 +483,13 @@ "leave_msa_temp": { "type": "boolean", "default": false, - "description": "this option is useful for debugging purposes", + "description": "This option is useful for debugging purposes.", "examples": [true] }, "extra": { "type": "string", "default": "", - "description": "pass additional options to run ngshmmalign\n\nV-pipe uses option `-R `, thus option `-r arg` is not allowed. Also, instead of passing `-l` via the property `extra`, set [`leave_msa_temp`](#hmm_align_leave_msa_temp) to `True`. Lastly, please do not modify options `-o arg`, `-w arg`, `-t arg`, and `-N arg`. These are already managed by V-pipe." + "description": "Pass additional options to run ngshmmalign.\n\nV-pipe uses option `-R arg`, thus option `-r arg` is not allowed. Also, instead of passing `-l` via the property `extra`, set [`leave_msa_temp`](#hmm_align_leave_msa_temp) to `True`. Lastly, please do not modify options `-o arg`, `-w arg`, `-t arg`, and `-N arg`. These are already managed by V-pipe." } }, @@ -622,7 +641,7 @@ "extra": { "type": "string", "default": "", - "description": " With property `extra`, users can pass additional options to run BWA MEM. For more details on BWA MEM configurable options refer to the software [documentation](http://bio-bwa.sourceforge.net/bwa.shtml)." + "description": "With property `extra`, users can pass additional options to run BWA MEM. For more details on BWA MEM configurable options refer to the software [documentation](http://bio-bwa.sourceforge.net/bwa.shtml)." } }, "default": {}, @@ -667,13 +686,13 @@ "phred": { "type": "string", "default": "--phred33", "enum": ["--phred33","--phred64"], - "description":"indicate if qualities are Phred+33 (default) or Phred+64 (`--phred64`)", + "description":"Indicate if qualities are Phred+33 (default) or Phred+64 (`--phred64`).", "examples": ["--phred64"] }, "preset": { "type": "string", "default": "--local --sensitive-local", - "description": "specify Bowtie 2 presets" + "description": "Specify Bowtie 2 presets." }, "maxins": { "type": "integer" }, "extra": { "type": "string", "default": "", - "description": "pass additional options to run Bowtie 2. V-pipe handles the input and output files, as well as the reference sequence. Thus, do not modify these options\nFor more details on Bowtie 2 configurable options refer to the software [documentation](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml)." + "description": "Pass additional options to run Bowtie 2. V-pipe handles the input and output files, as well as the reference sequence.
Thus, do not modify these options.\nFor more details on Bowtie 2 configurable options refer to the software [documentation](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml)." } }, "default": {}, @@ -704,22 +723,22 @@ "min_coverage": { "type": "integer", "default": 50, - "description": "minimum read depth for reporting variants per locus" + "description": "Minimum read depth for reporting variants per locus." }, "n_coverage": { "type": "integer", "default": 5, - "description": "read count below which ambiguous base 'n' is reported" + "description": "Read count below which ambiguous base 'n' is reported." }, "qual_thrd": { "type": "integer", "default": 15, - "description": "minimum phred quality score for a base to be included" + "description": "Minimum phred quality score for a base to be included." }, "min_freq": { "type": "number", "default": 0.05, - "description": "minimum frequency for an ambiguous nucleotide" + "description": "Minimum frequency for an ambiguous nucleotide." } }, "default": {}, @@ -866,13 +885,13 @@ "min_coverage": { "type": "integer", "default": 100, - "description": "minimum read depth for reporting variants per locus", + "description": "Minimum read depth for reporting variants per locus.", "examples": [50] }, "frequencies": { "type": "boolean", "default": false, - "description": "output a numpy array file containing frequencies of all bases, including gaps and also the most abundant base across samples", + "description": "Output a numpy array file containing frequencies of all bases, including gaps and also the most abundant base across samples.", "examples": [true] } }, @@ -899,19 +918,19 @@ "overlap": { "type": "boolean", "default": false, - "description": "construct intervals based on overlapping windows of the read alignment. By default, regions with high coverage are built based on the position-wise read depth", + "description": "Construct intervals based on overlapping windows of the read alignment. By default, regions with high coverage are built based on the position-wise read depth.", "examples": [true] }, "coverage": { "type": "integer", "default": 50, - "description": "minimum read depth. A region spanning the reference genome is returned if `coverage` is set to 0", + "description": "Minimum read depth. A region spanning the reference genome is returned if `coverage` is set to 0.", "examples": [ 0 ] }, "liberal": { "type": "boolean", "default": true, - "description": "indicate whether to apply a more liberal shifting on intervals' right-endpoint", + "description": "Indicate whether to apply a more liberal shifting on intervals' right-endpoint.", "examples": [false] } }, @@ -938,18 +957,18 @@ "consensus": { "type": "boolean", "default": true, - "description": "indicate whether to use the cohort-consensus sequence from the analyzed samples (output from `minor_variants` rule located in the cohort-wide output `results/cohort_onsensus.fasta`) or the reference sequence by setting this option to False", + "description": "Indicate whether to use the cohort-consensus sequence from the analyzed samples (output from `minor_variants` rule located in the cohort-wide output `results/cohort_consensus.fasta`) or the reference sequence by setting this option to False.", "examples": [false] }, "alpha": { "type": "number", "default": 0.1, - "description": "hyperparameter used for instantiating a new cluster" + "description": "Hyperparameter used for instantiating a new cluster."
}, "ignore_indels": { "type": "boolean", "default": false, - "description": "ignore SNVs adjacent to indels" + "description": "Ignore SNVs adjacent to indels." }, "posterior_threshold": { "type": "number", @@ -958,18 +977,18 @@ "coverage": { "type": "integer", "default": 0, - "description": "omit windows with coverage less than this value", + "description": "Omit windows with coverage less than this value.", "examples": [50] }, "shift": { "type": "integer", "default": 3, - "description": "ShoRAH performs local haplotype reconstruction on windows of the read alignment. The overlap between these windows is defined by the window shifts. By default, it is set to 3, i.e., apart from flanking regions each position is covered by 3 windows" + "description": "ShoRAH performs local haplotype reconstruction on windows of the read alignment. The overlap between these windows is defined by the window shifts. By default, it is set to 3, i.e., apart from flanking regions each position is covered by 3 windows." }, "keep_files": { "type": "boolean", "default": false, - "description": "indicate whether to move files produced in previous/interrupted runs to subdirectory named `old`", + "description": "Indicate whether to move files produced in previous/interrupted runs to a subdirectory named `old`.", "examples": [true] }, "localscratch": { @@ -1015,13 +1034,13 @@ "consensus": { "type": "boolean", "default": true, - "description": "indicate whether to use the cohort-consensus sequence from the analyzed samples (output from `minor_variants` rule located in the cohort-wide output `results/cohort_onsensus.fasta`) or the reference sequence by setting this option to False", + "description": "Indicate whether to use the cohort-consensus sequence from the analyzed samples (output from `minor_variants` rule located in the cohort-wide output `results/cohort_consensus.fasta`) or the reference sequence by setting this option to False.", "examples": [false] }, "extra": { "type": "string", "default": "", - "description": "pass additional options to run `lofreq call`" + "description": "Pass additional options to run `lofreq call`." } }, "default": {}, @@ -1080,7 +1099,7 @@ "split": { "type": "integer", "default": 20, - "description": "size of the batches of reads to be processed by SAVAGE. It is recommended that 500 < coverage/`split` < 1000" + "description": "Size of the batches of reads to be processed by SAVAGE. It is recommended that 500 < coverage/`split` < 1000." } }, "default": {}, @@ -1103,32 +1122,32 @@ "relax": { "type": "boolean", "default": true, - "description": "if set to `True` (default) a predefined set of parameter values is used for drawing edges between reads in the read graph" + "description": "If set to `True` (default) a predefined set of parameter values is used for drawing edges between reads in the read graph." }, "no_singletons": { "type": "boolean", "default": true, - "description": "singletons are defined as proposed haplotypes which are supported by a single read. If this property is set to `True`, singletons are discarded" + "description": "Singletons are defined as proposed haplotypes which are supported by a single read. If this property is set to `True`, singletons are discarded." }, "no_prob0": { "type": "boolean", "default": true, - "description": "if set to `True` (default) probability of the overhangs is ignored" + "description": "If set to `True` (default) probability of the overhangs is ignored."
}, "clique_size_limit": { "type": "integer", "default": 3, - "description": "sets a threshold to limit the size of cliques" + "description": "Sets a threshold to limit the size of cliques." }, "max_num_cliques": { "type": "integer", "default": 10000, - "description": "indicates the maximum number of clique to be considered in the next iteration" + "description": "Indicates the maximum number of cliques to be considered in the next iteration." }, "extra_parameters": { "type": "string", "default": "", - "description": "additional parameters to be passed to haploclique.\n\nWarning: this won't overwrite the other options (e.g. `clique_size_limi` and `max_num_cliques` should still be set via their [own](#haploclique_clique_size_limit) [respective](#haploclique_max_num_cliques) properties, do not pass parameters `--limit_clique_size=` nor `--max_cliques=` via this `extra` property)." + "description": "Additional parameters to be passed to haploclique.\n\nWarning: this won't overwrite the other options (e.g. `clique_size_limit` and `max_num_cliques` should still be set via their [own](#haploclique_clique_size_limit) [respective](#haploclique_max_num_cliques) properties, do not pass parameters `--limit_clique_size=` nor `--max_cliques=` via this `extra_parameters` property)." } }, "default": {}, @@ -1151,18 +1170,18 @@ "region_start": { "type": "integer", "default": 0, - "description": "use to specify a region of interest" + "description": "Use to specify a region of interest." }, "region_end": { "type": "integer", "default": -1, - "description": "use to specify a region of interest", + "description": "Use to specify a region of interest.", "examples": [9719,29836] }, "msa": { "type": "string", "default": "", - "description": "when the ground truth is available (e.g., simulation studies), a multiple sequence alignment of types making up the population can be provided, and additional checks are performed" + "description": "When the ground truth is available (e.g., simulation studies), a multiple sequence alignment of types making up the population can be provided, and additional checks are performed." } }, "default": {}, @@ -1228,6 +1247,138 @@ }, "default": {}, "type": "object" + }, + "dehuman": { + "properties": { + "conda": { + "type": "string", + "default": "{VPIPE_BASEDIR}/envs/dehuman.yaml" + }, + "mem": { + "type": "integer", + "default": 4096 + }, + "time": { + "type": "integer", + "default": 235 + }, + "threads": { + "type": "integer", + "default": 4 + }, + "ref_host": { + "type": "string", + "default": "references/human.fa.gz", + "description": "Host's genome used to remove reads (e.g.
human genome)\n\n**Note:** if this file is absent, it is possible to fetch it from a remote server, see property `ref_host_url` below.", + "examples": ["/cluster/project/igenomes/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/genome.fa"] + }, + "ref_host_url": { + "type": "string", + "default": "http://ftp.ensembl.org/pub/release-105/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz", + "description": "If the host's genome specified in property `ref_host` isn't present, fetch it from a remote server.\n\n**Note** remember to set aside enough memory for the indexing rule, see section `ref_bwa_index` property `mem`.", + "examples": [ "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.29_GRCh38.p14/GCA_000001405.29_GRCh38.p14_genomic.fna.gz"] + }, + "keep_host": { + "type": "boolean", + "default": false, + "description": "Indicate whether to store the host-aligned reads in a CRAM file `…/alignments/host_aln.cram`.", + "examples": [true] + }, + "catchup": { + "type": "boolean", + "default": false, + "description": "Use this option when generating dehumanized raw reads (`dehuman.cram`) on old samples _**that have already**_ been processed in the past - a _catch up_.\n\nNormally, removing host-mapping reads requires analyzing reads which were rejected by V-pipe's main processing (as specified in section `general`, property `aligner`). But this output is considered temporary and will get deleted by Snakemake once the processing of a sample has finished. To generate `dehuman.cram` V-pipe would need to run the aligner again, which would both regenerate the data necessary for this output and generate a new alignment, which would trigger the whole workflow again.\nUse this property `catchup` to only generate the input necessary for `dehuman.cram`, leaving untouched the alignment and everything else that has already been processed.", + "examples": [true] + } + }, + "default": {}, + "type": "object" + }, + "checksum": { + "properties": { + "mem": { + "type": "integer", + "default": 256 + }, + "time": { + "type": "integer", + "default": 60 + }, + "conda": { + "type": "string", + "default": "{VPIPE_BASEDIR}/envs/upload.yaml" + } + }, + "default": {}, + "type": "object" + }, + "upload": { + "description": "This section is used to assist and prepare uploads of the data, e.g. to European Nucleotide Archive. By default it calls a script that creates symlinks making it easy to identify new/updated samples between calls of V-pipe. But by using the properties `script` and `options` and adapting the environment provided in property `conda`, it is possible to heavily customize the actions (e.g. it is possible to upload to an SFTP server by calling `sftp` from a modified script). For inspiration, see the default script `prepare_upload_symlinks.sh`.", + "properties": { + "mem": { + "type": "integer", + "default": 256 + }, + "time": { + "type": "integer", + "default": 60 + }, + "conda": { + "type": "string", + "default": "{VPIPE_BASEDIR}/envs/upload.yaml", + "description": "The default environment only provides hashing functions (`xxhash`, linux coreutils' `sha`_{nnn}_`sum` collection, etc.) but depending on your needs you might want to provide a custom environment with additional tools (e.g.
`sftp`, `rsync`, `curl`, `lftp`, custom specialized cloud uploaders, etc.)", + "examples": ["custom_envs/ftp.yaml"] + }, + "threads": { + "type": "integer", + "default": 1 + }, + "local": { + "type": "boolean", + "default": true, + "description": "Don't dispatch the rule to the cluster for execution, run locally.", + "examples": [false] + }, + "consensus": { + "type": "string", + "default": "ambig", + "enum": ["ambig", "majority"], + "description": "When preparing data for upload, specifies which consensus sequence should be uploaded.", + "examples": ["majority"] + }, + "checksum": { + "type": "boolean", + "default": false, + "description": "Generate checksum for each individual consensus sequence (if a consensus is regenerated, it will help determine whether the new file has changed content or is virtually the same as the previous).", + "examples": [true] + }, + "orig_fastq": { + "type": "boolean", + "default": false, + "description": "Also include the original `.fastq.gz` sequencing reads files from `raw_data/` in the list of files to be uploaded. See property `orig_cram` below for a compressed version and see output `dehumanized_raw_reads` and section `dehuman` for depleting reads from the host.", + "examples": [true] + }, + "orig_cram": { + "type": "boolean", + "default": false, + "description": "Also include a compressed version of the original sequencing raw reads files from `raw_data/`. Similar to property `orig_fastq` above, but with reference-based compression.", + "examples": [true] + }, + "script": { + "type": "string", + "default": "{VPIPE_BASEDIR}/scripts/prepare_upload_symlinks.sh", + "description": "Custom script that assists and prepares uploads.\n\nIt will receive the following positional parameters:\n - _<output>_: the output file that must be created by the script.\n - _<sample id>_: a string (with no path separator slashes) that can be used as a name, uniquely identifying the sample and the date.\n - _<sample dir>_: the base directory of the sample.\n - _<file>_...: a list of files to consider for upload\n\nFor an example, see the default script `prepare_upload_symlinks.sh`, it generates symlinks that help tracking which samples are new and/or updated between runs of V-pipe and thus should be considered for upload.", + "examples": ["custom_scripts/uploader.py"] + }, + "options": { + "type": "string", + "default": "", + "description": "Named options to be passed to the script, before the positional parameters. E.g. for an extra configuration file with SFTP server information.", + "examples": ["--config custom_scripts/uploader.yaml --"] + } + }, + "default": {}, + "type": "object" + } } } diff --git a/workflow/scripts/prepare_upload_symlinks.sh b/workflow/scripts/prepare_upload_symlinks.sh new file mode 100755 index 000000000..46ac03e1d --- /dev/null +++ b/workflow/scripts/prepare_upload_symlinks.sh @@ -0,0 +1,114 @@ +#!/usr/bin/env bash + +# +# Example script to prepare and assist uploads.
+# - it will get called whenever the consensus FASTA files or the CRAM-compressed raw reads are updated by snakemake
+# - it will create a per-sample directory '.../uploads/' with symlinks to all uploadable files
+# - it will create a global directory 'uploads' with symlinks to samples that were updated
+#   - these files aren't tracked by snakemake's DAG
+#     - thus they can be deleted without triggering a re-build by snakemake
+#     - but they will be re-created whenever an input file dependency changes
+#   - it is possible to iteratively scan this global directory between runs of V-pipe to determine
+#     which are the new/updated samples to consider for upload
+# - meanwhile the OUTPUT file is tracked by snakemake's DAG
+#   - it must be provided
+#   - its destruction will retrigger the prepare_upload rule and this script
+#   - it can optionally be used by the script to store arbitrary data (e.g. json)
+#
+
+do_random_nonce=0
+exec_cmd=
+
+usage() { echo "Usage: $0 [ -h ] [ -r | -R ] [ -e <script> ] [ -- ] <output> <sample_id> <sample_dir> <file> [ <file> ... ]
+
+options:
+  -r : append a random nonce at the end of the global symlinks;
+       multiple updates of the same sample and date will generate
+       multiple unique symlinks, one for each update.
+  -R : no random nonce at the end of the global symlinks;
+       they will not be unique, a sample and date will always have
+       a single symlink, no matter how many times it was updated.
+       [ Default: random nonces are $( if (( do_random_nonce )); then echo "enabled"; else echo "disabled"; fi ) ]
+
+  -e <script> : run <script> at the end of this script with the same positionals
+       [ <script> <output> <sample_id> <sample_dir> <file> ... ];
+       it becomes that script's job to create the output file,
+       if successful.
+
+  -- : end of options, start of positional parameters
+
+positional parameters:
+  <output>     : the output file that must be created by the rule
+  <sample_id>  : a string (with no path separator slashes) that uniquely
+                 identifies the sample and the date
+  <sample_dir> : the base directory of the sample
+  <file> ...   : a list of files to consider for upload
+
+Generates symlinks that help track new and updated samples to consider
+for upload. Also serves as a demo for the upload parameters of V-pipe." 1>&2; exit "$1"; }
+
+# NOTE it is possible to pass named options (e.g. with getopts) before the positional parameters begin
+while getopts "rRe:h" o; do
+    case "${o}" in
+        r) do_random_nonce=1 ;;
+        R) do_random_nonce=0 ;;
+        e) exec_cmd="${OPTARG}" ;;
+        h) usage 0 ;;
+        *) usage 1 ;;
+    esac
+done
+shift $((OPTIND-1))
+
+
+
+## get the positional parameters
+
+output="${1}"
+sample_id="${2//\//_}"
+sample_dir="${3}"
+shift 3
+to_upload=( "${@}" )
+
+
+
+## process
+
+# create and populate per-sample directory
+mkdir -p "${sample_dir}/uploads/"
+
+for p in "${to_upload[@]}"; do
+    test -e "${p}" || continue
+    fixed_p="$(realpath --relative-to "${sample_dir}/uploads/" "${p}")"
+    ( set -x; ln -f -s "$fixed_p" "${sample_dir}/uploads/" )
+done
+
+
+# create and populate global directory
+mkdir -p uploads/
+
+fixed_uploads="$(realpath --relative-to "uploads" "${sample_dir}/uploads/")"
+
+# make unique symbolic link:
+if (( do_random_nonce )); then
+    # nonce: 40 hex characters hashed from /dev/urandom
+    read -r random o < <(dd if=/dev/urandom bs=30 count=1 2>/dev/null | sha1sum -b)
+    unique_id="${sample_id}__${random}"
+    force=""
+else
+    unique_id="${sample_id}"
+    force="f"
+fi
+
+( set -x; ln "-s${force}" "$fixed_uploads" "uploads/$unique_id" )
+
+
+# run command if asked to
+
+if [[ -n "${exec_cmd}" ]]; then
+    # ${exec_cmd} is intentionally left unquoted so that it can carry its own arguments
+    exec ${exec_cmd} "${output}" "${sample_id}" "${sample_dir}" "${to_upload[@]}"
+    echo "Failed to exec ${exec_cmd}" >&2
+    exit 2
+fi
+
+# create the mandatory output file
+touch "${output}"
diff --git a/workflow/scripts/remove_reads_list.pl b/workflow/scripts/remove_reads_list.pl
new file mode 100755
index 000000000..fa7c71b6e
--- /dev/null
+++ b/workflow/scripts/remove_reads_list.pl
@@ -0,0 +1,29 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+# reads sequence ids from the file given as the first command line argument,
+# then iterates over stdin and removes the sequences with those ids.
+# the sequences from stdin are FASTQ records, i.e. groups of four lines,
+# of which the first line contains the sequence id
+
+my %h;    # hash with 'headers' to filter out as keys
+
+{
+    my $listfile = shift @ARGV;
+    open my $fh, '<', $listfile
+        or die "cannot open ${listfile}";
+
+    # build a hash out of all (unique) seq IDs
+    %h = map { chomp; ('@' . $_) => 1 } <$fh>;
+
+    close $fh;
+}
+print STDERR "filtering out: ", scalar keys %h, " seq ids\n";
+
+while (<>) {    # loop through headers...
+    my @s = map { scalar <> } 1 .. 3;    # ...fetch the remaining 3 lines of the read: bp sequence, separator, phred quality scores
+    print $_, @s
+        unless (exists $h{(split)[0]});    # compare only the header's first part ('@' + seq ID), ignoring the second half ("comments", e.g. Illumina indexes)
+}
diff --git a/workflow/scripts/report_sequences.py b/workflow/scripts/report_sequences.py
new file mode 100644
index 000000000..a2c0dd2e2
--- /dev/null
+++ b/workflow/scripts/report_sequences.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+
+import json
+import pathlib
+import shutil
+import sys
+import tempfile
+import zipfile
+from datetime import datetime, timezone
+
+from Bio import SeqIO
+from Bio.SeqUtils.CheckSum import crc64, seguid
+
+LOCAL_TIMEZONE = datetime.now(timezone.utc).astimezone().tzinfo
+
+results = []
+raw_data_files = []
+
+for f in sys.argv[1:]:
+    f = pathlib.Path(f)
+    _, sample, batch, *_ = f.parts
+
+    timestamp = datetime.fromtimestamp(f.stat().st_ctime, tz=LOCAL_TIMEZONE)
+    sequences = []
+
+    dehumanized_raw_reads = f.parent.parent / "raw_uploads" / "dehuman.cram"
+    if dehumanized_raw_reads.exists():
+        raw_data_files.append(dehumanized_raw_reads)
+        raw_data = str(dehumanized_raw_reads)
+    else:
+        raw_data = None
+
+    for record in SeqIO.parse(f, "fasta"):
+        sequence = record.seq
+        sequences.append(
+            dict(
+                header=record.id,
+                seguid=seguid(sequence),
+                crc64=crc64(sequence),
+                sequence=str(sequence),
+            )
+        )
+    results.append(
+        dict(
+            sample=sample,
+            batch=batch,
+            created=str(timestamp),
+            sequences=sequences,
+            raw_data=raw_data,
+        )
+    )
+
+
+folder = pathlib.Path(tempfile.mkdtemp())
+
+with open(folder / "summary.json", "w") as fh:
+    json.dump(results, fh, indent=4)
+
+for p in raw_data_files:
+    # p is a relative path, so this recreates the sample's directory layout below the temporary folder
+    target = folder / p
+    target.parent.mkdir(parents=True)
+    print(p, target)
+    shutil.copy(p, target)
+
+with zipfile.ZipFile("summary.zip", "w", zipfile.ZIP_DEFLATED) as zip_file:
+    for entry in folder.rglob("*"):
+        zip_file.write(entry, entry.relative_to(folder))
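
`report_sequences.py` derives the sample and batch names from the second and third path components of each FASTA file passed on the command line, and picks up the matching `raw_uploads/dehuman.cram` when present. A hedged invocation sketch (the `results/...` glob and the consensus file name are assumptions about the working-directory layout, not something defined by this diff):

```bash
# Assumed layout: results/<sample>/<batch>/references/consensus.fasta
# Writes summary.json (plus any per-sample dehuman.cram) into summary.zip.
python workflow/scripts/report_sequences.py \
    results/*/*/references/consensus.fasta
```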
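Similarly, `remove_reads_list.pl` is a stream filter: it takes a list file of read IDs (one per line, without the leading `@`) and drops the matching four-line FASTQ records from stdin. A usage sketch with assumed file names:

```bash
# host_read_ids.txt and the fastq file names are illustrative assumptions
zcat sample_R1.fastq.gz \
  | workflow/scripts/remove_reads_list.pl host_read_ids.txt \
  | gzip > sample_R1.filtered.fastq.gz
```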
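On the configuration side, the new `upload` schema section is driven from `config/config.yaml`. A minimal sketch, written here as a heredoc in the same spirit as the Dockerfile's config-writing steps; `custom_scripts/uploader.sh` and its option file are illustrative placeholders, not files shipped by this change:

```bash
# Hedged example only: the custom script and its YAML option file are
# assumed names used for illustration.
mkdir -p config
cat > config/config.yaml <<'EOF'
output:
  upload: true

upload:
  consensus: majority   # upload the majority consensus instead of the default "ambig"
  checksum: true        # one checksum per consensus sequence
  orig_cram: true       # also stage reference-compressed raw reads
  script: custom_scripts/uploader.sh
  options: "--config custom_scripts/uploader.yaml --"
EOF
```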
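And because `upload: script:` is pluggable, the positional-parameter protocol documented in the schema is the entire contract a replacement has to honour. A sketch of a custom uploader, assuming key-based SFTP authentication and a placeholder server name:

```bash
#!/usr/bin/env bash
# Hypothetical custom uploader (a sketch, not part of this change);
# upload.example.org is a placeholder and key-based auth is assumed.
# Protocol, as documented in the `script` schema property:
#   $1 = output file this script must create on success
#   $2 = unique sample/date identifier (no path separators)
#   $3 = base directory of the sample
#   remaining arguments = files to consider for upload
set -euo pipefail

output="$1"; sample_id="$2"; sample_dir="$3"; shift 3
# (sample_dir is available but unused in this minimal sketch)

# emit one sftp batch command per existing file, then run the batch
for f in "$@"; do
    [[ -e "$f" ]] || continue
    printf 'put %s uploads/%s_%s\n' "$f" "$sample_id" "$(basename "$f")"
done | sftp -b - uploads@upload.example.org

# the output file is tracked by snakemake's DAG: create it only on success
touch "$output"
```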