(Partial) support for on-line Host genome fetching

- Can use an URL for host genome (Homo Sapiens) - re-organising bwa indexing code to support this - can still crash in various circumstance, considered unstable
cbg-ethz · Mar 11, 2022 · 0dd9c72 · 0dd9c72
1 parent 79dde0a
commit 0dd9c72
Show file tree

Hide file tree

Showing 6 changed files with 25 additions and 41 deletions.
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -31,14 +31,14 @@ rule allfastqc:
         fastqc_files,
 
 
-include: "rules/clean.smk"
 include: "rules/quality_assurance.smk"
 include: "rules/align.smk"
 include: "rules/consensus.smk"
 include: "rules/mafs.smk"
 include: "rules/stats.smk"
 include: "rules/publish.smk"
 include: "rules/dehuman.smk"
+include: "rules/clean.smk"
 
 
 if config.output["snv"] or config.output["local"]:

diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk
@@ -444,20 +444,24 @@ rule sam2bam:
         """
 
 
+bwa_idx_ext = [".bwt", ".amb", ".ann", ".pac", ".sa"]
+bowtie_idx_ext = [".1.bt2", ".2.bt2", ".3.bt2", ".4.bt2", ".rev.1.bt2", ".rev.2.bt2"]
+
+
 rule ref_bwa_index:
     input:
-        reference_file,
+        "{file}",
     output:
-        "{}.bwt".format(reference_file),  #all indexing files: .amb  .ann  .bwt  .fai  .pac  .sa
+        multiext("{file}", *bwa_idx_ext),
     params:
         BWA=config.applications["bwa"],
     log:
-        outfile="references/bwa_index.out.log",
-        errfile="references/bwa_index.err.log",
+        outfile="{file}_bwa_index.out.log",
+        errfile="{file}_bwa_index.err.log",
     conda:
         config.ref_bwa_index["conda"]
     benchmark:
-        "references/ref_bwa_index.benchmark"
+        "{file}_bwa_index.benchmark"
     group:
         "align"
     resources:
@@ -488,7 +492,7 @@ if config.general["aligner"] == "bwa":
         input:
             FASTQ=input_align_gz,
             REF=reference_file,
-            INDEX="{}.bwt".format(reference_file),
+            INDEX=multiext(reference_file, *bwa_idx_ext),
         output:
             REF=temp_with_prefix("{dataset}/alignments/REF_aln.sam"),
             TMP_SAM=temp_with_prefix("{dataset}/alignments/tmp_aln.sam"),
@@ -525,12 +529,7 @@ elif config.general["aligner"] == "bowtie":
         input:
             reference_file,
         output:
-            INDEX1="{}.1.bt2".format(reference_file),
-            INDEX2="{}.2.bt2".format(reference_file),
-            INDEX3="{}.3.bt2".format(reference_file),
-            INDEX4="{}.4.bt2".format(reference_file),
-            INDEX5="{}.rev.1.bt2".format(reference_file),
-            INDEX6="{}.rev.2.bt2".format(reference_file),
+            multiext(reference_file, *bowtie_idx_ext),
         params:
             BOWTIE=config.applications["bowtie_idx"],
         log:
@@ -560,12 +559,7 @@ elif config.general["aligner"] == "bowtie":
                 R1="{dataset}/preprocessed_data/R1.fastq.gz",
                 R2="{dataset}/preprocessed_data/R2.fastq.gz",
                 REF=reference_file,
-                INDEX1="{}.1.bt2".format(reference_file),
-                INDEX2="{}.2.bt2".format(reference_file),
-                INDEX3="{}.3.bt2".format(reference_file),
-                INDEX4="{}.4.bt2".format(reference_file),
-                INDEX5="{}.rev.1.bt2".format(reference_file),
-                INDEX6="{}.rev.2.bt2".format(reference_file),
+                INDEX=multiext(reference_file, *bowtie_idx_ext),
             output:
                 REF=temp_with_prefix("{dataset}/alignments/REF_aln.sam"),
                 TMP_SAM=temp_with_prefix("{dataset}/alignments/tmp_aln.sam"),
@@ -604,12 +598,7 @@ elif config.general["aligner"] == "bowtie":
             input:
                 R1="{dataset}/preprocessed_data/R1.fastq.gz",
                 REF=reference_file,
-                INDEX1="{}.1.bt2".format(reference_file),
-                INDEX2="{}.2.bt2".format(reference_file),
-                INDEX3="{}.3.bt2".format(reference_file),
-                INDEX4="{}.4.bt2".format(reference_file),
-                INDEX5="{}.rev.1.bt2".format(reference_file),
-                INDEX6="{}.rev.2.bt2".format(reference_file),
+                INDEX=multiext(reference_file, *bowtie_idx_ext),
             output:
                 REF=temp_with_prefix("{dataset}/alignments/REF_aln.sam"),
                 TMP_SAM=temp_with_prefix("{dataset}/alignments/tmp_aln.sam"),

diff --git a/workflow/rules/clean.smk b/workflow/rules/clean.smk
@@ -60,7 +60,7 @@ rule alignclean:
 
 rule bwaclean:
     input:
-        "{}.bwt".format(reference_file),
+        INDEX=multiext(reference_file, *bwa_idx_ext),
     params:
         DIR=config.output["datadir"],
     shell:
@@ -74,12 +74,7 @@ rule bwaclean:
 
 rule bowtieclean:
     input:
-        INDEX1="{}.1.bt2".format(reference_file),
-        INDEX2="{}.2.bt2".format(reference_file),
-        INDEX3="{}.3.bt2".format(reference_file),
-        INDEX4="{}.4.bt2".format(reference_file),
-        INDEX5="{}.rev.1.bt2".format(reference_file),
-        INDEX6="{}.rev.2.bt2".format(reference_file),
+        INDEX=multiext(reference_file, *bowtie_idx_ext),
     params:
         DIR=config.output["datadir"],
     shell:

diff --git a/workflow/rules/dehuman.smk b/workflow/rules/dehuman.smk
@@ -43,7 +43,7 @@ rule dh_redo_alignreject:
     # (useful if aligner's (temp) rejects have been deleted by now)
     input:
         global_ref=reference_file,
-        ref_index="{}.bwt".format(reference_file),
+        ref_index=multiext(reference_file, *bwa_idx_ext),
         fastq=input_align_gz,
     output:
         tmp_aln=temp_with_prefix("{dataset}/alignments/dh_aln.sam"),
@@ -85,8 +85,8 @@ rule dh_redo_alignreject:
 
 rule dh_hostalign:
     input:
-        host_ref=config.dehuman["ref_host"],
-        ref_index="{}.bwt".format(config.dehuman["ref_host"]),
+        host_ref=cachepath(config.dehuman["ref_host"]),
+        ref_index=multiext(cachepath(config.dehuman["ref_host"]), *bwa_idx_ext),
         reject_1=rules.dh_redo_alignreject.output.reject_1
         if config["dehuman"]["catchup"]
         else rules.dh_reuse_alignreject.output.reject_1,
@@ -244,7 +244,7 @@ rule dh_filter:
 rule dehuman:
     input:
         global_ref=reference_file,
-        ref_index="{}.bwt".format(reference_file),
+        ref_index=multiext(reference_file, *bwa_idx_ext),
         filtered_1=rules.dh_filter.output.filtered_1,  # =temp_prefix("{dataset}/raw_uploads/filtered_1.fastq.gz"),
         filtered_2=rules.dh_filter.output.filtered_2,  # =temp_prefix("{dataset}/raw_uploads/filtered_2.fastq.gz"),
     output:

diff --git a/workflow/rules/publish.smk b/workflow/rules/publish.smk
@@ -68,8 +68,8 @@ rule prepare_upload:
 rule unfiltered_cram:
     input:
         global_ref=reference_file,
-        ref_index="{}.bwt".format(reference_file),
-        # TODO switch to output of rule rule extract
+        ref_index=multiext(reference_file, *bwa_idx_ext),
+        # NOTE not relying on rule extract - if done as a catchup these might not exist
         R1=partial(raw_data_file, pair=1),
         R2=partial(raw_data_file, pair=2),
     output:

diff --git a/workflow/schemas/config_schema.json b/workflow/schemas/config_schema.json
@@ -1268,9 +1268,9 @@
                 },
                 "ref_host": {
                     "type": "string",
-                    "default": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.29_GRCh38.p14/GCA_000001405.29_GRCh38.p14_genomic.fna.gz",
-                    "description": "Host's genome used to remove reads (e.g. human genome)",
-                    "examples": ["references/human.fa", "/cluster/project/igenomes/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/genome.fa"]
+                    "default": "http://ftp.ensembl.org/pub/release-105/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
+                    "description": "Host's genome used to remove reads (e.g. human genome)\n\n**Note:** although directly fetching the reference from the network can work, we _strongly advise_ to use local files (e.g., download the reference locally or use one provided by your cluster).",
+                    "examples": ["references/human.fa", "/cluster/project/igenomes/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/genome.fa", "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.29_GRCh38.p14/GCA_000001405.29_GRCh38.p14_genomic.fna.gz"]
                 },
                 "keep_host": {
                     "type": "boolean",