Skip to content

Commit

Permalink
(Partial) support for on-line Host genome fetching
Browse files Browse the repository at this point in the history
 - Can use an URL for host genome (Homo Sapiens)
 - re-organising bwa indexing code to support this
 - can still crash in various circumstance, considered unstable
  • Loading branch information
DrYak committed Mar 11, 2022
1 parent 79dde0a commit 0dd9c72
Show file tree
Hide file tree
Showing 6 changed files with 25 additions and 41 deletions.
2 changes: 1 addition & 1 deletion workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,14 @@ rule allfastqc:
fastqc_files,


include: "rules/clean.smk"
include: "rules/quality_assurance.smk"
include: "rules/align.smk"
include: "rules/consensus.smk"
include: "rules/mafs.smk"
include: "rules/stats.smk"
include: "rules/publish.smk"
include: "rules/dehuman.smk"
include: "rules/clean.smk"


if config.output["snv"] or config.output["local"]:
Expand Down
37 changes: 13 additions & 24 deletions workflow/rules/align.smk
Original file line number Diff line number Diff line change
Expand Up @@ -444,20 +444,24 @@ rule sam2bam:
"""


bwa_idx_ext = [".bwt", ".amb", ".ann", ".pac", ".sa"]
bowtie_idx_ext = [".1.bt2", ".2.bt2", ".3.bt2", ".4.bt2", ".rev.1.bt2", ".rev.2.bt2"]


rule ref_bwa_index:
input:
reference_file,
"{file}",
output:
"{}.bwt".format(reference_file), #all indexing files: .amb .ann .bwt .fai .pac .sa
multiext("{file}", *bwa_idx_ext),
params:
BWA=config.applications["bwa"],
log:
outfile="references/bwa_index.out.log",
errfile="references/bwa_index.err.log",
outfile="{file}_bwa_index.out.log",
errfile="{file}_bwa_index.err.log",
conda:
config.ref_bwa_index["conda"]
benchmark:
"references/ref_bwa_index.benchmark"
"{file}_bwa_index.benchmark"
group:
"align"
resources:
Expand Down Expand Up @@ -488,7 +492,7 @@ if config.general["aligner"] == "bwa":
input:
FASTQ=input_align_gz,
REF=reference_file,
INDEX="{}.bwt".format(reference_file),
INDEX=multiext(reference_file, *bwa_idx_ext),
output:
REF=temp_with_prefix("{dataset}/alignments/REF_aln.sam"),
TMP_SAM=temp_with_prefix("{dataset}/alignments/tmp_aln.sam"),
Expand Down Expand Up @@ -525,12 +529,7 @@ elif config.general["aligner"] == "bowtie":
input:
reference_file,
output:
INDEX1="{}.1.bt2".format(reference_file),
INDEX2="{}.2.bt2".format(reference_file),
INDEX3="{}.3.bt2".format(reference_file),
INDEX4="{}.4.bt2".format(reference_file),
INDEX5="{}.rev.1.bt2".format(reference_file),
INDEX6="{}.rev.2.bt2".format(reference_file),
multiext(reference_file, *bowtie_idx_ext),
params:
BOWTIE=config.applications["bowtie_idx"],
log:
Expand Down Expand Up @@ -560,12 +559,7 @@ elif config.general["aligner"] == "bowtie":
R1="{dataset}/preprocessed_data/R1.fastq.gz",
R2="{dataset}/preprocessed_data/R2.fastq.gz",
REF=reference_file,
INDEX1="{}.1.bt2".format(reference_file),
INDEX2="{}.2.bt2".format(reference_file),
INDEX3="{}.3.bt2".format(reference_file),
INDEX4="{}.4.bt2".format(reference_file),
INDEX5="{}.rev.1.bt2".format(reference_file),
INDEX6="{}.rev.2.bt2".format(reference_file),
INDEX=multiext(reference_file, *bowtie_idx_ext),
output:
REF=temp_with_prefix("{dataset}/alignments/REF_aln.sam"),
TMP_SAM=temp_with_prefix("{dataset}/alignments/tmp_aln.sam"),
Expand Down Expand Up @@ -604,12 +598,7 @@ elif config.general["aligner"] == "bowtie":
input:
R1="{dataset}/preprocessed_data/R1.fastq.gz",
REF=reference_file,
INDEX1="{}.1.bt2".format(reference_file),
INDEX2="{}.2.bt2".format(reference_file),
INDEX3="{}.3.bt2".format(reference_file),
INDEX4="{}.4.bt2".format(reference_file),
INDEX5="{}.rev.1.bt2".format(reference_file),
INDEX6="{}.rev.2.bt2".format(reference_file),
INDEX=multiext(reference_file, *bowtie_idx_ext),
output:
REF=temp_with_prefix("{dataset}/alignments/REF_aln.sam"),
TMP_SAM=temp_with_prefix("{dataset}/alignments/tmp_aln.sam"),
Expand Down
9 changes: 2 additions & 7 deletions workflow/rules/clean.smk
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ rule alignclean:

rule bwaclean:
input:
"{}.bwt".format(reference_file),
INDEX=multiext(reference_file, *bwa_idx_ext),
params:
DIR=config.output["datadir"],
shell:
Expand All @@ -74,12 +74,7 @@ rule bwaclean:

rule bowtieclean:
input:
INDEX1="{}.1.bt2".format(reference_file),
INDEX2="{}.2.bt2".format(reference_file),
INDEX3="{}.3.bt2".format(reference_file),
INDEX4="{}.4.bt2".format(reference_file),
INDEX5="{}.rev.1.bt2".format(reference_file),
INDEX6="{}.rev.2.bt2".format(reference_file),
INDEX=multiext(reference_file, *bowtie_idx_ext),
params:
DIR=config.output["datadir"],
shell:
Expand Down
8 changes: 4 additions & 4 deletions workflow/rules/dehuman.smk
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ rule dh_redo_alignreject:
# (useful if aligner's (temp) rejects have been deleted by now)
input:
global_ref=reference_file,
ref_index="{}.bwt".format(reference_file),
ref_index=multiext(reference_file, *bwa_idx_ext),
fastq=input_align_gz,
output:
tmp_aln=temp_with_prefix("{dataset}/alignments/dh_aln.sam"),
Expand Down Expand Up @@ -85,8 +85,8 @@ rule dh_redo_alignreject:

rule dh_hostalign:
input:
host_ref=config.dehuman["ref_host"],
ref_index="{}.bwt".format(config.dehuman["ref_host"]),
host_ref=cachepath(config.dehuman["ref_host"]),
ref_index=multiext(cachepath(config.dehuman["ref_host"]), *bwa_idx_ext),
reject_1=rules.dh_redo_alignreject.output.reject_1
if config["dehuman"]["catchup"]
else rules.dh_reuse_alignreject.output.reject_1,
Expand Down Expand Up @@ -244,7 +244,7 @@ rule dh_filter:
rule dehuman:
input:
global_ref=reference_file,
ref_index="{}.bwt".format(reference_file),
ref_index=multiext(reference_file, *bwa_idx_ext),
filtered_1=rules.dh_filter.output.filtered_1, # =temp_prefix("{dataset}/raw_uploads/filtered_1.fastq.gz"),
filtered_2=rules.dh_filter.output.filtered_2, # =temp_prefix("{dataset}/raw_uploads/filtered_2.fastq.gz"),
output:
Expand Down
4 changes: 2 additions & 2 deletions workflow/rules/publish.smk
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ rule prepare_upload:
rule unfiltered_cram:
input:
global_ref=reference_file,
ref_index="{}.bwt".format(reference_file),
# TODO switch to output of rule rule extract
ref_index=multiext(reference_file, *bwa_idx_ext),
# NOTE not relying on rule extract - if done as a catchup these might not exist
R1=partial(raw_data_file, pair=1),
R2=partial(raw_data_file, pair=2),
output:
Expand Down
6 changes: 3 additions & 3 deletions workflow/schemas/config_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -1268,9 +1268,9 @@
},
"ref_host": {
"type": "string",
"default": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.29_GRCh38.p14/GCA_000001405.29_GRCh38.p14_genomic.fna.gz",
"description": "Host's genome used to remove reads (e.g. human genome)",
"examples": ["references/human.fa", "/cluster/project/igenomes/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/genome.fa"]
"default": "http://ftp.ensembl.org/pub/release-105/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
"description": "Host's genome used to remove reads (e.g. human genome)\n\n**Note:** although directly fetching the reference from the network can work, we _strongly advise_ to use local files (e.g., download the reference locally or use one provided by your cluster).",
"examples": ["references/human.fa", "/cluster/project/igenomes/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/genome.fa", "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.29_GRCh38.p14/GCA_000001405.29_GRCh38.p14_genomic.fna.gz"]
},
"keep_host": {
"type": "boolean",
Expand Down

0 comments on commit 0dd9c72

Please sign in to comment.