#!/usr/bin/env zsh
# Prints welcome message and takes project name
welcome() {
echo "PE Illumina FASTQ to biom superscript with USEARCH v10 - active development"
echo "Developed for Z-shell (zsh)"
echo "--------------------------------------------------------"
echo "Dependencies:"
echo "The following commands need to be in the \$PATH variable, and named as such:"
echo "mothur - from www.mothur.org"
echo "uc2otutab.py - from www.drive5.com"
echo "biom - from biom-format.org"
echo "usearch10 - from www.drive5.com"
echo "Optional qiime1 integration (in Python 2)."
echo "--------------------------------------------------------"
echo "This script requires Python 2 due to the use of uc2otutab.py from drive5"
echo "Place all and only the paired-end Illumina files of the this project in this local folder."
echo "--------------------------------------------------------"
echo "Enter the project name that will be given to the output files as an identifier:"
read project
echo $(date +%Y-%m-%d\ %H:%M) "Project name: $project"
printf "\n"
read -q "relaxed?Do you want to relax the FASTQ merge pair criteria? [y/N]"
printf "\n"
read -q "qiime?Do you want to conduct downstream QIIME analysis? [y/N]"
printf "\n"
read -q "alpha?Do you want to keep singletons for alpha diversity analysis? [y/N]"
printf "\n"
read -q "denoise?Do you want to conduct UNOISE3 ZOTU analysis instead of 97% OTU clustering? [y/N]"
printf "\n"
}
# Merges and quality-filters reads in preparation for OTU picking; usearch10 and mothur need to be in $PATH
otu() {
echo $(date +%Y-%m-%d\ %H:%M) "Merging reads with 5 maximum mismatches or 5% maximum mismatch in the overlapped region."
if [[ "$relaxed" =~ ^[Yy]$ ]]; then
echo "Running relaxed paired-end FASTQ merging (-fastq_maxdiffs 10 -fastq_maxdiffpct 10 -fastq_trunctail 5)"
for file in *R1_001.fastq; do
if [ -f "$file" ]; then
sampleID=$(echo "$file" | cut -d "_" -f1) # "_" as delimiter
usearch10 -fastq_mergepairs $file -fastqout "${sampleID}_merged.fastq" -log "${sampleID}_merge.log"
fi
done
else
echo "Running standard paired-end FASTQ merging"
for file in *R1_001.fastq; do
if [ -f "$file" ]; then
sampleID=$(echo "$file" | cut -d "_" -f1) # "_" as delimiter
usearch10 -fastq_mergepairs $file -fastqout "${sampleID}_merged.fastq" -log "${sampleID}_merge.log"
fi
done
fi
echo $(date +%Y-%m-%d\ %H:%M) "Converting FASTQ to FASTA with 1.0 max expected error."
# Attempting to use the >SampleID.readnumber syntax here; SampleID has to be alphanumeric
# Here @ = $sampleID as parsed by usearch (using _ and . delimiters)
# An alternative to -relabel is to use `-sample string`, which appends sample=string to the read label
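# Hypothetical sketch of the -relabel approach mentioned above (commented out; assumes this usearch10
# build accepts -relabel for fastq_filter and that a trailing "." yields SampleID.<readnumber> labels):
# usearch10 -fastq_filter "${sampleID}_merged.fastq" -fastq_maxee 1.0 -fastqout "${sampleID}_filtered.fastq" -relabel "${sampleID}." -log "${sampleID}_filter.log"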
for file in *merged.fastq; do
sampleID=$(echo "$file" | cut -d "_" -f1)
usearch10 -fastq_filter "${sampleID}_merged.fastq" -fastq_maxee 1.0 -fastqout "${sampleID}_filtered.fastq" -sample "$sampleID" -log "${sampleID}_filter.log"
usearch10 -fastq_filter "${sampleID}_merged.fastq" -fastqout "${sampleID}_unfiltered.fastq" -sample "$sampleID" -log "${sampleID}_unfilter.log"
done
# Concatenate the per-sample files into a single file
# The glob is *_filtered.fastq so that the *_unfiltered.fastq files are not matched as well
for filename in *_filtered.fastq; do
cat "$filename" >> "${project}_sequence.fastq"
done
for filename in *unfiltered.fastq; do
cat $filename >> "${project}_unfiltered_seq.fastq"
done
}
unoise() {
echo $(date +%Y-%m-%d\ %H:%M) "EXPERIMENTAL: Conducting UNOISE3 zero-radius otu analysis (http://www.drive5.com/usearch/manual/unoise_pipeline.html)."
time usearch10 -fastx_uniques "${project}_sequence.fastq" -sizeout -fastqout "${project}_uniques.fastq" -log uniques.log
# fastx_learn goes here (http://www.drive5.com/usearch/manual/cmd_fastx_learn.html)
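# Hypothetical sketch of the fastx_learn step (commented out; the -output option name is an assumption
# taken from the drive5 manual page above, and the dereplicated FASTQ is assumed to be acceptable input):
# time usearch10 -fastx_learn "${project}_uniques.fastq" -output "${project}_err_rates.txt"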
# Note the change from -unoise to -unoise3 and change in syntax, confirmed unoise3 can take fastq input
# unoise3 automatically renames the sequences to Zotu<n>, therefore -relabel is not necessary (for otutab)
# Regarding warnings of "shifted sequences" (https://drive5.com/usearch/manual/otu_qc_offset.html)
if [[ "$alpha" =~ ^[Yy]$ ]]; then
echo "Warning: unoise3 with minsize=0 is highly experimental and will likely generate a large number of zotus."
echo "Use with caution!"
time usearch10 -unoise3 "${project}_uniques.fastq" -tabbedout unoise3.log -zotus "${project}_zotus.fasta" -minsize 0
else
time usearch10 -unoise3 "${project}_uniques.fastq" -tabbedout unoise3.log -zotus "${project}_zotus.fasta"
fi
# usearch_global is being replaced by otutab for this purpose
# time usearch10 -usearch_global "${project}_unfiltered_seq.fasta" -db "${project}_zotus.fasta" -id 1.00 -strand plus -uc "${project}_zotus.uc" -biomout "${project}_zotus.biom"
# Regarding the output options (http://drive5.com/usearch/manual/otutab_output.html): -otutabout (QIIME classic), -mapout (reads to OTU# list), -mothur_shared_out (share file)
time usearch10 -otutab "${project}_unfiltered_seq.fastq" -zotus "${project}_zotus.fasta" -biomout "${project}_zotus.json"
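# Optional variant of the otutab command above (commented out): also write the QIIME classic table, the
# read-to-OTU map, and a mothur shared file via the output options listed on the drive5 page above.
# The extra output file names here are arbitrary placeholders.
# time usearch10 -otutab "${project}_unfiltered_seq.fastq" -zotus "${project}_zotus.fasta" -biomout "${project}_zotus.json" -otutabout "${project}_zotutab.txt" -mapout "${project}_zotu_map.txt" -mothur_shared_out "${project}_zotus.shared"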
# Make copies of the required files for QIIME analysis under the anticipated file names
if [[ "$qiime" =~ ^[Yy]$ ]]; then
cp "${project}_zotus.fasta" "${project}_rep_numbered.fasta"
cp "${project}_zotus.json" "${project}.biom"
fi
}
normal(){
# Note that singletons are not removed here
echo $(date +%Y-%m-%d\ %H:%M) "Dereplicating and sorting the unique reads by abundance."
usearch10 -fastx_uniques "${project}_sequence.fastq" -sizeout -fastaout "${project}_uniques.fasta" -log uniques.log
# fastx_learn goes here (http://www.drive5.com/usearch/manual/cmd_fastx_learn.html)
# Hardcoded mothur filtering parameters. This is mostly an insurance step to remove highly unusual sequences that are unlikely to be biological.
# usearch only does minlength discard, perhaps I will migrate the mothur command to vsearch (with min/maxseqlength)
# https://github.com/jimmyodonnell/banzai has an implementation with grep and awk
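# Untested sketch of the vsearch alternative mentioned above (commented out; the option names are
# assumptions to be checked against the installed vsearch, and mothur's maxhomop filter has no direct equivalent here):
# vsearch --fastx_filter "${project}_uniques.fasta" --fastq_minlen 200 --fastq_maxlen 500 --fastaout "${project}_uniques.trim.fasta"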
echo $(date +%Y-%m-%d\ %H:%M) "Sequence trimming, dereplication, singleton removal, and clustering."
mothur "#trim.seqs(fasta="${project}_uniques.fasta", minlength=200, maxlength=500, maxhomop=6)"
if [[ "$alpha" =~ ^[Yy]$ ]]; then
usearch10 -cluster_otus "${project}_uniques.trim.fasta" -minsize 0 -otus "${project}_rep_numbered.fasta" -relabel OTU
else
# No need to invoke an external python script to relabel the reads. Default minsize is 2, being explicit here.
usearch10 -cluster_otus "${project}_uniques.trim.fasta" -minsize 2 -otus "${project}_rep_numbered.fasta" -relabel OTU
fi
# Keeping legacy command for reference purposes:
# time usearch9 -usearch_global "${project}_unfiltered_seq.fasta" -db "${project}_rep_numbered.fasta" -id 0.97 -strand plus -uc "${project}_readmap.uc" -biomout "${project}.biom"
time usearch10 -otutab "${project}_unfiltered_seq.fastq" -otus "${project}_rep_numbered.fasta" -biomout "${project}.biom"
}
cleanup() {
echo $(date +%Y-%m-%d\ %H:%M) "Cleaning up..."
mkdir log
mv *.log log/
mkdir original_sequences
mv *R1*fastq original_sequences/
mv *R2*fastq original_sequences/
mkdir intermediary_sequences
mv *.fasta intermediary_sequences/
mv *.fastq intermediary_sequences/
echo "Note that some FASTQ intermediary files are quite large, particuarly:" "${project}_unfiltered_seq.fastq"
cp "intermediary_sequences/${project}_rep_numbered.fasta" ./
}
qiime_analysis() {
printf "\n"
echo $(date +%Y-%m-%d\ %H:%M) "- Downstream analysis with QIIME."; printf "\n"
echo "Activating python2 envrionment for QIIME."; printf "\n"
source activate python2
echo $(date +%Y-%m-%d\ %H:%M) "Assigning taxonomy, this might take a while."
time assign_taxonomy.py -i "${project}_rep_numbered.fasta" -m rdp -o rdp_tax
echo $(date +%Y-%m-%d\ %H:%M) "Adding taxonomy to biom file, use this biom file for downstream analysis from here on."
time biom add-metadata --sc-separated taxonomy --observation-header OTUID,taxonomy --observation-metadata-fp "rdp_tax/${project}_rep_numbered_tax_assignments.txt" -i "${project}.biom" -o "${project}_tax.biom"
echo $(date +%Y-%m-%d\ %H:%M) "Making taxonomy stack bar graphs. Using the catchall 'summarize_taxa_through_plots.py' script"
time summarize_taxa_through_plots.py -i "${project}_tax.biom" -o "${project}_tax_summary"
echo $(date +%Y-%m-%d\ %H:%M) "Conducting three steps to produce a phylogenetic tree, for phylogeny-based analysis downstream (Unifrac, PD_whole_tree)."
echo "Aligning representative OTU sequence."
align_seqs.py -i "${project}_rep_numbered.fasta"
echo "Filtering the alignment."
filter_alignment.py -i "pynast_aligned/${project}_rep_numbered_aligned.fasta" -o pynast_aligned/
echo "Making the phylogenetic tree."
make_phylogeny.py -i "pynast_aligned/${project}_rep_numbered_aligned_pfiltered.fasta"
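# Optionally return to the previous Python environment once QIIME is done (commented out; assumes the
# same conda-style mechanism as "source activate python2" above):
# source deactivate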
}
welcome
otu
if [[ "$denoise" =~ ^[Yy]$ ]]; then
echo "Running UNOISE3 ZOTU mode"
unoise
else
echo "Running 97% OTU clustering mode"
normal
fi
cleanup
if [[ "$alpha" =~ ^[Yy]$ ]] && [[ "$qiime" =~ ^[Yy]$ ]]; then
echo "Warning: You should not be running QIIME taxonomy summary with data containing singleton (for alpha diversity analysis)."
echo "QIIME taxonomic summary aborted!"
$qiime = "n"
fi
if [[ "$qiime" =~ ^[Yy]$ ]]; then
qiime_analysis
fi