From 9a56fd7d91d2fbbfb624b830dc9903c02bf8d871 Mon Sep 17 00:00:00 2001 From: ryan Date: Mon, 9 Mar 2020 12:04:26 -0400 Subject: [PATCH] updated example projects and README files --- README.md | 5 +++-- composer.py | 20 +++++++++++--------- examples/project1/conf.py | 8 ++++---- examples/project2/conf.py | 4 ++-- examples/project3/adapters.txt | 1 + examples/project3/conf.py | 6 ++++-- tools/README.md | 8 +++++++- 7 files changed, 32 insertions(+), 20 deletions(-) create mode 100644 examples/project3/adapters.txt diff --git a/README.md b/README.md index 2c269d3..9dd73e5 100644 --- a/README.md +++ b/README.md @@ -99,7 +99,7 @@ Using a text editor, save a file containing any of the following variables as a |non_genomic|number of non-genomic bases not found in barcode sequence (e.g. 'T' complementary to A-tailing library prep)|integer| |end_score|end-trim once entire window >= this Q score|integer between 0 and 40| |window|size of window to test for >= end_trim|integer within read length| -|min_l|minimum read length to retain for end-trimming and adapter removal|integer > 0| +|min_len|minimum read length to retain for end-trimming and adapter removal|integer > 0| |q_min|Q score minimum (Phred value 0-40) applied to q_percent variable|integer between 0 and 40| |q_percent|percentage of reads >= q_min Q scores|number between 0 and 100| |adapter_match|number of base matches to identify adapters (requires 'adapters.txt')|integer (recommend 12)| @@ -125,6 +125,7 @@ R2_bases_ls = ['TCC', 'TCT'] non_genomic = 1 end_score = 30 window = 10 +min_len = 50 q_min = 30 q_percent = 95 ``` @@ -134,7 +135,7 @@ q_percent = 95 *In this case, samples were double-digested with AluI and HaeIII and A-tailed before adapter ligation (**R1_bases_ls = ['TCC', 'TCT']** and **R2_bases_ls = ['TCC', 'TCT']**). Only reads containing these motifs will pass to subsequent steps. 
As the T complement from A-tailing introduces an artificial residue not present in the specimen sequenced, it can simultaneously be removed alongside motif detection (**non_genomic = 1**).* -*Automatic end-trimming will be performed based on Q score. Here, groups of bases are considered within a moving window of 10 bases at a time (**window = 10**) until that window consists only of the desired Q score at or above 30 (**end_score = 30**). It is at this point that the read is trimmed.* +*Automatic end-trimming will be performed based on Q score. Here, groups of bases are considered within a moving window of 10 bases at a time (**window = 10**) until that window consists only of the desired Q score at or above 30 (**end_score = 30**). It is at this point that the read is trimmed. Reads that are less than 50 bp will be discarded (**min_len = 50**).* *Only reads that have a Q score of 30 (**q_min = 30**) acrosss at least 95 percent of the read (**q_percent = 95**) will pass to subsequent steps. 
If a R1 read or an R2 read passes while its partner fails, it will be placed into a single-end read subfolder and the failing read will be discarded.* diff --git a/composer.py b/composer.py index 6f60a3f..130b42a 100644 --- a/composer.py +++ b/composer.py @@ -48,7 +48,7 @@ def __init__(self): self.non_genomic = False self.end_score = False self.window = False - self.min_l = 0 + self.min_len = 1 self.adapters = '' self.adapter_match = False self.q_min = False @@ -501,11 +501,11 @@ def scallop_end_multi(): curr = dir_make('end_trimmed') paired_setup(curr) scallop_part = partial(scallop_comp, c.in1_ls, c.in2_ls, None, None, - c.end_score, c.window, c.min_l, curr) + c.end_score, c.window, c.min_len, curr) pool_multi(scallop_part, c.in1_ls) if c.singles_ls: scallop_part = partial(scallop_comp, [], [], None, None, c.end_score, - c.window, c.min_l, curr) + c.window, c.min_len, curr) pool_multi(scallop_part, c.singles_ls) paired_takedown(curr) temp_ls = pathfinder(curr) @@ -513,7 +513,7 @@ def scallop_end_multi(): temp_ls = walkthrough(curr, scallop_end_multi, temp_ls, end_score=c.end_score, window=c.window, - min_l=c.min_l) + min_len=c.min_len) return temp_ls @@ -524,11 +524,11 @@ def porifera_multi(): curr = dir_make('adapted') paired_setup(curr) porifera_part = partial(porifera_comp, curr, c.in1_ls, c.in2_ls, - c.adapters, c.bcs_dict, c.adapter_match, c.min_l) + c.adapters, c.bcs_dict, c.adapter_match, c.min_len) pool_multi(porifera_part, c.in1_ls) if c.singles_ls: porifera_part = partial(porifera_comp, curr, [], [], c.adapters, - c.bcs_dict, c.adapter_match, c.min_l) + c.bcs_dict, c.adapter_match, c.min_len) pool_multi(porifera_part, c.singles_ls) paired_takedown(curr) temp_ls = pathfinder(curr) @@ -661,7 +661,9 @@ def tidy_up(): def summary_file(): end_time = str(datetime.datetime.now()).split('.')[0] - log = ('ngscomposer version ' + version + '\n\n' + + log = ('ngscomposer version ' + version + '\n' + + 'see https://github.com/ryandkuster/ngscomposer/releases '\ + 
'for newest release info\n\n' + 'start ' + c.start_time + '\n' + 'end ' + end_time + '\n\n' + 'paired = ' + str(c.paired) + '\n' + @@ -679,7 +681,7 @@ def summary_file(): 'non_genomic = ' + str(c.non_genomic) + '\n' + 'end_score = ' + str(c.end_score) + '\n' + 'window = ' + str(c.window) + '\n' + - 'min_l = ' + str(c.min_l) + '\n' + + 'min_len = ' + str(c.min_len) + '\n' + 'adapters = ' + str(c.adapters) + '\n' + 'adapter_match = ' + str(c.adapter_match) + '\n' + 'q_min = ' + str(c.q_min) + '\n' + @@ -707,7 +709,7 @@ def summary_file(): ' rotifer.py - motif detection\n' + ' porifera.py - adapter removal\n\n' + ' krill.py - quality filtering\n' + - 'see https://github.com/ryandkuster/ngs-composer for full usage notes\n\n' + + 'see https://github.com/ryandkuster/ngscomposer for full usage notes\n\n' + ''), formatter_class=RawTextHelpFormatter) parser.add_argument('-i', type=str, required=True, help='the full or relative path to the project directory') diff --git a/examples/project1/conf.py b/examples/project1/conf.py index b376dad..8f66156 100644 --- a/examples/project1/conf.py +++ b/examples/project1/conf.py @@ -2,7 +2,7 @@ procs = 1 alt_dir = False initial_qc = True -all_qc = False +all_qc = 'summary' walkaway = True front_trim = 6 mismatch = 1 @@ -10,7 +10,7 @@ R2_bases_ls = ['TCC', 'TCT'] non_genomic = 1 q_min = 30 -q_percent = 95 -trim_mode = False -auto_trim = False +q_percent = 90 +end_score = 30 +window = 10 rm_transit = True diff --git a/examples/project2/conf.py b/examples/project2/conf.py index bbcfdaa..e25277a 100644 --- a/examples/project2/conf.py +++ b/examples/project2/conf.py @@ -8,6 +8,6 @@ mismatch = 1 q_min = 30 q_percent = 95 -trim_mode = 'quartile' -auto_trim = 30 +end_score = 30 +window = 10 rm_transit = False diff --git a/examples/project3/adapters.txt b/examples/project3/adapters.txt new file mode 100644 index 0000000..327dc3d --- /dev/null +++ b/examples/project3/adapters.txt @@ -0,0 +1 @@ +ACACTCTTTCCCTACACGACGCTCTTCCGATCT diff --git 
a/examples/project3/conf.py b/examples/project3/conf.py index e461625..b41498c 100644 --- a/examples/project3/conf.py +++ b/examples/project3/conf.py @@ -9,8 +9,10 @@ R1_bases_ls = ['TCC', 'TCT'] R2_bases_ls = ['TCC', 'TCT'] non_genomic = 1 +end_score = 30 +window = 10 +min_len = 100 +adapter_match = 12 q_min = 30 q_percent = 95 -trim_mode = 'quartile' -auto_trim = 30 rm_transit = True diff --git a/tools/README.md b/tools/README.md index 692e472..22f71c0 100644 --- a/tools/README.md +++ b/tools/README.md @@ -46,6 +46,12 @@ Example: $ python3 scallop.py -r1 1_R1.fastq -f 6 ``` +or + +```bash +$ python3 scallop.py -r1 1_R1.fastq -w 10 -e 30 -l 50 +``` + The output files are automatically named with "trimmed" prefix (e.g. "trimmed.1_R1.fastq") ## Anemone - demultiplexing of single-end or paired-end barcoded libraries @@ -143,7 +149,7 @@ As with Krill, paired-end output files will indicate when pairing has been retai Example: ```bash -$ python3 porifera.py -r1 1_R1.fastq -a1 adapters.txt -n 18 -m 3 +$ python3 porifera.py -r1 1_R1.fastq -a1 adapters.txt -m 12 -k 8 -r 1 ``` Example adapter file: