From 8284cdd0dc593bf6988a68ca81bb2d8136d56de5 Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Fri, 10 Mar 2017 17:10:25 +0000 Subject: [PATCH 1/5] Updated documentation to include description of output produced by each tool. In SECT stats output, replaced the header description describing 'bases' with 'kmers'. In SECT we now permit lowercase bases to be considered as valid K-mers. --- doc/source/using.rst | 117 +++++++++++++++++++++++++++++++--- lib/include/kat/str_utils.hpp | 4 ++ src/sect.cc | 2 +- 3 files changed, 114 insertions(+), 9 deletions(-) diff --git a/doc/source/using.rst b/doc/source/using.rst index 6a19a2f7..ae322ca1 100644 --- a/doc/source/using.rst +++ b/doc/source/using.rst @@ -27,6 +27,33 @@ Basic usage:: kat hist [options] ()+ +Output: + +Produces a histogram file and associated `Spectra hist`_ plot. +The histogram file starts with a header describing settings used to generate the +histogram. Each header line starts with a '#' character. The histogram that follows +describes each K-mer frequency followed by the number of distinct K-mers found +at that frequency separated by a space. Each frequency / count pair is line separated. +For example:: + + # Title:27-mer spectra for: SRR519624_1.1M.fastq + # XLabel:27-mer frequency + # YLabel:# distinct 27-mers + # Kmer value:27 + # Input 1:SRR519624_1.1M.fastq + ### + 1 47573743 + 2 4737789 + 3 732453 + 4 184505 + 5 100293 + 6 78699 + 7 68553 + 8 59589 + 9 50926 + ... + + Applications: * Assess data quality: estimates of kmers deriving from errors; sequencing bias @@ -51,6 +78,28 @@ Basic usage:: kat gcp ()+ +Output: + +Produces a matrix file and associated `Density`_ plot. The matrix file starts with +a header describing settings used to generate the matrix. Each header line starts +with a '#' character. The matrix that follows contains a space separated list of +distinct k-mer counts for the GC-count indexed by the row. Each column index represents +the K-mer Frequency. For example:: + + # Title:K-mer coverage vs GC count plot for: ERR409722_1.fastq ERR409722_2.fastq + # XLabel:K-mer multiplicity + # YLabel:GC count + # ZLabel:Distinct K-mers per bin + # Columns:1001 + # Rows:27 + # MaxVal:7705834 + # Transpose:0 + ### + 0 65392 10715 6038 4615 3769 3140 2690 2133 1748 1519 1370 1098 840 ... + 0 189337 30772 20040 16630 15579 13673 12809 11890 10380 9605 8403 7370 6302 ... + 0 428150 66753 41453 37478 34599 34622 34572 32740 31487 30356 28369 26880 ... + ... + Applications: * Assess data quality: estimates of kmers deriving from errors; sequencing bias @@ -88,11 +137,37 @@ an assembly:: kat comp -t 8 -o pe_v_asm_test 'PE1.R?.fq' asm.fa +... or if the reads are gzipped:: + + kat comp -t 8 -o pe_v_asm_test <(gunzip -c 'PE1.R?.fq') asm.fa + +Output: + +Produces a matrix file and by default a `Spectra CN`_ plot, although can also produce +a `Density` plot if requested. The matrix file is structured in a similar way to the +GCP tool with a header describing settings used to generate the matrix. Each header line starts +with a '#' character. The matrix that follows contains a space separated list of +distinct k-mer counts for the frequency in each input file represented by the row +and column index. For example:: + + # Title:K-mer comparison plot + # XLabel:K-mer multiplicity for: ERR409722_1.fastq + # YLabel:K-mer multiplicity for: ERR409722_2.fastq + # ZLabel:Distinct K-mers per bin + # Columns:1001 + # Rows:1001 + # MaxVal:57106148 + # Transpose:1 + ### + 0 57106148 2133673 428934 134189 45267 16399 6603 3374 2066 1371 930 752 490 ... + 50919938 10364720 1613532 607932 239439 89985 36398 16589 8811 5469 3369 ... + 1990321 1605550 1061952 561999 271443 125163 61769 34379 22459 15647 11171 ... + ... Applications: * Determine sequencing bias between left and right read pairs. - * Compare the kmer spectrum of input reads against an assembly to gauge assembly completeness + * Compare the kmer spectrum of input reads against an assembly to gauge assembly completeness. @@ -100,7 +175,7 @@ SECT ---- Estimates coverage levels across sequences in the provided input sequence file. -This tool will produce a fasta style representation of the input sequence file +This tool will produce a FastA style representation of the input sequence file containing K-mer coverage counts mapped across each sequence. K-mer coverage is determined from the provided counts input file, which can be either one jellyfish hash, or one or more FastA / FastQ files. In addition, a space separated table @@ -114,12 +189,38 @@ Basic usage:: kat sect [options] ()+ +Output: + +Produces a FastA-style representation of the K-mer frequency across each target sequence +as well as a file describing statistics for each target sequence. The FastA-style +output might look like this:: + + >Chr4 + 31 31 31 29 29 29 28 27 27 28 28 28 30 30 30 29 29 29 29 31 32 32 33 33 33 31 29 30 ... + +With an associated tab separated stats file which looks like this:: + + seq_name median mean gc% seq_length invalid_kmers %_invalid non_zero_bases %_non_zero %_non_zero_corrected + Chr4 26 362.45141 0.36204 18585056 3214 0.01729 18549840 99.81065 99.82792 + +The column headers have the following meaning: + * seq_name - The name of the FastA/Q sequence + * median - The median K-mer coverage across the sequence + * mean - The mean K-mer coverage across the sequence + * gc% - The GC% of the sequence + * seq_length - The length of the sequence + * invalid_kmers - The number of K-mers in the sequence that cannot be counted, most likely due to being non-canonical. i.e. non A,T,G,C + * %_invalid - The percentage of the sequence which contains invalid K-mers + * non_zero_kmers - The number of K-mers that have a coverage of 1 or greater + * %_non_zero - The percentage of the sequence which has a K-mer coverage greater than 1 + * %_non_zero_corrected - The percentage of the sequence which has a K-mer coverage greater than 1 but ignoring any parts of the sequence represented by invalid K-mers. + Applications: - * Analyse kmer coverage across assembled sequences - * Compare assemblies using kmers, helpful to levels of contamination of a specific organism. - * Contamination detection - Compare kmer spectrum against assembly providing average coverage and GC values for each contig, which can be 2D binned and plot as a heatmap + * Analyse K-mer coverage across assembled sequences + * Compare assemblies using K-mers, helpful to levels of contamination of a specific organism. + * Contamination detection - Compare K-mer spectrum against assembly providing average coverage and GC values for each contig, which can be 2D binned and plot as a heatmap Filtering tools @@ -173,7 +274,7 @@ modify axis, titles, limits, size, resolution, etc, although they will all try t intelligent defaults directly from the data provided. -Spectra_hist +Spectra hist ~~~~~~~~~~~~ Visualises the K-mer spectra from ``kat hist`` or ``jellyfish histo`` output. @@ -234,7 +335,7 @@ Applications: :scale: 66% -Spectra_CN +Spectra CN ~~~~~~~~~~ Shows K-mer duplication levels, which correspond to copy number variation within @@ -254,7 +355,7 @@ Applications: -Spectra_mx +Spectra MX ~~~~~~~~~~ Produces K-mer spectras from rows or columns in a matrix generated by ``kat comp``. diff --git a/lib/include/kat/str_utils.hpp b/lib/include/kat/str_utils.hpp index ad003371..f2a4bcb5 100644 --- a/lib/include/kat/str_utils.hpp +++ b/lib/include/kat/str_utils.hpp @@ -184,9 +184,13 @@ namespace kat { for (const auto& c : merstr) { switch (c) { case 'A': + case 'a': case 'T': + case 't': case 'G': + case 'g': case 'C': + case 'c': break; default: return false; diff --git a/src/sect.cc b/src/sect.cc index 292f5ad7..4f55eb39 100644 --- a/src/sect.cc +++ b/src/sect.cc @@ -186,7 +186,7 @@ void kat::Sect::processSeqFile() { // Average sequence coverage and GC% scores output stream ofstream cvg_gc_stream(string(outputPrefix.string() + "-stats.tsv").c_str()); - cvg_gc_stream << "seq_name\tmedian\tmean\tgc%\tseq_length\tinvalid_bases\t%_invalid\tnon_zero_bases\t%_non_zero\t%_non_zero_corrected" << endl; + cvg_gc_stream << "seq_name\tmedian\tmean\tgc%\tseq_length\tinvalid_kmers\t%_invalid\tnon_zero_kmers\t%_non_zero\t%_non_zero_corrected" << endl; // Processes sequences in batches of records to reduce memory requirements while (!seqan::atEnd(reader)) { From 50ff0f7d0062b3f2efe28bdc52b52c4fbe224e21 Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Fri, 10 Mar 2017 17:24:28 +0000 Subject: [PATCH 2/5] Tweaked wording in description of SECT in docs. --- doc/source/using.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/using.rst b/doc/source/using.rst index ae322ca1..20c70e44 100644 --- a/doc/source/using.rst +++ b/doc/source/using.rst @@ -176,11 +176,11 @@ SECT Estimates coverage levels across sequences in the provided input sequence file. This tool will produce a FastA style representation of the input sequence file -containing K-mer coverage counts mapped across each sequence. K-mer coverage is +containing K-mer coverage counts mapped across each sequence separated by spaces. +K-mer coverage is determined from the provided counts input file, which can be either one jellyfish -hash, or one or more FastA / FastQ files. In addition, a space separated table -file containing the mean coverage score and GC of each sequence is produced. The -row order is identical to the original sequence file. +hash, or one or more FastA / FastQ files. In addition, a file containing statistics +about each target sequence is produced. NOTE: K-mers containing any Ns derived from sequences in the sequence file not be included. @@ -200,7 +200,7 @@ output might look like this:: With an associated tab separated stats file which looks like this:: - seq_name median mean gc% seq_length invalid_kmers %_invalid non_zero_bases %_non_zero %_non_zero_corrected + seq_name median mean gc% seq_length invalid_kmers %_invalid non_zero_kmers %_non_zero %_non_zero_corrected Chr4 26 362.45141 0.36204 18585056 3214 0.01729 18549840 99.81065 99.82792 The column headers have the following meaning: From fd045e9295ea3243c7ffd2acfa3821ad38812bbf Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Fri, 10 Mar 2017 17:39:35 +0000 Subject: [PATCH 3/5] Added kmers in seq to SECT stats output. --- doc/source/using.rst | 5 +++-- src/sect.cc | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/doc/source/using.rst b/doc/source/using.rst index 20c70e44..60899374 100644 --- a/doc/source/using.rst +++ b/doc/source/using.rst @@ -200,8 +200,8 @@ output might look like this:: With an associated tab separated stats file which looks like this:: - seq_name median mean gc% seq_length invalid_kmers %_invalid non_zero_kmers %_non_zero %_non_zero_corrected - Chr4 26 362.45141 0.36204 18585056 3214 0.01729 18549840 99.81065 99.82792 + seq_name median mean gc% seq_length kmers_in_seq invalid_kmers %_invalid non_zero_kmers %_non_zero %_non_zero_corrected + Chr4 26 362.45141 0.36204 18585056 18585036 3214 0.01729 18549840 99.81065 99.82792 The column headers have the following meaning: * seq_name - The name of the FastA/Q sequence @@ -209,6 +209,7 @@ The column headers have the following meaning: * mean - The mean K-mer coverage across the sequence * gc% - The GC% of the sequence * seq_length - The length of the sequence + * kmers_in_seq - The number of K-mers in the sequence. i.e. seq_length - K + 1 * invalid_kmers - The number of K-mers in the sequence that cannot be counted, most likely due to being non-canonical. i.e. non A,T,G,C * %_invalid - The percentage of the sequence which contains invalid K-mers * non_zero_kmers - The number of K-mers that have a coverage of 1 or greater diff --git a/src/sect.cc b/src/sect.cc index 4f55eb39..6f2b0636 100644 --- a/src/sect.cc +++ b/src/sect.cc @@ -186,7 +186,7 @@ void kat::Sect::processSeqFile() { // Average sequence coverage and GC% scores output stream ofstream cvg_gc_stream(string(outputPrefix.string() + "-stats.tsv").c_str()); - cvg_gc_stream << "seq_name\tmedian\tmean\tgc%\tseq_length\tinvalid_kmers\t%_invalid\tnon_zero_kmers\t%_non_zero\t%_non_zero_corrected" << endl; + cvg_gc_stream << "seq_name\tmedian\tmean\tgc%\tseq_length\tkmers_in_seq\tinvalid_kmers\t%_invalid\tnon_zero_kmers\t%_non_zero\t%_non_zero_corrected" << endl; // Processes sequences in batches of records to reduce memory requirements while (!seqan::atEnd(reader)) { @@ -425,7 +425,8 @@ void kat::Sect::printStatTable(std::ostream &out) { << (*medians)[i] << "\t" << (*means)[i] << "\t" << (*gcs)[i] << "\t" - << (*lengths)[i] << "\t" + << (*lengths)[i] << "\t" + << (*lengths)[i] - this->input.merLen + 1 << "\t" << (*invalid)[i] << "\t" << (*percentInvalid)[i] << "\t" << (*nonZero)[i] << "\t" From 46288b88030196a8b725139273f2df5866a943de Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Mon, 13 Mar 2017 12:04:24 +0000 Subject: [PATCH 4/5] Added stack trace for seg faults. --- src/kat.cc | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/src/kat.cc b/src/kat.cc index 74f7c2dc..a1580739 100644 --- a/src/kat.cc +++ b/src/kat.cc @@ -19,6 +19,8 @@ #include #endif +#include +#include #include #include #include @@ -114,6 +116,21 @@ const string helpMessage() { } +void handler(int sig) { + void *array[10]; + size_t size; + + // get void*'s for all entries on the stack + size = backtrace(array, 10); + + // print out all the frames to stderr + fprintf(stderr, "Error: signal %d:\n", sig); + fprintf(stderr, "Stack trace:\n"); + backtrace_symbols_fd(array, size, STDERR_FILENO); + exit(1); +} + + /** * Start point for KAT. Processes the start of the command line and then @@ -121,6 +138,7 @@ const string helpMessage() { */ int main(int argc, char *argv[]) { + signal(SIGSEGV, handler); try { // KAT args string modeStr; @@ -224,11 +242,11 @@ int main(int argc, char *argv[]) } } catch(po::error& e) { - cerr << "Error: Parsing Command Line: " << e.what() << endl; - return 1; + cerr << "Error: Parsing Command Line: " << e.what() << endl; + return 1; } catch (boost::exception &e) { - cerr << boost::diagnostic_information(e); + cerr << boost::diagnostic_information(e); return 4; } catch (exception& e) { cerr << "Error: " << e.what() << endl; From 17a8ec10b3ca3c4ac0f3dca26c4106213a999584 Mon Sep 17 00:00:00 2001 From: Daniel Mapleson Date: Fri, 7 Apr 2017 15:08:35 +0100 Subject: [PATCH 5/5] Fixed a bug which was causing a seg fault when using the comp tool with bin sizes larger than the default value. --- configure.ac | 2 +- doc/source/conf.py | 4 ++-- lib/include/kat/comp_counters.hpp | 4 ++++ lib/include/kat/sparse_matrix.hpp | 2 +- lib/src/comp_counters.cc | 4 ++++ src/comp.cc | 3 +-- 6 files changed, 13 insertions(+), 6 deletions(-) diff --git a/configure.ac b/configure.ac index 36f4416b..8e70640d 100644 --- a/configure.ac +++ b/configure.ac @@ -4,7 +4,7 @@ # Autoconf initialistion. Sets package name version and contact details AC_PREREQ([2.68]) -AC_INIT([kat],[2.3.2],[https://github.com/TGAC/KAT/issues],[kat],[https://github.com/TGAC/KAT]) +AC_INIT([kat],[2.3.3],[https://github.com/TGAC/KAT/issues],[kat],[https://github.com/TGAC/KAT]) AC_CONFIG_SRCDIR([src/kat.cc]) AC_CONFIG_AUX_DIR([build-aux]) AC_CONFIG_MACRO_DIR([m4]) diff --git a/doc/source/conf.py b/doc/source/conf.py index 0816bdf2..a7d5e020 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -52,9 +52,9 @@ # built documents. # # The short X.Y version. -version = '2.3.2' +version = '2.3.3' # The full version, including alpha/beta/rc tags. -release = '2.3.2' +release = '2.3.3' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/lib/include/kat/comp_counters.hpp b/lib/include/kat/comp_counters.hpp index e482a88a..5ce341d4 100644 --- a/lib/include/kat/comp_counters.hpp +++ b/lib/include/kat/comp_counters.hpp @@ -57,6 +57,8 @@ class CompCounters { path hash3_path; CompCounters(); + + CompCounters(const size_t _dm_size); CompCounters(const path& _hash1_path, const path& _hash2_path, const path& _hash3_path, const size_t _dm_size); @@ -92,6 +94,8 @@ class ThreadedCompCounters { public: ThreadedCompCounters(); + + ThreadedCompCounters(const size_t _dm_size); ThreadedCompCounters(const path& _hash1_path, const path& _hash2_path, const path& _hash3_path, const size_t _dm_size); diff --git a/lib/include/kat/sparse_matrix.hpp b/lib/include/kat/sparse_matrix.hpp index 7e9423c0..fed458a9 100644 --- a/lib/include/kat/sparse_matrix.hpp +++ b/lib/include/kat/sparse_matrix.hpp @@ -323,7 +323,7 @@ class ThreadedSparseMatrix { } const SM64& mergeThreadedMatricies() { - // Merge matrix + // Merge matrix for (int i = 0; i < width; i++) { for (int j = 0; j < height; j++) { for (int k = 0; k < threads; k++) { diff --git a/lib/src/comp_counters.cc b/lib/src/comp_counters.cc index a219fb5d..37ab8ffa 100644 --- a/lib/src/comp_counters.cc +++ b/lib/src/comp_counters.cc @@ -39,6 +39,8 @@ using kat::DistanceMetric; kat::CompCounters::CompCounters() : CompCounters("", "", "", DEFAULT_NB_BINS) {} +kat::CompCounters::CompCounters(const size_t _dm_size) : CompCounters("", "", "", _dm_size) {} + kat::CompCounters::CompCounters(const path& _hash1_path, const path& _hash2_path, const path& _hash3_path, const size_t _dm_size) : hash1_path(_hash1_path), hash2_path(_hash2_path), hash3_path(_hash3_path) { @@ -208,6 +210,8 @@ void kat::CompCounters::printCounts(ostream &out) { kat::ThreadedCompCounters::ThreadedCompCounters() : ThreadedCompCounters("", "", "", DEFAULT_NB_BINS) {} +kat::ThreadedCompCounters::ThreadedCompCounters(const size_t _dm_size) : ThreadedCompCounters("", "", "", _dm_size) {} + kat::ThreadedCompCounters::ThreadedCompCounters(const path& _hash1_path, const path& _hash2_path, const path& _hash3_path, const size_t _dm_size) { final_matrix = CompCounters(_hash1_path, _hash2_path, _hash3_path, _dm_size); } diff --git a/src/comp.cc b/src/comp.cc index 8664fe6f..41813776 100644 --- a/src/comp.cc +++ b/src/comp.cc @@ -253,7 +253,6 @@ void kat::Comp::merge() { cout << "Merging results ..."; cout.flush(); - // Merge results from the threads main_matrix.mergeThreadedMatricies(); if (doThirdHash()) { @@ -390,7 +389,7 @@ void kat::Comp::compare() { void kat::Comp::compareSlice(int th_id) { - shared_ptr cc = make_shared(); + shared_ptr cc = make_shared(std::min(this->d1Bins, this->d2Bins)); // Setup iterator for this thread's chunk of hash1 LargeHashArray::eager_iterator hash1Iterator = input[0].hash->eager_slice(th_id, threads);