From 8284cdd0dc593bf6988a68ca81bb2d8136d56de5 Mon Sep 17 00:00:00 2001
From: Daniel Mapleson <daniel.mapleson@tgac.ac.uk>
Date: Fri, 10 Mar 2017 17:10:25 +0000
Subject: [PATCH 1/5] Updated documentation to include description of output
 produced by each tool.

In SECT stats output, replaced the header description describing 'bases' with 'kmers'.

In SECT we now permit lowercase bases to be considered as valid K-mers.
---
 doc/source/using.rst          | 117 +++++++++++++++++++++++++++++++---
 lib/include/kat/str_utils.hpp |   4 ++
 src/sect.cc                   |   2 +-
 3 files changed, 114 insertions(+), 9 deletions(-)
diff --git a/doc/source/using.rst b/doc/source/using.rst
index 6a19a2f7..ae322ca1 100644
--- a/doc/source/using.rst
+++ b/doc/source/using.rst
@@ -27,6 +27,33 @@ Basic usage::
     kat hist [options] (<input>)+
 
 
+Output:
+
+Produces a histogram file and associated `Spectra hist`_ plot.
+The histogram file starts with a header describing settings used to generate the
+histogram.  Each header line starts with a '#' character.  The histogram that follows
+describes each K-mer frequency followed by the number of distinct K-mers found
+at that frequency separated by a space.  Each frequency / count pair is line separated.
+For example::
+
+    # Title:27-mer spectra for: SRR519624_1.1M.fastq
+    # XLabel:27-mer frequency
+    # YLabel:# distinct 27-mers
+    # Kmer value:27
+    # Input 1:SRR519624_1.1M.fastq
+    ###
+    1 47573743
+    2 4737789
+    3 732453
+    4 184505
+    5 100293
+    6 78699
+    7 68553
+    8 59589
+    9 50926
+    ...
+
+
 Applications:
 
  * Assess data quality: estimates of kmers deriving from errors; sequencing bias
@@ -51,6 +78,28 @@ Basic usage::
 
     kat gcp (<input>)+
 
+Output:
+
+Produces a matrix file and associated `Density`_ plot. The matrix file starts with 
+a header describing settings used to generate the matrix.  Each header line starts 
+with a '#' character.  The matrix that follows contains a space separated list of
+distinct k-mer counts for the GC-count indexed by the row.  Each column index represents
+the K-mer Frequency. For example::
+
+    # Title:K-mer coverage vs GC count plot for: ERR409722_1.fastq ERR409722_2.fastq 
+    # XLabel:K-mer multiplicity
+    # YLabel:GC count
+    # ZLabel:Distinct K-mers per bin
+    # Columns:1001
+    # Rows:27
+    # MaxVal:7705834
+    # Transpose:0
+    ###
+    0 65392 10715 6038 4615 3769 3140 2690 2133 1748 1519 1370 1098 840 ...
+    0 189337 30772 20040 16630 15579 13673 12809 11890 10380 9605 8403 7370 6302 ...
+    0 428150 66753 41453 37478 34599 34622 34572 32740 31487 30356 28369 26880 ...
+    ...
+
 Applications:
 
  * Assess data quality: estimates of kmers deriving from errors; sequencing bias
@@ -88,11 +137,37 @@ an assembly::
 
     kat comp -t 8 -o pe_v_asm_test 'PE1.R?.fq' asm.fa
 
+... or if the reads are gzipped::
+    
+    kat comp -t 8 -o pe_v_asm_test <(gunzip -c 'PE1.R?.fq') asm.fa
+
+Output:
+
+Produces a matrix file and by default a `Spectra CN`_ plot, although can also produce
+a `Density` plot if requested. The matrix file is structured in a similar way to the
+GCP tool with a header describing settings used to generate the matrix.  Each header line starts 
+with a '#' character.  The matrix that follows contains a space separated list of
+distinct k-mer counts for the frequency in each input file represented by the row 
+and column index. For example::
+
+    # Title:K-mer comparison plot
+    # XLabel:K-mer multiplicity for: ERR409722_1.fastq
+    # YLabel:K-mer multiplicity for: ERR409722_2.fastq
+    # ZLabel:Distinct K-mers per bin
+    # Columns:1001
+    # Rows:1001
+    # MaxVal:57106148
+    # Transpose:1
+    ###
+    0 57106148 2133673 428934 134189 45267 16399 6603 3374 2066 1371 930 752 490 ...
+    50919938 10364720 1613532 607932 239439 89985 36398 16589 8811 5469 3369 ...
+    1990321 1605550 1061952 561999 271443 125163 61769 34379 22459 15647 11171 ...
+    ...
 
 Applications:
 
  * Determine sequencing bias between left and right read pairs.
- * Compare the kmer spectrum of input reads against an assembly to gauge assembly completeness
+ * Compare the kmer spectrum of input reads against an assembly to gauge assembly completeness.
 
 
 
@@ -100,7 +175,7 @@ SECT
 ----
 
 Estimates coverage levels across sequences in the provided input sequence file.
-This tool will produce a fasta style representation of the input sequence file 
+This tool will produce a FastA style representation of the input sequence file 
 containing K-mer coverage counts mapped across each sequence.  K-mer coverage is 
 determined from the provided counts input file, which can be either one jellyfish 
 hash, or one or more FastA / FastQ files.  In addition, a space separated table 
@@ -114,12 +189,38 @@ Basic usage::
 
     kat sect [options] <sequence_file> (<input>)+
 
+Output:
+
+Produces a FastA-style representation of the K-mer frequency across each target sequence
+as well as a file describing statistics for each target sequence.  The FastA-style
+output might look like this::
+
+    >Chr4
+    31 31 31 29 29 29 28 27 27 28 28 28 30 30 30 29 29 29 29 31 32 32 33 33 33 31 29 30 ...
+
+With an associated tab separated stats file which looks like this::
+
+    seq_name median       mean      gc%  seq_length  invalid_kmers  %_invalid  non_zero_bases  %_non_zero  %_non_zero_corrected
+    Chr4         26  362.45141  0.36204    18585056           3214    0.01729        18549840    99.81065              99.82792
+
+The column headers have the following meaning:
+ * seq_name - The name of the FastA/Q sequence
+ * median - The median K-mer coverage across the sequence
+ * mean - The mean K-mer coverage across the sequence
+ * gc% - The GC% of the sequence
+ * seq_length - The length of the sequence
+ * invalid_kmers - The number of K-mers in the sequence that cannot be counted, most likely due to being non-canonical.  i.e. non A,T,G,C
+ * %_invalid - The percentage of the sequence which contains invalid K-mers
+ * non_zero_kmers - The number of K-mers that have a coverage of 1 or greater
+ * %_non_zero - The percentage of the sequence which has a K-mer coverage greater than 1
+ * %_non_zero_corrected - The percentage of the sequence which has a K-mer coverage greater than 1 but ignoring any parts of the sequence represented by invalid K-mers.
+
 
 Applications:
 
- * Analyse kmer coverage across assembled sequences
- * Compare assemblies using kmers, helpful to levels of contamination of a specific organism.
- * Contamination detection - Compare kmer spectrum against assembly providing average coverage and GC values for each contig, which can be 2D binned and plot as a heatmap
+ * Analyse K-mer coverage across assembled sequences
+ * Compare assemblies using K-mers, helpful to levels of contamination of a specific organism.
+ * Contamination detection - Compare K-mer spectrum against assembly providing average coverage and GC values for each contig, which can be 2D binned and plot as a heatmap
 
 
 Filtering tools
@@ -173,7 +274,7 @@ modify axis, titles, limits, size, resolution, etc, although they will all try t
 intelligent defaults directly from the data provided.  
 
 
-Spectra_hist
+Spectra hist
 ~~~~~~~~~~~~
 
 Visualises the K-mer spectra from ``kat hist`` or ``jellyfish histo`` output.  
@@ -234,7 +335,7 @@ Applications:
     :scale: 66%
 
 
-Spectra_CN
+Spectra CN
 ~~~~~~~~~~
 
 Shows K-mer duplication levels, which correspond to copy number variation within 
@@ -254,7 +355,7 @@ Applications:
 
 
 
-Spectra_mx
+Spectra MX
 ~~~~~~~~~~
 
 Produces K-mer spectras from rows or columns in a matrix generated by ``kat comp``.  
diff --git a/lib/include/kat/str_utils.hpp b/lib/include/kat/str_utils.hpp
index ad003371..f2a4bcb5 100644
--- a/lib/include/kat/str_utils.hpp
+++ b/lib/include/kat/str_utils.hpp
@@ -184,9 +184,13 @@ namespace kat {
         for (const auto& c : merstr) {
             switch (c) {
                 case 'A':
+				case 'a':
                 case 'T':
+				case 't':
                 case 'G':
+				case 'g':
                 case 'C':
+				case 'c':
                     break;
                 default:
                     return false;
diff --git a/src/sect.cc b/src/sect.cc
index 292f5ad7..4f55eb39 100644
--- a/src/sect.cc
+++ b/src/sect.cc
@@ -186,7 +186,7 @@ void kat::Sect::processSeqFile() {
 
     // Average sequence coverage and GC% scores output stream
     ofstream cvg_gc_stream(string(outputPrefix.string() + "-stats.tsv").c_str());
-    cvg_gc_stream << "seq_name\tmedian\tmean\tgc%\tseq_length\tinvalid_bases\t%_invalid\tnon_zero_bases\t%_non_zero\t%_non_zero_corrected" << endl;
+    cvg_gc_stream << "seq_name\tmedian\tmean\tgc%\tseq_length\tinvalid_kmers\t%_invalid\tnon_zero_kmers\t%_non_zero\t%_non_zero_corrected" << endl;
     
     // Processes sequences in batches of records to reduce memory requirements
     while (!seqan::atEnd(reader)) {

From 50ff0f7d0062b3f2efe28bdc52b52c4fbe224e21 Mon Sep 17 00:00:00 2001
From: Daniel Mapleson <daniel.mapleson@tgac.ac.uk>
Date: Fri, 10 Mar 2017 17:24:28 +0000
Subject: [PATCH 2/5] Tweaked wording in description of SECT in docs.

---
 doc/source/using.rst | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/doc/source/using.rst b/doc/source/using.rst
index ae322ca1..20c70e44 100644
--- a/doc/source/using.rst
+++ b/doc/source/using.rst
@@ -176,11 +176,11 @@ SECT
 
 Estimates coverage levels across sequences in the provided input sequence file.
 This tool will produce a FastA style representation of the input sequence file 
-containing K-mer coverage counts mapped across each sequence.  K-mer coverage is 
+containing K-mer coverage counts mapped across each sequence separated by spaces.  
+K-mer coverage is 
 determined from the provided counts input file, which can be either one jellyfish 
-hash, or one or more FastA / FastQ files.  In addition, a space separated table 
-file containing the mean coverage score and GC of each sequence is produced.  The 
-row order is identical to the original sequence file.
+hash, or one or more FastA / FastQ files.  In addition, a file containing statistics
+about each target sequence is produced.
 
 NOTE: K-mers containing any Ns derived from sequences in the sequence file not be 
 included.
@@ -200,7 +200,7 @@ output might look like this::
 
 With an associated tab separated stats file which looks like this::
 
-    seq_name median       mean      gc%  seq_length  invalid_kmers  %_invalid  non_zero_bases  %_non_zero  %_non_zero_corrected
+    seq_name median       mean      gc%  seq_length  invalid_kmers  %_invalid  non_zero_kmers  %_non_zero  %_non_zero_corrected
     Chr4         26  362.45141  0.36204    18585056           3214    0.01729        18549840    99.81065              99.82792
 
 The column headers have the following meaning:

From fd045e9295ea3243c7ffd2acfa3821ad38812bbf Mon Sep 17 00:00:00 2001
From: Daniel Mapleson <daniel.mapleson@tgac.ac.uk>
Date: Fri, 10 Mar 2017 17:39:35 +0000
Subject: [PATCH 3/5] Added kmers in seq to SECT stats output.

---
 doc/source/using.rst | 5 +++--
 src/sect.cc          | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/doc/source/using.rst b/doc/source/using.rst
index 20c70e44..60899374 100644
--- a/doc/source/using.rst
+++ b/doc/source/using.rst
@@ -200,8 +200,8 @@ output might look like this::
 
 With an associated tab separated stats file which looks like this::
 
-    seq_name median       mean      gc%  seq_length  invalid_kmers  %_invalid  non_zero_kmers  %_non_zero  %_non_zero_corrected
-    Chr4         26  362.45141  0.36204    18585056           3214    0.01729        18549840    99.81065              99.82792
+    seq_name median       mean      gc%  seq_length  kmers_in_seq  invalid_kmers  %_invalid  non_zero_kmers  %_non_zero  %_non_zero_corrected
+    Chr4         26  362.45141  0.36204    18585056      18585036           3214    0.01729        18549840    99.81065              99.82792
 
 The column headers have the following meaning:
  * seq_name - The name of the FastA/Q sequence
@@ -209,6 +209,7 @@ The column headers have the following meaning:
  * mean - The mean K-mer coverage across the sequence
  * gc% - The GC% of the sequence
  * seq_length - The length of the sequence
+ * kmers_in_seq - The number of K-mers in the sequence.  i.e. seq_length - K + 1
  * invalid_kmers - The number of K-mers in the sequence that cannot be counted, most likely due to being non-canonical.  i.e. non A,T,G,C
  * %_invalid - The percentage of the sequence which contains invalid K-mers
  * non_zero_kmers - The number of K-mers that have a coverage of 1 or greater
diff --git a/src/sect.cc b/src/sect.cc
index 4f55eb39..6f2b0636 100644
--- a/src/sect.cc
+++ b/src/sect.cc
@@ -186,7 +186,7 @@ void kat::Sect::processSeqFile() {
 
     // Average sequence coverage and GC% scores output stream
     ofstream cvg_gc_stream(string(outputPrefix.string() + "-stats.tsv").c_str());
-    cvg_gc_stream << "seq_name\tmedian\tmean\tgc%\tseq_length\tinvalid_kmers\t%_invalid\tnon_zero_kmers\t%_non_zero\t%_non_zero_corrected" << endl;
+    cvg_gc_stream << "seq_name\tmedian\tmean\tgc%\tseq_length\tkmers_in_seq\tinvalid_kmers\t%_invalid\tnon_zero_kmers\t%_non_zero\t%_non_zero_corrected" << endl;
     
     // Processes sequences in batches of records to reduce memory requirements
     while (!seqan::atEnd(reader)) {
@@ -425,7 +425,8 @@ void kat::Sect::printStatTable(std::ostream &out) {
             << (*medians)[i] << "\t" 
             << (*means)[i] << "\t" 
             << (*gcs)[i] << "\t" 
-            << (*lengths)[i] << "\t" 
+            << (*lengths)[i] << "\t"
+            << (*lengths)[i] - this->input.merLen + 1 << "\t"
             << (*invalid)[i] << "\t"
             << (*percentInvalid)[i] << "\t"
             << (*nonZero)[i] << "\t"

From 46288b88030196a8b725139273f2df5866a943de Mon Sep 17 00:00:00 2001
From: Daniel Mapleson <daniel.mapleson@tgac.ac.uk>
Date: Mon, 13 Mar 2017 12:04:24 +0000
Subject: [PATCH 4/5] Added stack trace for seg faults.

---
 src/kat.cc | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/src/kat.cc b/src/kat.cc
index 74f7c2dc..a1580739 100644
--- a/src/kat.cc
+++ b/src/kat.cc
@@ -19,6 +19,8 @@
 #include <config.h>
 #endif
 
+#include <execinfo.h>
+#include <signal.h>
 #include <sys/ioctl.h>
 #include <string.h>
 #include <exception>
@@ -114,6 +116,21 @@ const string helpMessage() {
 }
 
 
+void handler(int sig) {
+    void *array[10];
+    size_t size;
+
+    // get void*'s for all entries on the stack
+    size = backtrace(array, 10);
+
+    // print out all the frames to stderr
+    fprintf(stderr, "Error: signal %d:\n", sig);
+    fprintf(stderr, "Stack trace:\n");
+    backtrace_symbols_fd(array, size, STDERR_FILENO);
+    exit(1);
+}
+
+
 
 /**
  * Start point for KAT.  Processes the start of the command line and then
@@ -121,6 +138,7 @@ const string helpMessage() {
  */
 int main(int argc, char *argv[])
 {
+    signal(SIGSEGV, handler);
     try {
         // KAT args
         string modeStr;
@@ -224,11 +242,11 @@ int main(int argc, char *argv[])
         }
                 
     } catch(po::error& e) { 
-        cerr << "Error: Parsing Command Line: " << e.what() << endl; 
-        return 1; 
+        cerr << "Error: Parsing Command Line: " << e.what() << endl;
+                return 1;
     } 
     catch (boost::exception &e) { 
-        cerr << boost::diagnostic_information(e); 
+        cerr << boost::diagnostic_information(e);
         return 4;
     } catch (exception& e) {
         cerr << "Error: " << e.what() << endl;

From 17a8ec10b3ca3c4ac0f3dca26c4106213a999584 Mon Sep 17 00:00:00 2001
From: Daniel Mapleson <daniel.mapleson@tgac.ac.uk>
Date: Fri, 7 Apr 2017 15:08:35 +0100
Subject: [PATCH 5/5] Fixed a bug which was causing a seg fault when using the
 comp tool with bin sizes larger than the default value.

---
 configure.ac                      | 2 +-
 doc/source/conf.py                | 4 ++--
 lib/include/kat/comp_counters.hpp | 4 ++++
 lib/include/kat/sparse_matrix.hpp | 2 +-
 lib/src/comp_counters.cc          | 4 ++++
 src/comp.cc                       | 3 +--
 6 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/configure.ac b/configure.ac
index 36f4416b..8e70640d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -4,7 +4,7 @@
 
 # Autoconf initialistion.  Sets package name version and contact details
 AC_PREREQ([2.68])
-AC_INIT([kat],[2.3.2],[https://github.com/TGAC/KAT/issues],[kat],[https://github.com/TGAC/KAT])
+AC_INIT([kat],[2.3.3],[https://github.com/TGAC/KAT/issues],[kat],[https://github.com/TGAC/KAT])
 AC_CONFIG_SRCDIR([src/kat.cc])
 AC_CONFIG_AUX_DIR([build-aux])
 AC_CONFIG_MACRO_DIR([m4])
diff --git a/doc/source/conf.py b/doc/source/conf.py
index 0816bdf2..a7d5e020 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -52,9 +52,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '2.3.2'
+version = '2.3.3'
 # The full version, including alpha/beta/rc tags.
-release = '2.3.2'
+release = '2.3.3'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/lib/include/kat/comp_counters.hpp b/lib/include/kat/comp_counters.hpp
index e482a88a..5ce341d4 100644
--- a/lib/include/kat/comp_counters.hpp
+++ b/lib/include/kat/comp_counters.hpp
@@ -57,6 +57,8 @@ class CompCounters {
     path hash3_path;
 
     CompCounters();
+	
+	CompCounters(const size_t _dm_size);
 
     CompCounters(const path& _hash1_path, const path& _hash2_path, const path& _hash3_path, const size_t _dm_size);
 
@@ -92,6 +94,8 @@ class ThreadedCompCounters {
 public:
 
     ThreadedCompCounters();
+	
+	ThreadedCompCounters(const size_t _dm_size);
 
     ThreadedCompCounters(const path& _hash1_path, const path& _hash2_path, const path& _hash3_path, const size_t _dm_size);
 
diff --git a/lib/include/kat/sparse_matrix.hpp b/lib/include/kat/sparse_matrix.hpp
index 7e9423c0..fed458a9 100644
--- a/lib/include/kat/sparse_matrix.hpp
+++ b/lib/include/kat/sparse_matrix.hpp
@@ -323,7 +323,7 @@ class ThreadedSparseMatrix {
     }
 
     const SM64& mergeThreadedMatricies() {
-        // Merge matrix
+		// Merge matrix
         for (int i = 0; i < width; i++) {
             for (int j = 0; j < height; j++) {
                 for (int k = 0; k < threads; k++) {
diff --git a/lib/src/comp_counters.cc b/lib/src/comp_counters.cc
index a219fb5d..37ab8ffa 100644
--- a/lib/src/comp_counters.cc
+++ b/lib/src/comp_counters.cc
@@ -39,6 +39,8 @@ using kat::DistanceMetric;
 
 kat::CompCounters::CompCounters() : CompCounters("", "", "", DEFAULT_NB_BINS) {}
 
+kat::CompCounters::CompCounters(const size_t _dm_size) : CompCounters("", "", "", _dm_size) {}
+
 kat::CompCounters::CompCounters(const path& _hash1_path, const path& _hash2_path, const path& _hash3_path, const size_t _dm_size) :
         hash1_path(_hash1_path), hash2_path(_hash2_path), hash3_path(_hash3_path) {
 
@@ -208,6 +210,8 @@ void kat::CompCounters::printCounts(ostream &out) {
 
 kat::ThreadedCompCounters::ThreadedCompCounters() : ThreadedCompCounters("", "", "", DEFAULT_NB_BINS) {}
 
+kat::ThreadedCompCounters::ThreadedCompCounters(const size_t _dm_size) : ThreadedCompCounters("", "", "", _dm_size) {}
+
 kat::ThreadedCompCounters::ThreadedCompCounters(const path& _hash1_path, const path& _hash2_path, const path& _hash3_path, const size_t _dm_size) {
     final_matrix = CompCounters(_hash1_path, _hash2_path, _hash3_path, _dm_size);            
 }
diff --git a/src/comp.cc b/src/comp.cc
index 8664fe6f..41813776 100644
--- a/src/comp.cc
+++ b/src/comp.cc
@@ -253,7 +253,6 @@ void kat::Comp::merge() {
 
     cout << "Merging results ...";
     cout.flush();
-
     // Merge results from the threads
     main_matrix.mergeThreadedMatricies();
     if (doThirdHash()) {
@@ -390,7 +389,7 @@ void kat::Comp::compare() {
 
 void kat::Comp::compareSlice(int th_id) {
 
-    shared_ptr<CompCounters> cc = make_shared<CompCounters>();
+    shared_ptr<CompCounters> cc = make_shared<CompCounters>(std::min(this->d1Bins, this->d2Bins));
 
     // Setup iterator for this thread's chunk of hash1
     LargeHashArray::eager_iterator hash1Iterator = input[0].hash->eager_slice(th_id, threads);