diff --git a/configure.ac b/configure.ac
index 36f4416b..8e70640d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -4,7 +4,7 @@
# Autoconf initialistion. Sets package name version and contact details
AC_PREREQ([2.68])
-AC_INIT([kat],[2.3.2],[https://github.com/TGAC/KAT/issues],[kat],[https://github.com/TGAC/KAT])
+AC_INIT([kat],[2.3.3],[https://github.com/TGAC/KAT/issues],[kat],[https://github.com/TGAC/KAT])
AC_CONFIG_SRCDIR([src/kat.cc])
AC_CONFIG_AUX_DIR([build-aux])
AC_CONFIG_MACRO_DIR([m4])
diff --git a/doc/source/conf.py b/doc/source/conf.py
index 0816bdf2..a7d5e020 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -52,9 +52,9 @@
# built documents.
#
# The short X.Y version.
-version = '2.3.2'
+version = '2.3.3'
# The full version, including alpha/beta/rc tags.
-release = '2.3.2'
+release = '2.3.3'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
diff --git a/doc/source/using.rst b/doc/source/using.rst
index 6a19a2f7..60899374 100644
--- a/doc/source/using.rst
+++ b/doc/source/using.rst
@@ -27,6 +27,33 @@ Basic usage::
kat hist [options] ()+
+Output:
+
+Produces a histogram file and associated `Spectra hist`_ plot.
+The histogram file starts with a header describing settings used to generate the
+histogram. Each header line starts with a '#' character. The histogram that follows
+describes each K-mer frequency followed by the number of distinct K-mers found
+at that frequency separated by a space. Each frequency / count pair is line separated.
+For example::
+
+ # Title:27-mer spectra for: SRR519624_1.1M.fastq
+ # XLabel:27-mer frequency
+ # YLabel:# distinct 27-mers
+ # Kmer value:27
+ # Input 1:SRR519624_1.1M.fastq
+ ###
+ 1 47573743
+ 2 4737789
+ 3 732453
+ 4 184505
+ 5 100293
+ 6 78699
+ 7 68553
+ 8 59589
+ 9 50926
+ ...
+
+
Applications:
* Assess data quality: estimates of kmers deriving from errors; sequencing bias
@@ -51,6 +78,28 @@ Basic usage::
kat gcp ()+
+Output:
+
+Produces a matrix file and associated `Density`_ plot. The matrix file starts with
+a header describing settings used to generate the matrix. Each header line starts
+with a '#' character. The matrix that follows contains a space separated list of
+distinct k-mer counts for the GC-count indexed by the row. Each column index represents
+the K-mer Frequency. For example::
+
+ # Title:K-mer coverage vs GC count plot for: ERR409722_1.fastq ERR409722_2.fastq
+ # XLabel:K-mer multiplicity
+ # YLabel:GC count
+ # ZLabel:Distinct K-mers per bin
+ # Columns:1001
+ # Rows:27
+ # MaxVal:7705834
+ # Transpose:0
+ ###
+ 0 65392 10715 6038 4615 3769 3140 2690 2133 1748 1519 1370 1098 840 ...
+ 0 189337 30772 20040 16630 15579 13673 12809 11890 10380 9605 8403 7370 6302 ...
+ 0 428150 66753 41453 37478 34599 34622 34572 32740 31487 30356 28369 26880 ...
+ ...
+
Applications:
* Assess data quality: estimates of kmers deriving from errors; sequencing bias
@@ -88,11 +137,37 @@ an assembly::
kat comp -t 8 -o pe_v_asm_test 'PE1.R?.fq' asm.fa
+... or if the reads are gzipped::
+
+ kat comp -t 8 -o pe_v_asm_test <(gunzip -c 'PE1.R?.fq') asm.fa
+
+Output:
+
+Produces a matrix file and by default a `Spectra CN`_ plot, although can also produce
+a `Density` plot if requested. The matrix file is structured in a similar way to the
+GCP tool with a header describing settings used to generate the matrix. Each header line starts
+with a '#' character. The matrix that follows contains a space separated list of
+distinct k-mer counts for the frequency in each input file represented by the row
+and column index. For example::
+
+ # Title:K-mer comparison plot
+ # XLabel:K-mer multiplicity for: ERR409722_1.fastq
+ # YLabel:K-mer multiplicity for: ERR409722_2.fastq
+ # ZLabel:Distinct K-mers per bin
+ # Columns:1001
+ # Rows:1001
+ # MaxVal:57106148
+ # Transpose:1
+ ###
+ 0 57106148 2133673 428934 134189 45267 16399 6603 3374 2066 1371 930 752 490 ...
+ 50919938 10364720 1613532 607932 239439 89985 36398 16589 8811 5469 3369 ...
+ 1990321 1605550 1061952 561999 271443 125163 61769 34379 22459 15647 11171 ...
+ ...
Applications:
* Determine sequencing bias between left and right read pairs.
- * Compare the kmer spectrum of input reads against an assembly to gauge assembly completeness
+ * Compare the kmer spectrum of input reads against an assembly to gauge assembly completeness.
@@ -100,12 +175,12 @@ SECT
----
Estimates coverage levels across sequences in the provided input sequence file.
-This tool will produce a fasta style representation of the input sequence file
-containing K-mer coverage counts mapped across each sequence. K-mer coverage is
+This tool will produce a FastA style representation of the input sequence file
+containing K-mer coverage counts mapped across each sequence separated by spaces.
+K-mer coverage is
determined from the provided counts input file, which can be either one jellyfish
-hash, or one or more FastA / FastQ files. In addition, a space separated table
-file containing the mean coverage score and GC of each sequence is produced. The
-row order is identical to the original sequence file.
+hash, or one or more FastA / FastQ files. In addition, a file containing statistics
+about each target sequence is produced.
NOTE: K-mers containing any Ns derived from sequences in the sequence file not be
included.
@@ -114,12 +189,39 @@ Basic usage::
kat sect [options] ()+
+Output:
+
+Produces a FastA-style representation of the K-mer frequency across each target sequence
+as well as a file describing statistics for each target sequence. The FastA-style
+output might look like this::
+
+ >Chr4
+ 31 31 31 29 29 29 28 27 27 28 28 28 30 30 30 29 29 29 29 31 32 32 33 33 33 31 29 30 ...
+
+With an associated tab separated stats file which looks like this::
+
+ seq_name median mean gc% seq_length kmers_in_seq invalid_kmers %_invalid non_zero_kmers %_non_zero %_non_zero_corrected
+ Chr4 26 362.45141 0.36204 18585056 18585036 3214 0.01729 18549840 99.81065 99.82792
+
+The column headers have the following meaning:
+ * seq_name - The name of the FastA/Q sequence
+ * median - The median K-mer coverage across the sequence
+ * mean - The mean K-mer coverage across the sequence
+ * gc% - The GC% of the sequence
+ * seq_length - The length of the sequence
+ * kmers_in_seq - The number of K-mers in the sequence. i.e. seq_length - K + 1
+ * invalid_kmers - The number of K-mers in the sequence that cannot be counted, most likely due to being non-canonical. i.e. non A,T,G,C
+ * %_invalid - The percentage of the sequence which contains invalid K-mers
+ * non_zero_kmers - The number of K-mers that have a coverage of 1 or greater
+ * %_non_zero - The percentage of the sequence which has a K-mer coverage greater than 1
+ * %_non_zero_corrected - The percentage of the sequence which has a K-mer coverage greater than 1 but ignoring any parts of the sequence represented by invalid K-mers.
+
Applications:
- * Analyse kmer coverage across assembled sequences
- * Compare assemblies using kmers, helpful to levels of contamination of a specific organism.
- * Contamination detection - Compare kmer spectrum against assembly providing average coverage and GC values for each contig, which can be 2D binned and plot as a heatmap
+ * Analyse K-mer coverage across assembled sequences
+ * Compare assemblies using K-mers, helpful to levels of contamination of a specific organism.
+ * Contamination detection - Compare K-mer spectrum against assembly providing average coverage and GC values for each contig, which can be 2D binned and plot as a heatmap
Filtering tools
@@ -173,7 +275,7 @@ modify axis, titles, limits, size, resolution, etc, although they will all try t
intelligent defaults directly from the data provided.
-Spectra_hist
+Spectra hist
~~~~~~~~~~~~
Visualises the K-mer spectra from ``kat hist`` or ``jellyfish histo`` output.
@@ -234,7 +336,7 @@ Applications:
:scale: 66%
-Spectra_CN
+Spectra CN
~~~~~~~~~~
Shows K-mer duplication levels, which correspond to copy number variation within
@@ -254,7 +356,7 @@ Applications:
-Spectra_mx
+Spectra MX
~~~~~~~~~~
Produces K-mer spectras from rows or columns in a matrix generated by ``kat comp``.
diff --git a/lib/include/kat/comp_counters.hpp b/lib/include/kat/comp_counters.hpp
index e482a88a..5ce341d4 100644
--- a/lib/include/kat/comp_counters.hpp
+++ b/lib/include/kat/comp_counters.hpp
@@ -57,6 +57,8 @@ class CompCounters {
path hash3_path;
CompCounters();
+
+ CompCounters(const size_t _dm_size);
CompCounters(const path& _hash1_path, const path& _hash2_path, const path& _hash3_path, const size_t _dm_size);
@@ -92,6 +94,8 @@ class ThreadedCompCounters {
public:
ThreadedCompCounters();
+
+ ThreadedCompCounters(const size_t _dm_size);
ThreadedCompCounters(const path& _hash1_path, const path& _hash2_path, const path& _hash3_path, const size_t _dm_size);
diff --git a/lib/include/kat/sparse_matrix.hpp b/lib/include/kat/sparse_matrix.hpp
index 7e9423c0..fed458a9 100644
--- a/lib/include/kat/sparse_matrix.hpp
+++ b/lib/include/kat/sparse_matrix.hpp
@@ -323,7 +323,7 @@ class ThreadedSparseMatrix {
}
const SM64& mergeThreadedMatricies() {
- // Merge matrix
+ // Merge matrix
for (int i = 0; i < width; i++) {
for (int j = 0; j < height; j++) {
for (int k = 0; k < threads; k++) {
diff --git a/lib/include/kat/str_utils.hpp b/lib/include/kat/str_utils.hpp
index ad003371..f2a4bcb5 100644
--- a/lib/include/kat/str_utils.hpp
+++ b/lib/include/kat/str_utils.hpp
@@ -184,9 +184,13 @@ namespace kat {
for (const auto& c : merstr) {
switch (c) {
case 'A':
+ case 'a':
case 'T':
+ case 't':
case 'G':
+ case 'g':
case 'C':
+ case 'c':
break;
default:
return false;
diff --git a/lib/src/comp_counters.cc b/lib/src/comp_counters.cc
index a219fb5d..37ab8ffa 100644
--- a/lib/src/comp_counters.cc
+++ b/lib/src/comp_counters.cc
@@ -39,6 +39,8 @@ using kat::DistanceMetric;
kat::CompCounters::CompCounters() : CompCounters("", "", "", DEFAULT_NB_BINS) {}
+kat::CompCounters::CompCounters(const size_t _dm_size) : CompCounters("", "", "", _dm_size) {}
+
kat::CompCounters::CompCounters(const path& _hash1_path, const path& _hash2_path, const path& _hash3_path, const size_t _dm_size) :
hash1_path(_hash1_path), hash2_path(_hash2_path), hash3_path(_hash3_path) {
@@ -208,6 +210,8 @@ void kat::CompCounters::printCounts(ostream &out) {
kat::ThreadedCompCounters::ThreadedCompCounters() : ThreadedCompCounters("", "", "", DEFAULT_NB_BINS) {}
+kat::ThreadedCompCounters::ThreadedCompCounters(const size_t _dm_size) : ThreadedCompCounters("", "", "", _dm_size) {}
+
kat::ThreadedCompCounters::ThreadedCompCounters(const path& _hash1_path, const path& _hash2_path, const path& _hash3_path, const size_t _dm_size) {
final_matrix = CompCounters(_hash1_path, _hash2_path, _hash3_path, _dm_size);
}
diff --git a/src/comp.cc b/src/comp.cc
index 8664fe6f..41813776 100644
--- a/src/comp.cc
+++ b/src/comp.cc
@@ -253,7 +253,6 @@ void kat::Comp::merge() {
cout << "Merging results ...";
cout.flush();
-
// Merge results from the threads
main_matrix.mergeThreadedMatricies();
if (doThirdHash()) {
@@ -390,7 +389,7 @@ void kat::Comp::compare() {
void kat::Comp::compareSlice(int th_id) {
- shared_ptr cc = make_shared();
+ shared_ptr cc = make_shared(std::min(this->d1Bins, this->d2Bins));
// Setup iterator for this thread's chunk of hash1
LargeHashArray::eager_iterator hash1Iterator = input[0].hash->eager_slice(th_id, threads);
diff --git a/src/kat.cc b/src/kat.cc
index 74f7c2dc..a1580739 100644
--- a/src/kat.cc
+++ b/src/kat.cc
@@ -19,6 +19,8 @@
#include
#endif
+#include
+#include
#include
#include
#include
@@ -114,6 +116,21 @@ const string helpMessage() {
}
+void handler(int sig) {
+ void *array[10];
+ size_t size;
+
+ // get void*'s for all entries on the stack
+ size = backtrace(array, 10);
+
+ // print out all the frames to stderr
+ fprintf(stderr, "Error: signal %d:\n", sig);
+ fprintf(stderr, "Stack trace:\n");
+ backtrace_symbols_fd(array, size, STDERR_FILENO);
+ exit(1);
+}
+
+
/**
* Start point for KAT. Processes the start of the command line and then
@@ -121,6 +138,7 @@ const string helpMessage() {
*/
int main(int argc, char *argv[])
{
+ signal(SIGSEGV, handler);
try {
// KAT args
string modeStr;
@@ -224,11 +242,11 @@ int main(int argc, char *argv[])
}
} catch(po::error& e) {
- cerr << "Error: Parsing Command Line: " << e.what() << endl;
- return 1;
+ cerr << "Error: Parsing Command Line: " << e.what() << endl;
+ return 1;
}
catch (boost::exception &e) {
- cerr << boost::diagnostic_information(e);
+ cerr << boost::diagnostic_information(e);
return 4;
} catch (exception& e) {
cerr << "Error: " << e.what() << endl;
diff --git a/src/sect.cc b/src/sect.cc
index 292f5ad7..6f2b0636 100644
--- a/src/sect.cc
+++ b/src/sect.cc
@@ -186,7 +186,7 @@ void kat::Sect::processSeqFile() {
// Average sequence coverage and GC% scores output stream
ofstream cvg_gc_stream(string(outputPrefix.string() + "-stats.tsv").c_str());
- cvg_gc_stream << "seq_name\tmedian\tmean\tgc%\tseq_length\tinvalid_bases\t%_invalid\tnon_zero_bases\t%_non_zero\t%_non_zero_corrected" << endl;
+ cvg_gc_stream << "seq_name\tmedian\tmean\tgc%\tseq_length\tkmers_in_seq\tinvalid_kmers\t%_invalid\tnon_zero_kmers\t%_non_zero\t%_non_zero_corrected" << endl;
// Processes sequences in batches of records to reduce memory requirements
while (!seqan::atEnd(reader)) {
@@ -425,7 +425,8 @@ void kat::Sect::printStatTable(std::ostream &out) {
<< (*medians)[i] << "\t"
<< (*means)[i] << "\t"
<< (*gcs)[i] << "\t"
- << (*lengths)[i] << "\t"
+ << (*lengths)[i] << "\t"
+ << (*lengths)[i] - this->input.merLen + 1 << "\t"
<< (*invalid)[i] << "\t"
<< (*percentInvalid)[i] << "\t"
<< (*nonZero)[i] << "\t"