From 3cb63555c4399916a1806c3487c56239b27876e3 Mon Sep 17 00:00:00 2001 From: Jeffrey Barrick Date: Fri, 7 Jun 2024 22:57:15 -0500 Subject: [PATCH] Updated handling of region strings to be more forgiving/uniform --- src/c/breseq/libbreseq/common.h | 78 +++++++++++++++++++++ src/c/breseq/libbreseq/pileup_base.h | 58 +-------------- src/c/breseq/libbreseq/reference_sequence.h | 32 +++------ 3 files changed, 89 insertions(+), 79 deletions(-) diff --git a/src/c/breseq/libbreseq/common.h b/src/c/breseq/libbreseq/common.h index 66a43dc2..36d6dcdb 100644 --- a/src/c/breseq/libbreseq/common.h +++ b/src/c/breseq/libbreseq/common.h @@ -1424,6 +1424,84 @@ template double roundp(double value) { return floor(value * nth_place + 0.5f) / nth_place; } +//! Most generic form of these functions used by both cReferenceSequence and pileup_base +//! +//! Handles complete parsing for seq_id:start.insert_start-end.insert_end form. +//! +//! This function also deals with some problem cases: +//! * Removes commas from start and end (but not seq_id) +//! * If there is only a start and no end, sets end to start +//! * Substitutes different types of dashes to hyphens + +inline void parse_region(const string& region, string& target_name, uint32_t& start_pos_1, uint32_t& end_pos_1, uint32_t& insert_start, uint32_t& insert_end) +{ + insert_start = 0; + insert_end = 0; + + size_t start = 0; + size_t end = 0; + + string start_pos_1_string; + string insert_start_string("0"); + string end_pos_1_string; + string insert_end_string("0"); + + // Split on colon + vector colon_split_list = split(region, ":"); + ASSERT(colon_split_list.size() == 2, "Expected exactly one colon in region string:" + region); + + // Save region + target_name = colon_split_list[0]; + + // Fix hyphen variants and remove commas + colon_split_list[1] = substitute(colon_split_list[1], "–", "-"); + colon_split_list[1] = substitute(colon_split_list[1], ",", ""); + + // Split on hyphen + vector start_end_split_list = split(colon_split_list[1], "-"); + ASSERT(start_end_split_list.size() <= 2, "Expected no more than one hyphen in start-end portion of region string:" + region); + + // We always have a start + { + // Split start on periods + vector start_split_list = split(start_end_split_list[0], "."); + ASSERT(start_split_list.size() <= 2, "Expected no more than one period in start of region string:" + region); + + // Save start and insert start + start_pos_1_string = start_split_list[0]; + if (start_split_list.size() == 2) insert_start_string = start_split_list[1]; + } + + // We may or may not have an end... + if (start_end_split_list.size()==1) { + // No end, copy start! + end_pos_1_string = start_pos_1_string; + insert_end_string = insert_start_string; + } else { + // Split end on periods + vector end_split_list = split(start_end_split_list[1], "."); + ASSERT(end_split_list.size() <= 2, "Expected no more than one period in end of region string:" + region); + + // Save end and insert end + end_pos_1_string = end_split_list[0]; + if (end_split_list.size() == 2) insert_end_string = end_split_list[1]; + } + + start_pos_1 = from_string(start_pos_1_string); + end_pos_1 = from_string(end_pos_1_string); + + insert_start = from_string(insert_start_string); + insert_end = from_string(insert_end_string); +} + +//! Short form for when there is no extended insert_start or insert_end +inline void parse_region(const string& region, string& target_name, uint32_t& start_pos_1, uint32_t& end_pos_1) +{ + uint32_t temp_insert_start, temp_insert_end; + parse_region(region, target_name, start_pos_1, end_pos_1, temp_insert_start, temp_insert_end); +} + + } // breseq #endif diff --git a/src/c/breseq/libbreseq/pileup_base.h b/src/c/breseq/libbreseq/pileup_base.h index 76dcc801..c9b35077 100644 --- a/src/c/breseq/libbreseq/pileup_base.h +++ b/src/c/breseq/libbreseq/pileup_base.h @@ -161,68 +161,16 @@ class pileup_base { } - //! Special parsing for seq_id:start.insert_start-end.insert_end form. void parse_region(const string& region, uint32_t& target_id, uint32_t& start_pos_1, uint32_t& end_pos_1, uint32_t& insert_start, uint32_t& insert_end) { - insert_start = 0; - insert_end = 0; - - size_t start = 0; - size_t end = 0; - - end = region.find_first_of(':', start); - assert(end != string::npos); - string target_name = region.substr(start, end - start); - start = end+1; - - string start_pos_1_string; - string insert_start_string("0"); - end = region.find_first_of('.', start); - if (end == string::npos) { - end = region.find_first_of('-', start); - assert(end != string::npos); - start_pos_1_string = region.substr(start, end - start); - start = end+1; - } - else - { - start_pos_1_string = region.substr(start, end - start); - start = end+1; - end = region.find_first_of('-', start); - assert(end != string::npos); - insert_start_string = region.substr(start, end - start); - start = end+1; - } - - string end_pos_1_string; - string insert_end_string("0"); - end = region.find_first_of('.', start); - if (end == string::npos) { - end = string::npos; - end_pos_1_string = region.substr(start, end - start); - start = end+1; - } - else - { - end_pos_1_string = region.substr(start, end - start); - start = end+1; - end = string::npos; - insert_end_string = region.substr(start, end - start); - start = end+1; - } + string target_name; + breseq::parse_region(region, target_name, start_pos_1, end_pos_1, insert_start, insert_end); - // Save target id + // Convert values to target ids and integers int32_t temp_target_id = seq_id_to_target_id(target_name); // Target was not found. ASSERT(temp_target_id != -1, "Target seq id was not found for region [" + target_name + "] using FASTA file [" + m_fasta_file_name + "].\n" + "Valid seq ids: " + join(this->valid_seq_ids(), ", ") ); target_id = static_cast(temp_target_id); - - // - start_pos_1 = from_string(start_pos_1_string); - end_pos_1 = from_string(end_pos_1_string); - - insert_start = from_string(insert_start_string); - insert_end = from_string(insert_end_string); } void parse_region(const string& region, uint32_t& target_id, uint32_t& start_pos_1, uint32_t& end_pos_1) diff --git a/src/c/breseq/libbreseq/reference_sequence.h b/src/c/breseq/libbreseq/reference_sequence.h index 1dedb980..a9e0eca0 100644 --- a/src/c/breseq/libbreseq/reference_sequence.h +++ b/src/c/breseq/libbreseq/reference_sequence.h @@ -1126,34 +1126,18 @@ class cFeatureLocationList: public list { // This is a duplicate of a function in pileup.h for when // we don't have a BAM available. void parse_region(const string& region, uint32_t& target_id, uint32_t& start_pos_1, uint32_t& end_pos_1) - { - vector split_region = split(region, ":"); - // Must check first split for second to not potentially crash - ASSERT(split_region.size() == 2, "Unrecognized region: " + region + "\nExpected format [seq_id:start-end]"); - vector split_positions = split(split_region[1], "-"); - ASSERT(split_positions.size() == 2, "Unrecognized region: " + region + "\nExpected format [seq_id:start-end]"); - - target_id = seq_id_to_index(split_region[0]); - start_pos_1 = from_string(split_positions[0]); - end_pos_1 = from_string(split_positions[1]); + { + string target_name; + parse_region(region, target_name, start_pos_1, end_pos_1); + target_id = seq_id_to_index(target_name); } - - // This parses to a string for target - static void parse_region(const string& region, string& seq_id, uint32_t& start_pos_1, uint32_t& end_pos_1) + + static void parse_region(const string& region, string& target_name, uint32_t& start_pos_1, uint32_t& end_pos_1) { - vector split_region = split(region, ":"); - // Must check first split for second to not potentially crash - ASSERT(split_region.size() == 2, "Unrecognized region: " + region + "\nExpected format [seq_id:start-end]"); - vector split_positions = split(split_region[1], "-"); - ASSERT(split_positions.size() == 2, "Unrecognized region: " + region + "\nExpected format [seq_id:start-end]"); - - seq_id = split_region[0]; - start_pos_1 = from_string(split_positions[0]); - end_pos_1 = from_string(split_positions[1]); + breseq::parse_region(region, target_name, start_pos_1, end_pos_1); } // normalize a region description - // * remove commas from numbers // * flip start and end so start is smaller (returning bool whether this was done) static bool normalize_region(string& region) { region = substitute(region, ",", ""); @@ -1162,7 +1146,7 @@ class cFeatureLocationList: public list { uint32_t start_pos_1; uint32_t end_pos_1; - parse_region(region, seq_id, start_pos_1, end_pos_1); + cReferenceSequences::parse_region(region, seq_id, start_pos_1, end_pos_1); bool reverse = false; if (start_pos_1>end_pos_1) {