Skip to content

Commit

Permalink
Updated handling of region strings to be more forgiving/uniform
Browse files Browse the repository at this point in the history
  • Loading branch information
jeffreybarrick committed Jun 8, 2024
1 parent 3b9d870 commit 3cb6355
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 79 deletions.
78 changes: 78 additions & 0 deletions src/c/breseq/libbreseq/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -1424,6 +1424,84 @@ template<uint32_t nth_place> double roundp(double value) {
return floor(value * nth_place + 0.5f) / nth_place;
}

//! Most generic form of these functions used by both cReferenceSequence and pileup_base
//!
//! Handles complete parsing for seq_id:start.insert_start-end.insert_end form.
//!
//! This function also deals with some problem cases:
//! * Removes commas from start and end (but not seq_id)
//! * If there is only a start and no end, sets end to start
//! * Substitutes different types of dashes to hyphens

inline void parse_region(const string& region, string& target_name, uint32_t& start_pos_1, uint32_t& end_pos_1, uint32_t& insert_start, uint32_t& insert_end)
{
insert_start = 0;
insert_end = 0;

size_t start = 0;
size_t end = 0;

string start_pos_1_string;
string insert_start_string("0");
string end_pos_1_string;
string insert_end_string("0");

// Split on colon
vector<string> colon_split_list = split(region, ":");
ASSERT(colon_split_list.size() == 2, "Expected exactly one colon in region string:" + region);

// Save region
target_name = colon_split_list[0];

// Fix hyphen variants and remove commas
colon_split_list[1] = substitute(colon_split_list[1], "", "-");
colon_split_list[1] = substitute(colon_split_list[1], ",", "");

// Split on hyphen
vector<string> start_end_split_list = split(colon_split_list[1], "-");
ASSERT(start_end_split_list.size() <= 2, "Expected no more than one hyphen in start-end portion of region string:" + region);

// We always have a start
{
// Split start on periods
vector<string> start_split_list = split(start_end_split_list[0], ".");
ASSERT(start_split_list.size() <= 2, "Expected no more than one period in start of region string:" + region);

// Save start and insert start
start_pos_1_string = start_split_list[0];
if (start_split_list.size() == 2) insert_start_string = start_split_list[1];
}

// We may or may not have an end...
if (start_end_split_list.size()==1) {
// No end, copy start!
end_pos_1_string = start_pos_1_string;
insert_end_string = insert_start_string;
} else {
// Split end on periods
vector<string> end_split_list = split(start_end_split_list[1], ".");
ASSERT(end_split_list.size() <= 2, "Expected no more than one period in end of region string:" + region);

// Save end and insert end
end_pos_1_string = end_split_list[0];
if (end_split_list.size() == 2) insert_end_string = end_split_list[1];
}

start_pos_1 = from_string<uint32_t>(start_pos_1_string);
end_pos_1 = from_string<uint32_t>(end_pos_1_string);

insert_start = from_string<uint32_t>(insert_start_string);
insert_end = from_string<uint32_t>(insert_end_string);
}

//! Short form for when there is no extended insert_start or insert_end
inline void parse_region(const string& region, string& target_name, uint32_t& start_pos_1, uint32_t& end_pos_1)
{
uint32_t temp_insert_start, temp_insert_end;
parse_region(region, target_name, start_pos_1, end_pos_1, temp_insert_start, temp_insert_end);
}


} // breseq

#endif
58 changes: 3 additions & 55 deletions src/c/breseq/libbreseq/pileup_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,68 +161,16 @@ class pileup_base {
}


//! Special parsing for seq_id:start.insert_start-end.insert_end form.
void parse_region(const string& region, uint32_t& target_id, uint32_t& start_pos_1, uint32_t& end_pos_1, uint32_t& insert_start, uint32_t& insert_end)
{
insert_start = 0;
insert_end = 0;

size_t start = 0;
size_t end = 0;

end = region.find_first_of(':', start);
assert(end != string::npos);
string target_name = region.substr(start, end - start);
start = end+1;

string start_pos_1_string;
string insert_start_string("0");
end = region.find_first_of('.', start);
if (end == string::npos) {
end = region.find_first_of('-', start);
assert(end != string::npos);
start_pos_1_string = region.substr(start, end - start);
start = end+1;
}
else
{
start_pos_1_string = region.substr(start, end - start);
start = end+1;
end = region.find_first_of('-', start);
assert(end != string::npos);
insert_start_string = region.substr(start, end - start);
start = end+1;
}

string end_pos_1_string;
string insert_end_string("0");
end = region.find_first_of('.', start);
if (end == string::npos) {
end = string::npos;
end_pos_1_string = region.substr(start, end - start);
start = end+1;
}
else
{
end_pos_1_string = region.substr(start, end - start);
start = end+1;
end = string::npos;
insert_end_string = region.substr(start, end - start);
start = end+1;
}
string target_name;
breseq::parse_region(region, target_name, start_pos_1, end_pos_1, insert_start, insert_end);

// Save target id
// Convert values to target ids and integers
int32_t temp_target_id = seq_id_to_target_id(target_name);
// Target was not found.
ASSERT(temp_target_id != -1, "Target seq id was not found for region [" + target_name + "] using FASTA file [" + m_fasta_file_name + "].\n" + "Valid seq ids: " + join(this->valid_seq_ids(), ", ") );
target_id = static_cast<uint32_t>(temp_target_id);

//
start_pos_1 = from_string<uint32_t>(start_pos_1_string);
end_pos_1 = from_string<uint32_t>(end_pos_1_string);

insert_start = from_string<uint32_t>(insert_start_string);
insert_end = from_string<uint32_t>(insert_end_string);
}

void parse_region(const string& region, uint32_t& target_id, uint32_t& start_pos_1, uint32_t& end_pos_1)
Expand Down
32 changes: 8 additions & 24 deletions src/c/breseq/libbreseq/reference_sequence.h
Original file line number Diff line number Diff line change
Expand Up @@ -1126,34 +1126,18 @@ class cFeatureLocationList: public list<cFeatureLocation> {
// This is a duplicate of a function in pileup.h for when
// we don't have a BAM available.
void parse_region(const string& region, uint32_t& target_id, uint32_t& start_pos_1, uint32_t& end_pos_1)
{
vector<string> split_region = split(region, ":");
// Must check first split for second to not potentially crash
ASSERT(split_region.size() == 2, "Unrecognized region: " + region + "\nExpected format [seq_id:start-end]");
vector<string> split_positions = split(split_region[1], "-");
ASSERT(split_positions.size() == 2, "Unrecognized region: " + region + "\nExpected format [seq_id:start-end]");

target_id = seq_id_to_index(split_region[0]);
start_pos_1 = from_string<uint32_t>(split_positions[0]);
end_pos_1 = from_string<uint32_t>(split_positions[1]);
{
string target_name;
parse_region(region, target_name, start_pos_1, end_pos_1);
target_id = seq_id_to_index(target_name);
}

// This parses to a string for target
static void parse_region(const string& region, string& seq_id, uint32_t& start_pos_1, uint32_t& end_pos_1)

static void parse_region(const string& region, string& target_name, uint32_t& start_pos_1, uint32_t& end_pos_1)
{
vector<string> split_region = split(region, ":");
// Must check first split for second to not potentially crash
ASSERT(split_region.size() == 2, "Unrecognized region: " + region + "\nExpected format [seq_id:start-end]");
vector<string> split_positions = split(split_region[1], "-");
ASSERT(split_positions.size() == 2, "Unrecognized region: " + region + "\nExpected format [seq_id:start-end]");

seq_id = split_region[0];
start_pos_1 = from_string<uint32_t>(split_positions[0]);
end_pos_1 = from_string<uint32_t>(split_positions[1]);
breseq::parse_region(region, target_name, start_pos_1, end_pos_1);
}

// normalize a region description
// * remove commas from numbers
// * flip start and end so start is smaller (returning bool whether this was done)
static bool normalize_region(string& region) {
region = substitute(region, ",", "");
Expand All @@ -1162,7 +1146,7 @@ class cFeatureLocationList: public list<cFeatureLocation> {
uint32_t start_pos_1;
uint32_t end_pos_1;

parse_region(region, seq_id, start_pos_1, end_pos_1);
cReferenceSequences::parse_region(region, seq_id, start_pos_1, end_pos_1);
bool reverse = false;

if (start_pos_1>end_pos_1) {
Expand Down

0 comments on commit 3cb6355

Please sign in to comment.