diff --git a/README.md b/README.md index a779e2b..324d3f8 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,36 @@ -# B.I.O -- The Biological Input-Output library +# B.I.O. – the Biological Input/Output library +B.I.O. is a C++ library for reading and writing files in the field of Bioinformatics and in particular sequence +analysis. It provides easy-to-use interfaces for the following formats: + * Plain I/O: plain-text, CSV, TSV, … + * Map I/O: SAM, BAM, … + * Seq I/O: FastA, FastQ, … + * Var I/O: VCF, BCF, … + +The primary goal of this library is to offer higher level abstractions than the C libraries typically used in this +domain (e.g. htslib) while at the same time offering excellent performance. +It hopes to offer a modern, well-integrated design that covers most typical I/O use-cases Bioinformaticians encounter. + +The library relies strongly on *Modern C++* and plays well with other Modern C++ libraries. + +Please see the [online documentation](TODO) for more details. + +## Current state + +The library is currently under heavy development. There is no release yet, and all interfaces are subject to change. + +## Dependencies + +| | requirement | version | comment | +|-------------------|-------------------------------------------|----------|---------------------------------------------| +|**compiler** | [GCC](https://gcc.gnu.org) | ≥ 10 | no other compiler is currently supported! | +|**required libs** | [SeqAn3](https://github.com/seqan/seqan3) | ≥ 3 | | +|**optional libs** | [zlib](https://github.com/madler/zlib) | ≥ 1.2 | required for `*.gz` and `.bam` file support | +| | [bzip2](https://www.sourceware.org/bzip2) | ≥ 1.0 | required for `*.bz2` file support | + +## Usage + +* Using the library entails no build steps; it is header-only and can be used as-is. +* A single-header version is available (TODO). +* CMake files are provided for easy integration into applications (and automatic detection/inclusion of dependencies). 
diff --git a/include/bio/format/bcf_input_handler.hpp b/include/bio/format/bcf_input_handler.hpp index b0cd4b7..757d4fa 100644 --- a/include/bio/format/bcf_input_handler.hpp +++ b/include/bio/format/bcf_input_handler.hpp @@ -34,7 +34,7 @@ #include #include #include -#include + #include #include #include //TODO for field_types_raw; move somewhere else @@ -587,8 +587,8 @@ class format_input_handler : public format_input_handler_base - static inline void dynamic_type_init_single(std::byte const *& cache_ptr, auto & output) + template + static inline void info_element_value_type_init_single(std::byte const *& cache_ptr, auto & output) { constexpr size_t id = static_cast(id_); auto & output_ = output.template emplace(); @@ -597,8 +597,10 @@ class format_input_handler : public format_input_handler_base - static inline void dynamic_type_init_string(size_t const size, std::byte const *& cache_ptr, auto & output) + template + static inline void info_element_value_type_init_string(size_t const size, + std::byte const *& cache_ptr, + auto & output) { constexpr size_t id = static_cast(id_); auto & output_ = output.template emplace(); @@ -610,8 +612,10 @@ class format_input_handler : public format_input_handler_base - static inline void dynamic_type_init_vector(size_t const size, std::byte const *& cache_ptr, auto & output) + template + static inline void info_element_value_type_init_vector(size_t const size, + std::byte const *& cache_ptr, + auto & output) { constexpr size_t id = static_cast(id_); auto & output_ = output.template emplace(); @@ -623,10 +627,10 @@ class format_input_handler : public format_input_handler_base - static inline void dynamic_type_init_vector_of_string(size_t const size, - std::byte const *& cache_ptr, - auto & output) + template + static inline void info_element_value_type_init_vector_of_string(size_t const size, + std::byte const *& cache_ptr, + auto & output) { constexpr size_t id = static_cast(id_); auto & output_ = output.template emplace(); @@ 
-642,11 +646,11 @@ class format_input_handler : public format_input_handler_base - static inline void dynamic_type_init_vector_of_vector(size_t const outer_size, - size_t const inner_size, - std::byte const *& cache_ptr, - auto & output) + template + static inline void info_element_value_type_init_vector_of_vector(size_t const outer_size, + size_t const inner_size, + std::byte const *& cache_ptr, + auto & output) { constexpr size_t id = static_cast(id_); auto & output_ = output.template emplace(); @@ -666,20 +670,20 @@ class format_input_handler : public format_input_handler_base - void parse_dynamic_type(var_io::dynamic_type_id const id_from_header, - detail::bcf_type_descriptor const desc, - size_t const size, - std::byte const *& cache_ptr, - dyn_t & output); // implementation below class - - template - void parse_dynamic_type(var_io::dynamic_type_id const id_from_header, - detail::bcf_type_descriptor const desc, - size_t const outer_size, - size_t const inner_size, - std::byte const *& cache_ptr, - dyn_t & output); // implementation below class + template + void parse_info_element_value_type(var_io::value_type_id const id_from_header, + detail::bcf_type_descriptor const desc, + size_t const size, + std::byte const *& cache_ptr, + dyn_t & output); // implementation below class + + template + void parse_info_element_value_type(var_io::value_type_id const id_from_header, + detail::bcf_type_descriptor const desc, + size_t const outer_size, + size_t const inner_size, + std::byte const *& cache_ptr, + dyn_t & output); // implementation below class //!\} /*!\name Parsed record handling @@ -790,11 +794,11 @@ class format_input_handler : public format_input_handler_base) detail::string_copy(header.infos[header.idx_to_info_pos().at(idx)].id, id); @@ -846,7 +850,7 @@ class format_input_handler : public format_input_handler_base raw_field = get(raw_record); @@ -878,12 +882,12 @@ class format_input_handler : public format_input_handler_basen_sample, - fmt_size, - cache_ptr, 
- parsed_variant); + parse_info_element_value_type(var_io::value_type_id::vector_of_int32, + fmt_type, + record_core->n_sample, + fmt_size, + cache_ptr, + parsed_variant); /* we transform number to string and store in caches */ std::visit( @@ -912,7 +916,7 @@ class format_input_handler : public format_input_handler_base(var_io::dynamic_type_id::string); + constexpr size_t string_id = static_cast(var_io::value_type_id::string); auto & strings = parsed_variant.template emplace(); strings.resize(record_core->n_sample); @@ -926,66 +930,26 @@ class format_input_handler : public format_input_handler_basen_sample, fmt_size, cache_ptr, parsed_variant); + parse_info_element_value_type(format.type, + fmt_type, + record_core->n_sample, + fmt_size, + cache_ptr, + parsed_variant); } } assert(cache_ptr == raw_field.data() + raw_field.size()); } - //!\brief Reading of the GENOTYPES field (BCF-style). + //!\brief Reading of the GENOTYPES field. template - requires detail::genotype_bcf_style_reader_concept> + requires detail::genotype_reader_concept> void parse_field(vtag_t const & /**/, field_t & parsed_field) { parse_genotypes_impl(parsed_field); } - //!\brief Implementation for parsing into vcf-style genotypes. 
- void parse_genotypes_impl_vcf_style(auto & genotypes_cache, auto & parsed_field) - { - genotypes_cache.clear(); - parse_genotypes_impl(genotypes_cache); // we parse into BCF-style caches and convert - - auto & [parsed_formats, parsed_samples] = parsed_field; - - /* formats */ - for (auto & [idx, dyn_type] : genotypes_cache) - { - var_io::header::format_t & format = header.formats[header.idx_to_format_pos().at(idx)]; - - parsed_formats.push_back({}); - detail::string_copy(format.id, parsed_formats.back()); - } - - /* samples */ - parsed_samples.resize(record_core->n_sample); - for (size_t s = 0; s < record_core->n_sample; ++s) - { - parsed_samples[s].resize(parsed_formats.size()); - for (size_t f = 0; f < parsed_formats.size(); ++f) - { - auto & output = parsed_samples[s][f]; - auto visitor = [s, &output](auto & in) { output = std::move(in[s]); }; - - std::visit(visitor, detail::get_second(genotypes_cache[f])); - } - } - } - - //!\brief Reading of the GENOTYPES field (VCF-style). - void parse_field(vtag_t const & /**/, - detail::genotypes_vcf_style_reader_concept auto & parsed_field) - { - using parsed_field_t = decltype(parsed_field); - using dyn_t = std::ranges::range_value_t>>; - - if constexpr (std::same_as, dyn_t>) - parse_genotypes_impl_vcf_style(shallow_genotypes_cache, parsed_field); - else - parse_genotypes_impl_vcf_style(deep_genotypes_cache, parsed_field); - } - //!\brief Overload for parsing the private data. 
void parse_field(vtag_t const & /**/, var_io::record_private_data & parsed_field) { @@ -1028,7 +992,7 @@ class format_input_handler : public format_input_handler_base : public format_input_handler_base -inline void format_input_handler::parse_dynamic_type(var_io::dynamic_type_id const id_from_header, - detail::bcf_type_descriptor const desc, - size_t const size, - std::byte const *& cache_ptr, - dyn_t & output) +template +inline void format_input_handler::parse_info_element_value_type(var_io::value_type_id const id_from_header, + detail::bcf_type_descriptor const desc, + size_t const size, + std::byte const *& cache_ptr, + dyn_t & output) { // TODO DRY out the boilerplate error messages - if (static_cast(id_from_header) < static_cast(var_io::dynamic_type_id::string) && size != 1) + if (static_cast(id_from_header) < static_cast(var_io::value_type_id::string) && size != 1) error("BCF data field expected exactly one element, got:", size); switch (id_from_header) { - case var_io::dynamic_type_id::char8: + case var_io::value_type_id::char8: { if (desc != detail::bcf_type_descriptor::char8) error("Attempting to create char but the byte descriptor does not indicate char type."); - dynamic_type_init_single(cache_ptr, output); + info_element_value_type_init_single(cache_ptr, output); return; } - case var_io::dynamic_type_id::int8: - case var_io::dynamic_type_id::int16: - case var_io::dynamic_type_id::int32: + case var_io::value_type_id::int8: + case var_io::value_type_id::int16: + case var_io::value_type_id::int32: { switch (desc) { case detail::bcf_type_descriptor::int8: - dynamic_type_init_single(cache_ptr, output); + info_element_value_type_init_single(cache_ptr, output); break; case detail::bcf_type_descriptor::int16: - dynamic_type_init_single(cache_ptr, output); + info_element_value_type_init_single(cache_ptr, output); break; case detail::bcf_type_descriptor::int32: - dynamic_type_init_single(cache_ptr, output); + info_element_value_type_init_single(cache_ptr, 
output); break; default: error("Attempting to create int but the byte descriptor does not indicate int type."); } return; } - case var_io::dynamic_type_id::float32: + case var_io::value_type_id::float32: { if (desc != detail::bcf_type_descriptor::float32) error("Attempting to create float but the byte descriptor does not indicate float type."); - dynamic_type_init_single(cache_ptr, output); + info_element_value_type_init_single(cache_ptr, output); return; } - case var_io::dynamic_type_id::string: + case var_io::value_type_id::string: { if (desc != detail::bcf_type_descriptor::char8) error("Attempting to creates string but the byte descriptor does not indicate string type."); - dynamic_type_init_string(size, cache_ptr, output); + info_element_value_type_init_string(size, cache_ptr, output); return; } - case var_io::dynamic_type_id::vector_of_char8: + case var_io::value_type_id::vector_of_char8: { if (desc != detail::bcf_type_descriptor::char8) error("Attempting to create vector of char but the byte descriptor does not indicate char type."); - dynamic_type_init_vector(size, cache_ptr, output); + info_element_value_type_init_vector(size, + cache_ptr, + output); return; } - case var_io::dynamic_type_id::vector_of_int8: - case var_io::dynamic_type_id::vector_of_int16: - case var_io::dynamic_type_id::vector_of_int32: + case var_io::value_type_id::vector_of_int8: + case var_io::value_type_id::vector_of_int16: + case var_io::value_type_id::vector_of_int32: { switch (desc) { case detail::bcf_type_descriptor::int8: { - dynamic_type_init_vector(size, - cache_ptr, - output); + info_element_value_type_init_vector( + size, + cache_ptr, + output); break; } case detail::bcf_type_descriptor::int16: { - dynamic_type_init_vector(size, - cache_ptr, - output); + info_element_value_type_init_vector( + size, + cache_ptr, + output); break; } case detail::bcf_type_descriptor::int32: { - dynamic_type_init_vector(size, - cache_ptr, - output); + info_element_value_type_init_vector( + size, + 
cache_ptr, + output); break; } default: @@ -1148,26 +1117,30 @@ inline void format_input_handler::parse_dynamic_type(var_io::dynamic_type_i } return; } - case var_io::dynamic_type_id::vector_of_float32: + case var_io::value_type_id::vector_of_float32: { if (desc != detail::bcf_type_descriptor::float32) error("Attempting to create vector of float but the byte descriptor does not indicate float type."); - dynamic_type_init_vector(size, cache_ptr, output); + info_element_value_type_init_vector(size, + cache_ptr, + output); return; } - case var_io::dynamic_type_id::vector_of_string: + case var_io::value_type_id::vector_of_string: { if (desc != detail::bcf_type_descriptor::char8) error( "Attempting to create vector of string but the byte descriptor does not indicate char alphabet"); - dynamic_type_init_vector_of_string(size, cache_ptr, output); + info_element_value_type_init_vector_of_string(size, + cache_ptr, + output); return; } - case var_io::dynamic_type_id::flag: + case var_io::value_type_id::flag: { - constexpr size_t id = static_cast(var_io::dynamic_type_id::flag); + constexpr size_t id = static_cast(var_io::value_type_id::flag); output.template emplace(true); cache_ptr += size; // This should be 0, but is allowed to be something else @@ -1177,61 +1150,61 @@ inline void format_input_handler::parse_dynamic_type(var_io::dynamic_type_i } /*!\brief Parse a "dynamically typed" field out of a BCF stream and store the content in a vector-variant. - * \tparam dyn_t Type of the variant; specialisation of seqan3::var_io::dynamic_type. - * \param[in] id_from_header A value of seqan3::var_io::dynamic_type_id that denotes the expected type. + * \tparam dyn_t Type of the variant; specialisation of seqan3::var_io::info_element_value_type. + * \param[in] id_from_header A value of seqan3::var_io::value_type_id that denotes the expected type. * \param[in] desc A value of seqan3::detail::bcf_type_descriptor that notes the detected type. 
* \param[in] outer_size The number of values belonging to this field. * \param[in] inner_size The number of values per inner vector in case of vector-of-vector. * \param[in,out] cache_ptr Pointer into the BCF stream; will be updated to point past the end of read data. * \param[out] output The variant to hold the parsed value. */ -template -inline void format_input_handler::parse_dynamic_type(var_io::dynamic_type_id const id_from_header, - detail::bcf_type_descriptor const desc, - size_t const outer_size, - size_t const inner_size, - std::byte const *& cache_ptr, - dyn_t & output) +template +inline void format_input_handler::parse_info_element_value_type(var_io::value_type_id const id_from_header, + detail::bcf_type_descriptor const desc, + size_t const outer_size, + size_t const inner_size, + std::byte const *& cache_ptr, + dyn_t & output) { // TODO DRY out the boilerplate error messages - if (static_cast(id_from_header) < static_cast(var_io::dynamic_type_id::string) && inner_size != 1) + if (static_cast(id_from_header) < static_cast(var_io::value_type_id::string) && inner_size != 1) error("BCF data field expected exactly one element, got:", inner_size); switch (id_from_header) { - case var_io::dynamic_type_id::char8: + case var_io::value_type_id::char8: { if (desc != detail::bcf_type_descriptor::char8) error("Attempting to create char but the byte descriptor does not indicate char type."); - dynamic_type_init_vector(outer_size, cache_ptr, output); + info_element_value_type_init_vector(outer_size, cache_ptr, output); return; } - case var_io::dynamic_type_id::int8: - case var_io::dynamic_type_id::int16: - case var_io::dynamic_type_id::int32: + case var_io::value_type_id::int8: + case var_io::value_type_id::int16: + case var_io::value_type_id::int32: { switch (desc) { case detail::bcf_type_descriptor::int8: { - dynamic_type_init_vector(outer_size, - cache_ptr, - output); + info_element_value_type_init_vector(outer_size, + cache_ptr, + output); break; } case 
detail::bcf_type_descriptor::int16: { - dynamic_type_init_vector(outer_size, - cache_ptr, - output); + info_element_value_type_init_vector(outer_size, + cache_ptr, + output); break; } case detail::bcf_type_descriptor::int32: { - dynamic_type_init_vector(outer_size, - cache_ptr, - output); + info_element_value_type_init_vector(outer_size, + cache_ptr, + output); break; } default: @@ -1239,65 +1212,70 @@ inline void format_input_handler::parse_dynamic_type(var_io::dynamic_type_i } return; } - case var_io::dynamic_type_id::float32: + case var_io::value_type_id::float32: { if (desc == detail::bcf_type_descriptor::float32) - dynamic_type_init_vector(outer_size, cache_ptr, output); + info_element_value_type_init_vector(outer_size, + cache_ptr, + output); else error("Attempting to create float but the byte descriptor does not indicate float type."); return; } - case var_io::dynamic_type_id::string: + case var_io::value_type_id::string: { if (desc != detail::bcf_type_descriptor::char8) error("Attempting to creates string but the byte descriptor does not indicate string type."); - // TODO double-check if we shouldn't actually call dynamic_type_init_vector_of_vector here instead - dynamic_type_init_vector_of_string(outer_size, cache_ptr, output); + // TODO double-check if we shouldn't actually call info_element_value_type_init_vector_of_vector here + // instead + info_element_value_type_init_vector_of_string(outer_size, + cache_ptr, + output); return; } - case var_io::dynamic_type_id::vector_of_char8: + case var_io::value_type_id::vector_of_char8: { if (desc != detail::bcf_type_descriptor::char8) error("Attempting to create vector of char but the byte descriptor does not indicate char type."); - dynamic_type_init_vector_of_vector(outer_size, - inner_size, - cache_ptr, - output); + info_element_value_type_init_vector_of_vector(outer_size, + inner_size, + cache_ptr, + output); return; } - case var_io::dynamic_type_id::vector_of_int8: - case 
var_io::dynamic_type_id::vector_of_int16: - case var_io::dynamic_type_id::vector_of_int32: + case var_io::value_type_id::vector_of_int8: + case var_io::value_type_id::vector_of_int16: + case var_io::value_type_id::vector_of_int32: { switch (desc) { case detail::bcf_type_descriptor::int8: { - dynamic_type_init_vector_of_vector( - outer_size, - inner_size, - cache_ptr, - output); + info_element_value_type_init_vector_of_vector(outer_size, + inner_size, + cache_ptr, + output); break; } case detail::bcf_type_descriptor::int16: { - dynamic_type_init_vector_of_vector( - outer_size, - inner_size, - cache_ptr, - output); + info_element_value_type_init_vector_of_vector(outer_size, + inner_size, + cache_ptr, + output); break; } case detail::bcf_type_descriptor::int32: { - dynamic_type_init_vector_of_vector( - outer_size, - inner_size, - cache_ptr, - output); + info_element_value_type_init_vector_of_vector(outer_size, + inner_size, + cache_ptr, + output); break; } default: @@ -1305,14 +1283,15 @@ inline void format_input_handler::parse_dynamic_type(var_io::dynamic_type_i } return; } - case var_io::dynamic_type_id::vector_of_float32: + case var_io::value_type_id::vector_of_float32: { if (desc == detail::bcf_type_descriptor::float32) { - dynamic_type_init_vector_of_vector(outer_size, - inner_size, - cache_ptr, - output); + info_element_value_type_init_vector_of_vector( + outer_size, + inner_size, + cache_ptr, + output); break; } else @@ -1321,14 +1300,14 @@ inline void format_input_handler::parse_dynamic_type(var_io::dynamic_type_i } return; } - case var_io::dynamic_type_id::vector_of_string: + case var_io::value_type_id::vector_of_string: { if (desc != detail::bcf_type_descriptor::char8) error( "Attempting to create vector of string but the byte descriptor does not indicate char alphabet"); // TODO this definitely needs a test - constexpr size_t id = static_cast(var_io::dynamic_type_id::vector_of_string); + constexpr size_t id = 
static_cast(var_io::value_type_id::vector_of_string); auto & output_ = output.template emplace(); std::string_view tmp{reinterpret_cast(cache_ptr), outer_size * inner_size}; for (size_t sample = 0; sample < outer_size; ++sample) @@ -1352,9 +1331,9 @@ inline void format_input_handler::parse_dynamic_type(var_io::dynamic_type_i cache_ptr += outer_size * inner_size; return; } - case var_io::dynamic_type_id::flag: + case var_io::value_type_id::flag: { - error("seqan3::var_io::dynamic_vector_type cannot be initialised to flag state."); + error("seqan3::var_io::genotype_element_value_type cannot be initialised to flag state."); return; } } diff --git a/include/bio/format/bcf_output_handler.hpp b/include/bio/format/bcf_output_handler.hpp index ae36195..681ad2d 100644 --- a/include/bio/format/bcf_output_handler.hpp +++ b/include/bio/format/bcf_output_handler.hpp @@ -555,13 +555,7 @@ class format_output_handler : public format_output_handler_base) - record_core.n_fmt = std::ranges::distance(detail::get_first(field)); - else - record_core.n_fmt = detail::range_or_tuple_size(field); - } + void set_core_n_fmt(auto & field) { record_core.n_fmt = detail::range_or_tuple_size(field); } //!\} /*!\name Field writers @@ -651,7 +645,7 @@ class format_output_handler : public format_output_handler_base : public format_output_handler_base : public format_output_handler_baseinfos.at(header->idx_to_info_pos().at(idx)); /* VALUE */ - if constexpr (detail::is_dynamic_type) + if constexpr (detail::is_info_element_value_type) { auto func = [&](auto & param) { write_typed_data(param, get_desc(param, info)); }; std::visit(func, value); @@ -950,15 +944,15 @@ class format_output_handler : public format_output_handler_base) + if constexpr (detail::is_genotype_element_value_type) std::visit(func, value); else func(value); } - //!\brief Overload for GENOTYPES; genotypes_bcf_style. + //!\brief Overload for GENOTYPES. 
template - requires(detail::genotype_bcf_style_writer_concept>) + requires(detail::genotype_writer_concept>) void write_field(vtag_t /**/, range_t && range) { for (auto && genotype : range) @@ -967,13 +961,12 @@ class format_output_handler : public format_output_handler_base - requires(detail::genotype_bcf_style_writer_concept &&...) + requires(detail::genotype_writer_concept &&...) void write_field(vtag_t /**/, std::tuple & tup) // TODO add const version { auto func = [&](auto &... field) { (write_genotypes_element(field), ...); }; std::apply(func, tup); } - // TODO vcf-style //!\} //!\brief Write the header. diff --git a/include/bio/format/vcf_input_handler.hpp b/include/bio/format/vcf_input_handler.hpp index 5b9d23b..1e1e0ca 100644 --- a/include/bio/format/vcf_input_handler.hpp +++ b/include/bio/format/vcf_input_handler.hpp @@ -33,7 +33,7 @@ #include #include #include -#include + #include #include #include @@ -155,18 +155,18 @@ class format_input_handler : public format_input_handler_base + requires detail::is_info_element_value_type || detail::is_genotype_element_value_type + static void init_element_value_type(var_io::value_type_id const id, t & output); - /*!\brief Create an bio::var_io::dynamic_type from a string and a known bio::var_io::dynamic_type_id. - * \param[in] id ID of the type that shall be read. - * \param[in] input_string The string data to read from. - * \param[out] output The object to store the result in. - * \returns The number of elements stored in the output in case ID is one of the "vector_of_"-types; 1 otherwise. 
- */ - static size_t parse_dynamic_type(var_io::dynamic_type_id const id, - std::string_view const input_string, - detail::is_dynamic_type auto & output); // implementation after class + // implementation after class + struct parse_element_value_type_fn; + + // implementation after class + static size_t parse_element_value_type(var_io::value_type_id const id, + std::string_view const input_string, + detail::is_info_element_value_type auto & output); //!\brief Parse the CHROM field. Reading chrom as number means getting the index (not converting string to number). void parse_field(vtag_t const & /**/, auto & parsed_field) @@ -301,17 +301,17 @@ class format_input_handler : public format_input_handler_base) + if constexpr (detail::is_info_element_value_type) { - if (header.infos[info_pos].type != var_io::dynamic_type_id::flag || + if (header.infos[info_pos].type != var_io::value_type_id::flag || header.infos[info_pos].number != 0) { error("INFO field \"", key, "\" is not a flag and should come with a value -- but does not."); @@ -387,9 +387,9 @@ class format_input_handler : public format_input_handler_base) + if constexpr (detail::is_info_element_value_type) { - int32_t num_val = parse_dynamic_type(header.infos[info_pos].type, val, parsed_value); + int32_t num_val = parse_element_value_type(header.infos[info_pos].type, val, parsed_value); if (int32_t exp_val = header.infos[info_pos].number; print_warnings && num_val != exp_val && exp_val >= 0) { @@ -427,7 +427,7 @@ class format_input_handler : public format_input_handler_base : public format_input_handler_base - requires detail::genotype_bcf_style_reader_concept> + requires detail::genotype_reader_concept> void parse_field(vtag_t const & /**/, field_t & parsed_field) { using genotype_field_t = std::ranges::range_reference_t; @@ -481,7 +481,7 @@ class format_input_handler : public format_input_handler_base : public format_input_handler_base : public format_input_handler_base const & /**/, - 
detail::genotypes_vcf_style_reader_concept auto & parsed_field) - { - size_t column_number = file_it->fields.size(); - size_t expected_column_number = header.column_labels.size(); - - if (column_number != expected_column_number) - error("Expected ", expected_column_number, " columns in line but found ", column_number, "."); - - if (column_number <= 8) // there are no genotypes - return; - - auto & [parsed_format, parsed_samples] = parsed_field; - - using string_t = std::remove_cvref_t; - using variant_t = std::remove_cvref_t; - - /* parse formats */ - std::string_view format_names = file_it->fields[8]; - std::vector format_map; // ATTENTION ALWAYS DYNAMICALLY ALLOCATES HERE - for (std::string_view format_name : format_names | detail::eager_split(':')) - { - size_t format_pos = -1; - if (auto it = header.string_to_format_pos().find(format_name); - it == header.string_to_format_pos().end()) // format name was not in header, insert! - { - add_format_to_header(format_name); - format_pos = header.formats.size() - 1; - } - else - { - format_pos = it->second; - } - - parsed_format.push_back(static_cast(format_name)); - format_map.push_back(format_pos); - } - - /* parse samples */ - parsed_samples.resize(column_number - 9); - - size_t sample_num = 0; - for (std::string_view sample : file_it->fields | std::views::drop(9)) - { - auto & parsed_sample = parsed_samples[sample_num++]; - - size_t field_num = 0; - for (std::string_view field : sample | detail::eager_split(':')) - { - variant_t var; - - var_io::dynamic_type_id id = header.formats[format_map[field_num++]].type; - - parse_dynamic_type(id, field, var); - - parsed_sample.push_back(std::move(var)); - } - } - } - //!\brief Overload for parsing the private data. void parse_field(vtag_t const & /**/, var_io::record_private_data & parsed_field) { @@ -643,8 +582,112 @@ class format_input_handler : public format_input_handler_base::parse_dynamic_type. 
-struct format_input_handler::parse_dynamic_type_fn +// ---------------------------------------------------------------------------- +// out-of-line definitions of some members +// ---------------------------------------------------------------------------- + +/*!\brief Initialise an object of dynamic type to a given ID. + * \tparam t Type of the output + * \param[in] id The ID. + * \param[out] output The object being initialised. + */ +template + requires detail::is_info_element_value_type || detail::is_genotype_element_value_type +inline void format_input_handler::init_element_value_type(var_io::value_type_id const id, t & output) +{ + switch (id) + { + case var_io::value_type_id::char8: + { + constexpr size_t id = static_cast(var_io::value_type_id::char8); + output.template emplace(); + return; + } + case var_io::value_type_id::int8: + { + constexpr size_t id = static_cast(var_io::value_type_id::int8); + output.template emplace(); + return; + } + case var_io::value_type_id::int16: + { + constexpr size_t id = static_cast(var_io::value_type_id::int16); + output.template emplace(); + return; + } + case var_io::value_type_id::int32: + { + constexpr size_t id = static_cast(var_io::value_type_id::int32); + output.template emplace(); + return; + } + case var_io::value_type_id::float32: + { + constexpr size_t id = static_cast(var_io::value_type_id::float32); + output.template emplace(); + return; + } + case var_io::value_type_id::string: + { + constexpr size_t id = static_cast(var_io::value_type_id::string); + output.template emplace(); + return; + } + case var_io::value_type_id::vector_of_char8: + { + constexpr size_t id = static_cast(var_io::value_type_id::vector_of_char8); + output.template emplace(); + return; + } + case var_io::value_type_id::vector_of_int8: + { + constexpr size_t id = static_cast(var_io::value_type_id::vector_of_int8); + output.template emplace(); + return; + } + case var_io::value_type_id::vector_of_int16: + { + constexpr size_t id = 
static_cast(var_io::value_type_id::vector_of_int16); + output.template emplace(); + return; + } + case var_io::value_type_id::vector_of_int32: + { + constexpr size_t id = static_cast(var_io::value_type_id::vector_of_int32); + output.template emplace(); + return; + } + case var_io::value_type_id::vector_of_float32: + { + constexpr size_t id = static_cast(var_io::value_type_id::vector_of_float32); + output.template emplace(); + return; + } + case var_io::value_type_id::vector_of_string: + { + constexpr size_t id = static_cast(var_io::value_type_id::vector_of_string); + output.template emplace(); + return; + } + case var_io::value_type_id::flag: + { + if constexpr (detail::is_genotype_element_value_type) + { + throw std::logic_error{ + "bio::var_io::genotype_element_value_type cannot be initialised to flag state."}; + } + else + { + constexpr size_t id = static_cast(var_io::value_type_id::flag); + output.template emplace(); + } + return; + } + } +} + +//!\brief Visitor definition for format_input_handler::parse_element_value_type. +struct format_input_handler::parse_element_value_type_fn { //!\brief The input data. std::string_view input; @@ -708,7 +751,7 @@ struct format_input_handler::parse_dynamic_type_fn for (std::string_view const s : input | detail::eager_split(',')) { vec.emplace_back(); - parse_dynamic_type_fn{s}(vec.back()); + parse_element_value_type_fn{s}(vec.back()); } } @@ -716,13 +759,19 @@ struct format_input_handler::parse_dynamic_type_fn } }; -template -inline size_t format_input_handler::parse_dynamic_type(var_io::dynamic_type_id const id, - std::string_view const input_string, - output_t & output) +/*!\brief Parse text input into a bio::var_io::info_element_value_type / bio::var_io::genotype_element_value_type. + * \param[in] id ID of the type that shall be read. + * \param[in] input_string The string data to read from. + * \param[out] output The object to store the result into. 
+ * \returns The number of elements stored in the output in case ID is one of the "vector_of_"-types; 1 otherwise. + */ +template +inline size_t format_input_handler::parse_element_value_type(var_io::value_type_id const id, + std::string_view const input_string, + output_t & output) { - detail::init_dynamic_type(id, output); - return std::visit(parse_dynamic_type_fn{input_string}, output); + init_element_value_type(id, output); + return std::visit(parse_element_value_type_fn{input_string}, output); } } // namespace bio diff --git a/include/bio/format/vcf_output_handler.hpp b/include/bio/format/vcf_output_handler.hpp index 5192485..efff6e9 100644 --- a/include/bio/format/vcf_output_handler.hpp +++ b/include/bio/format/vcf_output_handler.hpp @@ -99,10 +99,10 @@ class format_output_handler : public format_output_handler_base decltype(auto) { return detail::get_second(pair); }); - //!\brief Get the size of a range or dynamic_vector_type. + //!\brief Get the size of a range or genotype_element_value_type. static size_t dyn_vec_size(auto & in) { - if constexpr (detail::is_dynamic_vector_type>) + if constexpr (detail::is_genotype_element_value_type>) return std::visit([](auto & val) { return std::ranges::size(val); }, in); else return std::ranges::size(in); @@ -180,18 +180,18 @@ class format_output_handler : public format_output_handler_base>) + if constexpr (detail::is_info_element_value_type>) std::visit(visitor, var); else visitor(var); } //!\brief Write variant or a type that is given inplace of a variant; possibly verify. 
- void write_variant(auto const & var, var_io::dynamic_type_id const type_id) + void write_variant(auto const & var, var_io::value_type_id const type_id) { - if constexpr (detail::is_dynamic_type>) + if constexpr (detail::is_info_element_value_type>) { - if (!detail::type_id_is_compatible(type_id, var_io::dynamic_type_id{var.index()})) + if (!detail::type_id_is_compatible(type_id, var_io::value_type_id{var.index()})) throw format_error{"The variant was not in the proper state."}; // TODO improve text } else @@ -246,7 +246,7 @@ class format_output_handler : public format_output_handler_base>) + if constexpr (detail::is_genotype_element_value_type>) std::visit(visitor, var); else visitor(var); @@ -292,9 +292,9 @@ class format_output_handler : public format_output_handler_basestring_to_info_pos().at(key); - var_io::dynamic_type_id type_id = header->infos[pos].type; + var_io::value_type_id type_id = header->infos[pos].type; - if (type_id != var_io::dynamic_type_id::flag) // all fields that aren't flags have second part + if (type_id != var_io::value_type_id::flag) // all fields that aren't flags have second part { it = '='; write_variant(val, type_id); @@ -381,9 +381,9 @@ class format_output_handler : public format_output_handler_base - requires(detail::genotype_bcf_style_writer_concept>) + requires(detail::genotype_writer_concept>) void write_field(vtag_t /**/, range_t & range) { if (header->column_labels.size() <= 8) @@ -416,9 +416,9 @@ class format_output_handler : public format_output_handler_base - requires(detail::genotype_bcf_style_writer_concept> &&...) + requires(detail::genotype_writer_concept> &&...) 
void write_field(vtag_t /**/, std::tuple & tup) { if (header->column_labels.size() <= 8) @@ -454,28 +454,6 @@ class format_output_handler : public format_output_handler_base /**/, detail::genotypes_vcf_style_writer_concept auto & field) - { - if (header->column_labels.size() <= 8) - return; - - auto & [format, samples] = field; - - /* format field */ - write_delimited(format, ':'); - it = '\t'; - - if (header->column_labels.size() <= 9) - return; - - /* samples */ - // functional programming for the win! [this works for tuples and ranges!] - auto write_var = [&](auto const & var) { write_variant(var); }; - auto write_sample = [&](auto const & sample) { write_delimited(sample, ':', write_var); }; - write_delimited(samples, '\t', write_sample); - } //!\} //!\brief Write the header. diff --git a/include/bio/var_io/all.hpp b/include/bio/var_io/all.hpp index d5322f5..b0e0e01 100644 --- a/include/bio/var_io/all.hpp +++ b/include/bio/var_io/all.hpp @@ -18,6 +18,12 @@ /*!\defgroup var_io Var I/O * \ingroup bio * \brief Reader and writer for variant files. + * + * This module provides high-level APIs to read and write VCF and BCF files. + * + * To read files, have a look at bio::var_io::reader and to write files have a look at bio::var_io::writer. 
+ * + * */ /*!\namespace bio::var_io diff --git a/include/bio/var_io/dynamic_type.hpp b/include/bio/var_io/dynamic_type.hpp deleted file mode 100644 index 85f6e7d..0000000 --- a/include/bio/var_io/dynamic_type.hpp +++ /dev/null @@ -1,314 +0,0 @@ -// ----------------------------------------------------------------------------------------------------- -// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin -// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik -// Copyright (c) 2020-2021, deCODE Genetics -// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License -// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md -// ----------------------------------------------------------------------------------------------------- - -/*!\file - * \brief Provides the "dynamic typing" and auxiliaries for variant IO. - * \author Hannes Hauswedell - */ - -#pragma once - -#include -#include -#include -#include - -#include - -#include -#include - -namespace bio::var_io -{ - -//!\brief Enumerator to ease "dynamic typing" in variant IO. -//!\ingroup var_io -enum class dynamic_type_id : size_t -{ - char8, - int8, - int16, - int32, - float32, - string, - vector_of_char8, - vector_of_int8, - vector_of_int16, - vector_of_int32, - vector_of_float32, - vector_of_string, - flag -}; - -/*!\brief Variant to handle "dynamic typing" in variant IO. - * \ingroup var_io - * \details - * - * This variant is used to hold values for the INFO field in VCF and BCF, and for the GENOTYPES - * field in VCF. - * Since the type of such fields is determined at run-time (depends on values in header), variables - * of "dynamic type" can be set to different types at run-time. - */ -template -using dynamic_type = - std::variant, - std::vector, - std::vector, - std::vector, - std::vector, - std::vector, - std::vector>, - bool>; - -/*!\brief Variant to handle "dynamic typing" in variant IO. 
- * \ingroup var_io - * \details - * - * This type is similar to bio::var_io::dynamic_type except that it encodes a range of the respective types. - * It is used only to encode the genotype field in BCF files. - * - * It does not contain an entry for bio::var_io::dynamic_type_id::flag, because flags cannot appear in - * the genotype field. - */ -template -using dynamic_vector_type = - std::variant, - std::vector, - std::vector, - std::vector, - std::vector, - std::vector>, - std::vector>, - std::vector>, - std::vector>, - std::vector>, - std::vector>, - std::vector>> - /* no flag here */>; - -} // namespace bio::var_io - -namespace seqan3 -{ - -//!\brief TODO implement me properly -template -inline debug_stream_type & operator<<(debug_stream_type & s, bio::var_io::dynamic_type_id const & id) -{ - // TODO print nice string - s << (size_t)id; - return s; -} - -} // namespace seqan3 - -namespace bio::detail -{ - -/*!\addtogroup var_io - * \{ - */ - -//!\brief in* and vector_of_int* are each are "compatible" with each other; the rest only with self. -constexpr bool type_id_is_compatible(var_io::dynamic_type_id const lhs, var_io::dynamic_type_id const rhs) -{ - switch (lhs) - { - case var_io::dynamic_type_id::int8: - case var_io::dynamic_type_id::int16: - case var_io::dynamic_type_id::int32: - switch (rhs) - { - case var_io::dynamic_type_id::int8: - case var_io::dynamic_type_id::int16: - case var_io::dynamic_type_id::int32: - return true; - default: - return false; - }; - break; - case var_io::dynamic_type_id::vector_of_int8: - case var_io::dynamic_type_id::vector_of_int16: - case var_io::dynamic_type_id::vector_of_int32: - switch (rhs) - { - case var_io::dynamic_type_id::vector_of_int8: - case var_io::dynamic_type_id::vector_of_int16: - case var_io::dynamic_type_id::vector_of_int32: - return true; - default: - return false; - }; - break; - default: - return lhs == rhs; - } -} - -//!\brief Auxilliary concept that encompasses bio::var_io::dynamic_type. 
-template -concept is_dynamic_type = one_of, var_io::dynamic_type>; - -//!\brief Auxilliary concept that encompasses bio::var_io::dynamic_vector_type. -template -concept is_dynamic_vector_type = - one_of, var_io::dynamic_vector_type>; - -template -concept var_io_legal_type_aux = - std::same_as || std::signed_integral || std::floating_point || std::same_as < std::decay_t, -char const * > ; - -/*!\interface bio::detail::var_io_legal_type <> - * \tparam t The type to check. - * \brief A type that is similar to one of the alternatives of bio::var_io::dynamic_type - */ -//!\cond CONCEPT_DEF -template -concept var_io_legal_type = var_io_legal_type_aux> || std::same_as || - (std::ranges::forward_range && (var_io_legal_type_aux>> || - (std::ranges::forward_range> && - std::same_as const &>))); -//!\endcond - -/*!\interface bio::detail::var_io_legal_vector_type <> - * \tparam t The type to check. - * \brief A type that is similar to one of the alternatives of bio::var_io::dynamic_type - */ -//!\cond CONCEPT_DEF -template -concept var_io_legal_vector_type = - std::ranges::forward_range && var_io_legal_type> && - !std::same_as>; -//!\endcond - -/*!\interface bio::detail::var_io_legal_or_dynamic <> - * \tparam t The type to check. - * \brief A type that is similar to one of the alternatives of bio::var_io::dynamic_type - */ -//!\cond CONCEPT_DEF -template -concept var_io_legal_or_dynamic = var_io_legal_type || is_dynamic_type; -//!\endcond - -/*!\interface bio::detail::var_io_vector_legal_or_dynamic <> - * \tparam t The type to check. - * \brief A type that is similar to one of the alternatives of bio::var_io::dynamic_type - */ -//!\cond CONCEPT_DEF -template -concept var_io_vector_legal_or_dynamic = var_io_legal_vector_type || is_dynamic_vector_type; -//!\endcond - -/*!\brief Initialise an object of dynamic type to a given ID. - * \tparam t Type of the output - * \param[in] id The ID. - * \param[out] output The object being initialised. 
- */ -template - requires is_dynamic_type || is_dynamic_vector_type -inline void init_dynamic_type(var_io::dynamic_type_id const id, t & output) -{ - switch (id) - { - case var_io::dynamic_type_id::char8: - { - constexpr size_t id = static_cast(var_io::dynamic_type_id::char8); - output.template emplace(); - return; - } - case var_io::dynamic_type_id::int8: - { - constexpr size_t id = static_cast(var_io::dynamic_type_id::int8); - output.template emplace(); - return; - } - case var_io::dynamic_type_id::int16: - { - constexpr size_t id = static_cast(var_io::dynamic_type_id::int16); - output.template emplace(); - return; - } - case var_io::dynamic_type_id::int32: - { - constexpr size_t id = static_cast(var_io::dynamic_type_id::int32); - output.template emplace(); - return; - } - case var_io::dynamic_type_id::float32: - { - constexpr size_t id = static_cast(var_io::dynamic_type_id::float32); - output.template emplace(); - return; - } - case var_io::dynamic_type_id::string: - { - constexpr size_t id = static_cast(var_io::dynamic_type_id::string); - output.template emplace(); - return; - } - case var_io::dynamic_type_id::vector_of_char8: - { - constexpr size_t id = static_cast(var_io::dynamic_type_id::vector_of_char8); - output.template emplace(); - return; - } - case var_io::dynamic_type_id::vector_of_int8: - { - constexpr size_t id = static_cast(var_io::dynamic_type_id::vector_of_int8); - output.template emplace(); - return; - } - case var_io::dynamic_type_id::vector_of_int16: - { - constexpr size_t id = static_cast(var_io::dynamic_type_id::vector_of_int16); - output.template emplace(); - return; - } - case var_io::dynamic_type_id::vector_of_int32: - { - constexpr size_t id = static_cast(var_io::dynamic_type_id::vector_of_int32); - output.template emplace(); - return; - } - case var_io::dynamic_type_id::vector_of_float32: - { - constexpr size_t id = static_cast(var_io::dynamic_type_id::vector_of_float32); - output.template emplace(); - return; - } - case 
var_io::dynamic_type_id::vector_of_string: - { - constexpr size_t id = static_cast(var_io::dynamic_type_id::vector_of_string); - output.template emplace(); - return; - } - case var_io::dynamic_type_id::flag: - { - if constexpr (is_dynamic_vector_type) - { - throw std::logic_error{"bio::var_io::dynamic_vector_type cannot be initialised to flag state."}; - } - else - { - constexpr size_t id = static_cast(var_io::dynamic_type_id::flag); - output.template emplace(); - } - return; - } - } -} - -//!\} - -} // namespace bio::detail diff --git a/include/bio/var_io/header.hpp b/include/bio/var_io/header.hpp index 93b8e0a..a563c24 100644 --- a/include/bio/var_io/header.hpp +++ b/include/bio/var_io/header.hpp @@ -23,7 +23,7 @@ #include #include #include -#include +#include namespace bio::var_io { @@ -100,12 +100,12 @@ class header //!\brief Type of a INFO field header line. struct info_t { - std::string id; //!< The ID. - int32_t number{}; //!< Number of values, see also bio::var_io::header_number. - dynamic_type_id type{}; //!< Type of the field. - std::string description{}; //!< Description. - other_fields_t other_fields{}; //!< Other entries. - int32_t idx = -1; //!< The numeric ID. + std::string id; //!< The ID. + int32_t number{}; //!< Number of values, see also bio::var_io::header_number. + value_type_id type{}; //!< Type of the field. + std::string description{}; //!< Description. + other_fields_t other_fields{}; //!< Other entries. + int32_t idx = -1; //!< The numeric ID. //!\brief Defaulted three-way comparisons. auto operator<=>(info_t const &) const = default; @@ -558,29 +558,29 @@ class header return raw_data; } - //!\brief Turn bio::dynamic_type_id into string. - static std::string unparse_type(dynamic_type_id const id) + //!\brief Turn bio::value_type_id into string. 
+ static std::string unparse_type(value_type_id const id) { // TODO replace with string_view switch (id) { - case dynamic_type_id::int8: - case dynamic_type_id::vector_of_int8: - case dynamic_type_id::int16: - case dynamic_type_id::vector_of_int16: - case dynamic_type_id::int32: - case dynamic_type_id::vector_of_int32: + case value_type_id::int8: + case value_type_id::vector_of_int8: + case value_type_id::int16: + case value_type_id::vector_of_int16: + case value_type_id::int32: + case value_type_id::vector_of_int32: return "Integer"; - case dynamic_type_id::float32: - case dynamic_type_id::vector_of_float32: + case value_type_id::float32: + case value_type_id::vector_of_float32: return "Float"; - case dynamic_type_id::char8: - case dynamic_type_id::vector_of_char8: + case value_type_id::char8: + case value_type_id::vector_of_char8: return "Character"; - case dynamic_type_id::string: - case dynamic_type_id::vector_of_string: + case value_type_id::string: + case value_type_id::vector_of_string: return "String"; - case dynamic_type_id::flag: + case value_type_id::flag: return "Flag"; default: throw format_error{"Illegal type in INFO or FILTER header line."}; @@ -675,15 +675,15 @@ class header { switch (new_entry.type) { - case dynamic_type_id::int8: - case dynamic_type_id::int16: - case dynamic_type_id::int32: - new_entry.type = dynamic_type_id::int8; + case value_type_id::int8: + case value_type_id::int16: + case value_type_id::int32: + new_entry.type = value_type_id::int8; break; - case dynamic_type_id::vector_of_int8: - case dynamic_type_id::vector_of_int16: - case dynamic_type_id::vector_of_int32: - new_entry.type = dynamic_type_id::vector_of_int8; + case value_type_id::vector_of_int8: + case value_type_id::vector_of_int16: + case value_type_id::vector_of_int32: + new_entry.type = value_type_id::vector_of_int8; break; default: break; @@ -693,15 +693,15 @@ class header { switch (new_entry.type) { - case dynamic_type_id::int8: - case dynamic_type_id::int16: - case 
dynamic_type_id::int32: - new_entry.type = dynamic_type_id::int16; + case value_type_id::int8: + case value_type_id::int16: + case value_type_id::int32: + new_entry.type = value_type_id::int16; break; - case dynamic_type_id::vector_of_int8: - case dynamic_type_id::vector_of_int16: - case dynamic_type_id::vector_of_int32: - new_entry.type = dynamic_type_id::vector_of_int16; + case value_type_id::vector_of_int8: + case value_type_id::vector_of_int16: + case value_type_id::vector_of_int32: + new_entry.type = value_type_id::vector_of_int16; break; default: break; @@ -863,18 +863,18 @@ class header return header_number::dot; } - /*!\brief Turn a string into bio::var_io::dynamic_type_id. + /*!\brief Turn a string into bio::var_io::value_type_id. * \param[in] in The input string. * \param[in] number The accompanying number value from the header. * \return The dynamic type id. */ - static dynamic_type_id parse_type(std::string_view const in, int32_t const number) + static value_type_id parse_type(std::string_view const in, int32_t const number) { - dynamic_type_id ret{}; + value_type_id ret{}; if (in == "Flag") { - ret = dynamic_type_id::flag; + ret = value_type_id::flag; if (number != 0) throw format_error{std::string{"Flags must always have number 0 in header."}}; return ret; @@ -886,30 +886,30 @@ class header if (in == "Integer") { if (number == 1) - ret = dynamic_type_id::int32; + ret = value_type_id::int32; else - ret = dynamic_type_id::vector_of_int32; + ret = value_type_id::vector_of_int32; } else if (in == "Float") { if (number == 1) - ret = dynamic_type_id::float32; + ret = value_type_id::float32; else - ret = dynamic_type_id::vector_of_float32; + ret = value_type_id::vector_of_float32; } else if (in == "Character") { if (number == 1) - ret = dynamic_type_id::char8; + ret = value_type_id::char8; else - ret = dynamic_type_id::vector_of_char8; + ret = value_type_id::vector_of_char8; } else if (in == "String") { if (number == 1) - ret = dynamic_type_id::string; + ret 
= value_type_id::string; else - ret = dynamic_type_id::vector_of_string; + ret = value_type_id::vector_of_string; } else { @@ -950,27 +950,27 @@ class header //!\brief A table of reserved INFO entries. inline std::unordered_map const reserved_infos = { - {"AA", {"AA", 1, dynamic_type_id::string, "\"Ancestral allele\""}}, - {"AC", {"AC", header_number::A, dynamic_type_id::vector_of_int32, "\"Allele count in genotypes, for each ALT allele, in the same order as listed\""}}, - {"AD", {"AD", header_number::R, dynamic_type_id::vector_of_int32, "\"Total read depth for each allele\""}}, - {"ADF", {"ADF", header_number::R, dynamic_type_id::vector_of_int32, "\"Read depth for each allele on the forward strand\""}}, - {"ADR", {"ADR", header_number::R, dynamic_type_id::vector_of_int32, "\"Read depth for each allele on the reverse strand\""}}, - {"AF", {"AF", header_number::A, dynamic_type_id::vector_of_float32, "\"Allele frequency for each ALT allele in the same order as listed\""}}, - {"AN", {"AN", 1, dynamic_type_id::int32, "\"Total number of alleles in called genotypes\""}}, - {"BQ", {"BQ", 1, dynamic_type_id::float32, "\"RMS base quality\""}}, - {"CIGAR", {"CIGAR", header_number::A, dynamic_type_id::vector_of_string, "\"Cigar string describing how to align an alternate allele to the reference allele\""}}, - {"DB", {"DB", 0, dynamic_type_id::flag, "\"dbSNP membership\""}}, - {"DP", {"DP", 1, dynamic_type_id::int32, "\"Combined depth across samples\""}}, - {"END", {"END", 1, dynamic_type_id::int32, "\"End position on CHROM (used with symbolic alleles; see below)\""}}, - {"H2", {"H2", 0, dynamic_type_id::flag, "\"HapMap2 membership\""}}, - {"H3", {"H3", 0, dynamic_type_id::flag, "\"HapMap3 membership\""}}, - {"MQ", {"MQ", 1, dynamic_type_id::float32, "\"RMS mapping quality\""}}, - {"MQ0", {"MQ0", 1, dynamic_type_id::int32, "\"Number of MAPQ == 0 reads\""}}, - {"NS", {"NS", 1, dynamic_type_id::int32, "\"Number of samples with data\""}}, - {"SB", {"SB", 4, 
dynamic_type_id::vector_of_int32, "\"Strand bias\""}}, - {"SOMATIC", {"SOMATIC", 0, dynamic_type_id::flag, "\"Somatic mutation (for cancer genomics)\""}}, - {"VALIDATED",{"VALIDATED", 0, dynamic_type_id::flag, "\"Validated by follow-up experiment\""}}, - {"1000G", {"1000G", 0, dynamic_type_id::flag, "\"1000 Genomes membership\""}} + {"AA", {"AA", 1, value_type_id::string, "\"Ancestral allele\""}}, + {"AC", {"AC", header_number::A, value_type_id::vector_of_int32, "\"Allele count in genotypes, for each ALT allele, in the same order as listed\""}}, + {"AD", {"AD", header_number::R, value_type_id::vector_of_int32, "\"Total read depth for each allele\""}}, + {"ADF", {"ADF", header_number::R, value_type_id::vector_of_int32, "\"Read depth for each allele on the forward strand\""}}, + {"ADR", {"ADR", header_number::R, value_type_id::vector_of_int32, "\"Read depth for each allele on the reverse strand\""}}, + {"AF", {"AF", header_number::A, value_type_id::vector_of_float32, "\"Allele frequency for each ALT allele in the same order as listed\""}}, + {"AN", {"AN", 1, value_type_id::int32, "\"Total number of alleles in called genotypes\""}}, + {"BQ", {"BQ", 1, value_type_id::float32, "\"RMS base quality\""}}, + {"CIGAR", {"CIGAR", header_number::A, value_type_id::vector_of_string, "\"Cigar string describing how to align an alternate allele to the reference allele\""}}, + {"DB", {"DB", 0, value_type_id::flag, "\"dbSNP membership\""}}, + {"DP", {"DP", 1, value_type_id::int32, "\"Combined depth across samples\""}}, + {"END", {"END", 1, value_type_id::int32, "\"End position on CHROM (used with symbolic alleles; see below)\""}}, + {"H2", {"H2", 0, value_type_id::flag, "\"HapMap2 membership\""}}, + {"H3", {"H3", 0, value_type_id::flag, "\"HapMap3 membership\""}}, + {"MQ", {"MQ", 1, value_type_id::float32, "\"RMS mapping quality\""}}, + {"MQ0", {"MQ0", 1, value_type_id::int32, "\"Number of MAPQ == 0 reads\""}}, + {"NS", {"NS", 1, value_type_id::int32, "\"Number of samples with 
data\""}}, + {"SB", {"SB", 4, value_type_id::vector_of_int32, "\"Strand bias\""}}, + {"SOMATIC", {"SOMATIC", 0, value_type_id::flag, "\"Somatic mutation (for cancer genomics)\""}}, + {"VALIDATED",{"VALIDATED", 0, value_type_id::flag, "\"Validated by follow-up experiment\""}}, + {"1000G", {"1000G", 0, value_type_id::flag, "\"1000 Genomes membership\""}} }; // clang-format on @@ -978,22 +978,22 @@ inline std::unordered_map const reserved_infos //!\brief A table of reserved FORMAT entries. inline std::unordered_map const reserved_formats = { - {"AD", {"AD", header_number::R, dynamic_type_id::vector_of_int32, "\"Read depth for each allele\""}}, - {"ADF", {"ADF", header_number::R, dynamic_type_id::vector_of_int32, "\"Read depth for each allele on the forward strand\""}}, - {"ADR", {"ADR", header_number::R, dynamic_type_id::vector_of_int32, "\"Read depth for each allele on the reverse strand\""}}, - {"DP", {"DP", 1, dynamic_type_id::int32, "\"Read depth\""}}, - {"EC", {"EC", header_number::A, dynamic_type_id::vector_of_int32, "\"Expected alternate allele counts\""}}, - {"FT", {"FT", 1, dynamic_type_id::string, "\"Filter indicating if this genotype was “called”\""}}, - {"GL", {"GL", header_number::G, dynamic_type_id::vector_of_float32, "\"Genotype likelihoods\""}}, - {"GP", {"GP", header_number::G, dynamic_type_id::vector_of_float32, "\"Genotype posterior probabilities\""}}, - {"GQ", {"GQ", 1, dynamic_type_id::int32, "\"Conditional genotype quality\""}}, - {"GT", {"GT", 1, dynamic_type_id::string, "\"Genotype\""}}, - {"HQ", {"HQ", 2, dynamic_type_id::vector_of_int32, "\"Haplotype quality\""}}, - {"MQ", {"MQ", 1, dynamic_type_id::int32, "\"RMS mapping quality\""}}, - {"PL", {"PL", header_number::G, dynamic_type_id::vector_of_int32, "\"Phred-scaled genotype likelihoods rounded to the closest integer\""}}, - {"PP", {"PP", header_number::G, dynamic_type_id::vector_of_int32, "\"Phred-scaled genotype posterior probabilities rounded to the closest integer\""}}, - {"PQ", {"PQ", 
1, dynamic_type_id::int32, "\"Phasing quality\""}}, - {"PS", {"PS", 1, dynamic_type_id::int32, "\"Phase set\""}} + {"AD", {"AD", header_number::R, value_type_id::vector_of_int32, "\"Read depth for each allele\""}}, + {"ADF", {"ADF", header_number::R, value_type_id::vector_of_int32, "\"Read depth for each allele on the forward strand\""}}, + {"ADR", {"ADR", header_number::R, value_type_id::vector_of_int32, "\"Read depth for each allele on the reverse strand\""}}, + {"DP", {"DP", 1, value_type_id::int32, "\"Read depth\""}}, + {"EC", {"EC", header_number::A, value_type_id::vector_of_int32, "\"Expected alternate allele counts\""}}, + {"FT", {"FT", 1, value_type_id::string, "\"Filter indicating if this genotype was “called”\""}}, + {"GL", {"GL", header_number::G, value_type_id::vector_of_float32, "\"Genotype likelihoods\""}}, + {"GP", {"GP", header_number::G, value_type_id::vector_of_float32, "\"Genotype posterior probabilities\""}}, + {"GQ", {"GQ", 1, value_type_id::int32, "\"Conditional genotype quality\""}}, + {"GT", {"GT", 1, value_type_id::string, "\"Genotype\""}}, + {"HQ", {"HQ", 2, value_type_id::vector_of_int32, "\"Haplotype quality\""}}, + {"MQ", {"MQ", 1, value_type_id::int32, "\"RMS mapping quality\""}}, + {"PL", {"PL", header_number::G, value_type_id::vector_of_int32, "\"Phred-scaled genotype likelihoods rounded to the closest integer\""}}, + {"PP", {"PP", header_number::G, value_type_id::vector_of_int32, "\"Phred-scaled genotype posterior probabilities rounded to the closest integer\""}}, + {"PQ", {"PQ", 1, value_type_id::int32, "\"Phasing quality\""}}, + {"PS", {"PS", 1, value_type_id::int32, "\"Phase set\""}} }; // clang-format on diff --git a/include/bio/var_io/misc.hpp b/include/bio/var_io/misc.hpp index 5fd7e1d..db7fdba 100644 --- a/include/bio/var_io/misc.hpp +++ b/include/bio/var_io/misc.hpp @@ -26,12 +26,20 @@ #include #include #include -#include -#include namespace bio::var_io { 
+//----------------------------------------------------------------------------- +// forwards +//----------------------------------------------------------------------------- + +class header; + +//----------------------------------------------------------------------------- +// missing_value +//----------------------------------------------------------------------------- + /*!\addtogroup var_io * \{ */ @@ -70,6 +78,10 @@ inline float missing_value = []() //!\} } // namespace bio::var_io +//----------------------------------------------------------------------------- +// end_of_vector +//----------------------------------------------------------------------------- + namespace bio::detail { /*!\addtogroup var_io @@ -114,9 +126,132 @@ namespace bio::var_io { //----------------------------------------------------------------------------- -// Helper classes for field types +// value_type_id //----------------------------------------------------------------------------- +//!\brief Enumerator to ease "dynamic typing" in variant IO. +//!\ingroup var_io +enum class value_type_id : size_t +{ + char8, + int8, + int16, + int32, + float32, + string, + vector_of_char8, + vector_of_int8, + vector_of_int16, + vector_of_int32, + vector_of_float32, + vector_of_string, + flag +}; + +} // namespace bio::var_io + +namespace seqan3 +{ + +//!\brief TODO implement me properly +template +inline debug_stream_type & operator<<(debug_stream_type & s, bio::var_io::value_type_id const & id) +{ + // TODO print nice string + s << (size_t)id; + return s; +} + +} // namespace seqan3 + +namespace bio::detail +{ + +//!\brief int* and vector_of_int* are each are "compatible" with each other; the rest only with self. 
+//!\ingroup var_io +constexpr bool type_id_is_compatible(var_io::value_type_id const lhs, var_io::value_type_id const rhs) +{ + switch (lhs) + { + case var_io::value_type_id::int8: + case var_io::value_type_id::int16: + case var_io::value_type_id::int32: + switch (rhs) + { + case var_io::value_type_id::int8: + case var_io::value_type_id::int16: + case var_io::value_type_id::int32: + return true; + default: + return false; + }; + break; + case var_io::value_type_id::vector_of_int8: + case var_io::value_type_id::vector_of_int16: + case var_io::value_type_id::vector_of_int32: + switch (rhs) + { + case var_io::value_type_id::vector_of_int8: + case var_io::value_type_id::vector_of_int16: + case var_io::value_type_id::vector_of_int32: + return true; + default: + return false; + }; + break; + default: + return lhs == rhs; + } +} + +} // namespace bio::detail + +namespace bio::var_io +{ + +//----------------------------------------------------------------------------- +// The info element +//----------------------------------------------------------------------------- + +/*!\brief Variant to handle "dynamic typing" in Var I/O INFO fields. + * \ingroup var_io + * \details + * + * This variant can hold values for the INFO field. + * Since the type of such fields may only be determined at run-time (depends on values in header), variables + * of this type can be set to different types at run-time. + */ +template +using info_element_value_type = + std::variant, + std::vector, + std::vector, + std::vector, + std::vector, + std::vector, + std::vector>, + bool>; + +} // namespace bio::var_io + +namespace bio::detail +{ +//!\brief Auxilliary concept that encompasses bio::var_io::info_element_value_type. +//!\ingroup var_io +template +concept is_info_element_value_type = + one_of, var_io::info_element_value_type>; + +} // namespace bio::detail + +namespace bio::var_io +{ + /*!\brief The type of elements in an INFO field. 
[default] * \ingroup var_io * \tparam own Ownership of the type; see bio::ownership. @@ -128,9 +263,9 @@ struct info_element using string_t = std::conditional_t; //!\brief The ID of the element (as a string or string_view). - string_t id; + string_t id; //!\brief The value of the element. - dynamic_type value; + info_element_value_type value; //!\brief Defaulted three-way comparisons. auto operator<=>(info_element const &) const = default; @@ -144,14 +279,60 @@ template struct info_element_bcf { //!\brief The IDX of the element (index of that descriptor in the header). - int32_t idx; + int32_t idx; //!\brief The value of the element. - dynamic_type value; + info_element_value_type value; //!\brief Defaulted three-way comparisons. auto operator<=>(info_element_bcf const &) const = default; }; +//----------------------------------------------------------------------------- +// The genotype element +//----------------------------------------------------------------------------- + +/*!\brief Variant to handle "dynamic typing" in Var I/O GENOTYPE fields. + * \ingroup var_io + * \details + * + * This type is similar to bio::var_io::info_element_value_type except that it encodes a range of the respective value. + * + * It does not contain an entry for bio::var_io::value_type_id::flag, because flags cannot appear in + * the genotype field. + */ +template +using genotype_element_value_type = + std::variant, + std::vector, + std::vector, + std::vector, + std::vector, + std::vector>, + std::vector>, + std::vector>, + std::vector>, + std::vector>, + std::vector>, + std::vector>> + /* no flag here */>; + +} // namespace bio::var_io + +namespace bio::detail +{ + +//!\brief Auxilliary concept that encompasses bio::var_io::genotype_element_value_type. 
+//!\ingroup var_io +template +concept is_genotype_element_value_type = one_of, + var_io::genotype_element_value_type>; + +} // namespace bio::detail + +namespace bio::var_io +{ + /*!\brief A type representing an element in the GENOTYPES field. * \ingroup var_io * @@ -167,8 +348,8 @@ struct info_element_bcf * * 0 -- if the field is missing from all samples. * * The variant vector is guaranteed to be over the type defined in the header. Note that this is a vector over such - * types (one element per sample!), so bio::var_io::dynamic_type_id::vector_of_int32 corresponds to - * std::vector>. See bio::var_io::dynamic_vector_type for more details. + * types (one element per sample!), so bio::var_io::value_type_id::vector_of_int32 corresponds to + * std::vector>. See bio::var_io::genotype_element_value_type for more details. * * If fields are missing from some samples but not others, the vector will have full size but the respective values * will be set to the missing value (see bio::var_io::missing_value) or be the empty vector (in case the element type @@ -181,9 +362,9 @@ struct genotype_element using string_t = std::conditional_t; //!\brief The ID of the element (as a string or string_view). - string_t id; + string_t id; //!\brief The value of the element. - dynamic_vector_type value; + genotype_element_value_type value; //!\brief Defaulted three-way comparisons. auto operator<=>(genotype_element const &) const = default; @@ -200,45 +381,14 @@ template struct genotype_element_bcf { //!\brief The IDX of the element (index of that descriptor in the header). - int32_t idx; + int32_t idx; //!\brief The value of the element. - dynamic_vector_type value; + genotype_element_value_type value; //!\brief Defaulted three-way comparisons. auto operator<=>(genotype_element_bcf const &) const = default; }; -/*!\brief A type representing the FORMATS column and all sample columns in VCF-style. 
- * \ingroup var_io - * - * \details - * - * This type can be used as the field-type for the GENOTYPES field as an alternative to a range of - * bio::var_io::genotype_element. - * - * It uses the data layout as it appears in a VCF file with the FORMAT strings in one member and a vector of "samples". - * Each element of that vector represents a single sample column and is implemented as a vector of values of - * dynamic type (see bio::var_io::dynamic_type). - * - * **This data layout is not recommended, because it is almost always slower.** - * Use it only, if you know that the user will never read or write BCF and if you do very little processing of the - * sample values. - */ -template -struct genotypes_vcf -{ - //!\brief Type of the format strings. - using string_t = std::conditional_t; - - //!\brief The FORMAT strings. - std::vector format_strings; - //!\brief The sample columns. - std::vector>> samples; - - //!\brief Defaulted three-way comparisons. - auto operator<=>(genotypes_vcf const &) const = default; -}; - //!\brief A datastructure that contains private data of variant IO records. //!\ingroup var_io struct record_private_data @@ -286,8 +436,8 @@ inline constinit auto default_field_ids = vtag = std::vector>, // field::genotypes, record_private_data>; // field::_private -/*!\brief Alternative set of field types (VCF-style, shallow). - *!\ingroup var_io - * - * \details - * - * See bio::var_io::reader_options for when and why to choose these field types. - */ -template -inline constinit auto field_types_vcf_style = - ttag), // field::ref, - std::vector, // field::alt, - float, // field::qual, - std::vector, // field::filter, - std::vector>, // field::info, - genotypes_vcf, // field::genotypes, - record_private_data>; // field::_private>; - -/*!\brief Alternative set of field types (BCF-style, deep). - *!\ingroup var_io - * - * \details - * - * See bio::var_io::reader_options for when and why to choose these field types. 
- */ -template <> -inline constinit auto field_types_vcf_style = - ttag, // field::ref - std::vector, // field::alt - float, // field::qual - std::vector, // field::filter - std::vector>, // field::info, - genotypes_vcf, // field::genotypes - record_private_data>; // field::_private - //!\brief Every field is configured as a std::span of std::byte (this enables "raw" io). //!\ingroup var_io inline constinit auto field_types_raw = @@ -541,28 +651,28 @@ bool type_descriptor_is_int(bcf_type_descriptor const type_desc) } } -//!\brief Convert from bio::var_io::dynamic_type_id to bio::detail::bcf_type_descriptor. -bcf_type_descriptor dynamic_type_id_2_type_descriptor(var_io::dynamic_type_id const type_id) +//!\brief Convert from bio::var_io::value_type_id to bio::detail::bcf_type_descriptor. +bcf_type_descriptor value_type_id_2_type_descriptor(var_io::value_type_id const type_id) { switch (type_id) { - case var_io::dynamic_type_id::char8: - case var_io::dynamic_type_id::vector_of_char8: - case var_io::dynamic_type_id::string: - case var_io::dynamic_type_id::vector_of_string: + case var_io::value_type_id::char8: + case var_io::value_type_id::vector_of_char8: + case var_io::value_type_id::string: + case var_io::value_type_id::vector_of_string: return bcf_type_descriptor::char8; - case var_io::dynamic_type_id::int8: - case var_io::dynamic_type_id::vector_of_int8: - case var_io::dynamic_type_id::flag: + case var_io::value_type_id::int8: + case var_io::value_type_id::vector_of_int8: + case var_io::value_type_id::flag: return bcf_type_descriptor::int8; - case var_io::dynamic_type_id::int16: - case var_io::dynamic_type_id::vector_of_int16: + case var_io::value_type_id::int16: + case var_io::value_type_id::vector_of_int16: return bcf_type_descriptor::int16; - case var_io::dynamic_type_id::int32: - case var_io::dynamic_type_id::vector_of_int32: + case var_io::value_type_id::int32: + case var_io::value_type_id::vector_of_int32: return bcf_type_descriptor::int32; - case 
var_io::dynamic_type_id::float32: - case var_io::dynamic_type_id::vector_of_float32: + case var_io::value_type_id::float32: + case var_io::value_type_id::vector_of_float32: return bcf_type_descriptor::float32; } return bcf_type_descriptor::missing; diff --git a/include/bio/var_io/reader.hpp b/include/bio/var_io/reader.hpp index 8057166..d7d6abc 100644 --- a/include/bio/var_io/reader.hpp +++ b/include/bio/var_io/reader.hpp @@ -55,7 +55,7 @@ namespace bio::var_io * are returned by default also correspond to VCF specification (i.e. 1-based positions, string as strings and not * as numbers) **with one exception:** the genotypes are not grouped by sample (as in the VCF format) but by * genotype field (as in the BCF format). - * This results in a notably better performance when reading BCF files. See below for information on how to change + * This results in a notably better performance when reading BCF files. See below for information on how to change * this. * * This reader supports the following formats: diff --git a/include/bio/var_io/reader_options.hpp b/include/bio/var_io/reader_options.hpp index c82d93f..9f4c547 100644 --- a/include/bio/var_io/reader_options.hpp +++ b/include/bio/var_io/reader_options.hpp @@ -24,7 +24,7 @@ #include #include #include -#include + #include #include @@ -43,32 +43,18 @@ namespace bio::detail template concept info_element_reader_concept = detail::decomposable_into_two && (detail::out_string> || - std::same_as>)&&detail::is_dynamic_type>; + std::same_as>)&&detail::is_info_element_value_type>; //!\endcond -/*!\interface bio::detail::genotype_bcf_style_reader_concept <> +/*!\interface bio::detail::genotype_reader_concept <> * \tparam t The type to check. * \brief Types "similar" to bio::var_io::genotype_element / bio::var_io::genotype_element_bcf. 
*/ //!\cond CONCEPT_DEF template -concept genotype_bcf_style_reader_concept = detail::decomposable_into_two && +concept genotype_reader_concept = detail::decomposable_into_two && (detail::out_string> || - std::same_as>)&&detail::is_dynamic_vector_type>; -//!\endcond - -/*!\interface bio::detail::genotypes_vcf_style_reader_concept <> - * \tparam t The type to check. - * \brief Types "similar" to bio::var_io::genotypes_vcf_style - */ -//!\cond CONCEPT_DEF -template -concept genotypes_vcf_style_reader_concept = - detail::decomposable_into_two && detail::back_insertable> && - detail::out_string>> && - detail::vector_like> && - detail::vector_like>> && - detail::is_dynamic_type>>>; + std::same_as>)&&detail::is_genotype_element_value_type>; //!\endcond } // namespace bio::detail @@ -91,76 +77,63 @@ namespace bio::var_io * If you are new to the way options are set in this library, have a look bio::seq_io::reader * and bio::seq_io::reader_options first, as those are much simpler. * - * ### Field types (beginner's guide) + * ## Field types * * The internal representation of VCF and BCF are different. To be able to freely * interchange between these formats, this library needs to choose one representation that * everything is converted to when being read. * - * The default representation is good for reading any kind format with only very little overhead. - * It is very close to the VCF format, except that the Genotypes are grouped "by-genotype" and - * not "by-sample". - * The records are shallow which means that even a copy of the current record becomes invalid as - * soon as the next record is read. This is good for performance :) + * Changing the field_types member configures the reader to return data in different types. + * One thing that is fixed for all configurations in this library is the layout of the GENOTYPES field + * which is always grouped "by-field" (BCF-style) and not "by-sample" (VCF-style). 
+ * Another important choice is that **numbers are always 1-based,** because this is the default in VCF + * and all other tools that deal with VCF/BCF. * - * ### Field types (advanced guide) + * Beyond that, a wide variety of types are supported per field (see below), but most users will be happy + * with one of the predefined sets. * - * Changing the field_types member configures the reader to return data in different types/formats. - * A wide variety of types are supported per field (see below), but most users will be happy with - * one of the predefined sets. - * Some of these represent certain formats more closely, but any format can be read in any - * representation. + * ### Pre-defined tags * - * Three "styles" of field types are predefined: + * Two "styles" of field types are predefined: * - * 1. bio::var_io::field_types (the default; VCF-style with BCF-style genotypes) + * 1. bio::var_io::field_types (the default) * * All "strings" are represented as strings. - * * Genotypes are encoded by-genotype (BCF-style) but with text-id (see bio::var_io::genotype_element). - * 2. bio::var_io::field_types_vcf_style (VCF-style) - * * All "strings" are represented as strings. - * * Genotypes are encoded by-sample (see bio::var_io::genotypes_vcf_style). - * * This might be slightly faster when you only read and write VCF, but it has a significant overhead when reading - * or writing BCF! - * 3. bio::var_io::field_types_bcf_style (BCF-style) + * 2. bio::var_io::field_types_bcf_style (BCF-style) * * Most "strings" are represented by their in-header IDX value (see the BCF spec for more details). - * * Genotypes are encoded by-genotype (see bio::var_io::genotype_element_bcf). - * * When reading or writing VCF, this is slower than the default. - * * When reading or writing BCF, the deep version of this style is faster that the deep version of the default style, - * but for the shallow versions there is almost no difference. 
* * When reading and writing, you need to make sure that the IDX values in the output header are the same as in the * input header, otherwise your record fields might change meaning or even become invalid. * - * **Numbers are always 1-based,** because this is the default in VCF and all other tools that - * deal with VCF/BCF. - * - * This example shows how to switch to vcf-style and deactivate reading of BCF (not required - * but recommended): - * - * \snippet test/snippet/var_io/var_io_reader_options.cpp field_types_vcf_only - * - * All of the above styles are "shallow" by default, but can be configured to be "deep": + * Both styles are "shallow" by default, but can be configured to be "deep": * * 1. shallow (bio::ownership::shallow) * * The record contains light-weight data structures like views. - * * Fewer allocations, lower overhead during reading. - * * Record cannot be "stored"; it depends on internal caches and buffers, and it becomes invalid when the file + * * Record cannot be "stored"; it depends on internal caches and buffers, and it becomes invalid * as soon as the next record is read from the file. * 2. deep (bio::ownership::deep) * * The record is self-contained; sequences and strings are stored in containers. - * * This implies at least one copy-operation and likely also allocations during reading. * * Record can be copied or stored and can "live on" independently of the reader. * * This example shows how to use deep records: * * \snippet test/snippet/var_io/var_io_reader_options.cpp field_types_deep * - * ### Field types (expert guide) + * Performance implications: + * * Shallow records imply fewer allocations and lower overhead during reading. + * * If you know that you need to copy your fields anyway, using a deep record can be faster than using a shallow + * record and copying the data "manually" out of that (because certain internal caches are re-used to create deep + * records). 
+ * * field_types_bcf_style is faster than field_types, but for the shallow variants + * there is almost no difference. + * + * TODO some of this should be moved to a general documentation page on configuring records; shallow vs deep; etc + * + * ### Manual configuration * * This section is only relevant if you specify the #field_types member manually via * a bio::ttag, i.e. if you change the field_types but do not use one of the predefined tags * (see above). * - * The following types are valid for the respective fields: + * The following types are valid for the respective fields and you can mix-and-match shallow/deep and integral/text IDs: * * 1. bio::field::chrom * * string or string_view: The chromosome string is returned. @@ -184,27 +157,23 @@ namespace bio::var_io * * back-insertable range of string or string_view: The filters as strings. * * back-insertable range of `int32_t`: The IDX values of the filters. * 8. bio::field::info - * * back-insertable range of elements similar to bio::var_io::info_element - * * *similar* means any type decomposable into two elements (`struct` or tuple) where the - * first is either a string[_view] or `int32_t` (IDX) and the second is bio::var_io::dynamic_type. + * * back-insertable range of elements "similar" to bio::var_io::info_element: + * * The elements must be decomposable into two subelements (`struct` or tuple). + * * The first subelement must be either a string[_view] (ID) or `int32_t` (IDX). + * * The second subelement must be bio::var_io::info_element_value_type. * 9. field::genotypes - * 1. A range (that supports back-insertion) over elements that are "similar" to - * bio::var_io::genotype_element: + * * back-insertable range of elements "similar" to bio::var_io::genotype_element: * * The elements must be decomposable into exactly two sub-elements (either `struct` or tuple). - * * The first subelement must be a string[_view] (ID) or `int32_t` (IDX). - * * The second subelement must bio::var_io::dynamic_vector_type. 
- * 2. Or: A type similar to bio::var_io::genotypes_vcf : - * * It must be decomposable into exactly two sub-elements (either `struct` or tuple). - * * The first subelement must be a range over string[_views] that supports back-insertion (FORMAT strings). - * * The second subelement must range-of-range over bio::var_io::dynamic_type and both - * range-dimensions need to support back-insertion (SAMPLE columns with genotype entries). + * * The first subelement must be either a string[_view] (ID) or `int32_t` (IDX). + * * The second subelement must be bio::var_io::genotype_element_value_type. * * This example shows how to read only a subset of the available fields and manually specify their type: * * \snippet test/snippet/var_io/var_io_reader_options.cpp field_types_expert * * Reading fewer fields than available may provide a noticeable speed-up since only the - * requested fields are actually parsed. + * requested fields are actually parsed. Any field may also be set to `std::span` which + * results in no parsing happening for that field. * */ template (auto) requires( !field_ids_t::contains(field::genotypes) || (detail::back_insertable> && - detail::genotype_bcf_style_reader_concept< - std::remove_reference_t>>>) || - detail::genotypes_vcf_style_reader_concept>) { + detail::genotype_reader_concept< + std::remove_reference_t>>>)) { return std::true_type{}; }), "Requirements for the field-type of the GENOTYPES-field not met. 
See documentation for " diff --git a/include/bio/var_io/writer_options.hpp b/include/bio/var_io/writer_options.hpp index 7a93c71..bda455e 100644 --- a/include/bio/var_io/writer_options.hpp +++ b/include/bio/var_io/writer_options.hpp @@ -21,53 +21,72 @@ namespace bio::detail { -/*!\interface bio::detail::info_element_writer_concept <> +template +concept var_io_legal_type_aux = + std::same_as || std::signed_integral || std::floating_point || std::same_as < std::decay_t, +char const * > ; + +/*!\interface bio::detail::var_io_legal_type <> * \tparam t The type to check. - * \brief Types "similar" to bio::var_io::info_element / bio::var_io::info_element_bcf. + * \brief A type that is similar to one of the alternatives of bio::var_io::info_element_value_type */ //!\cond CONCEPT_DEF template -concept info_element_writer_concept = detail::decomposable_into_two && - (detail::char_range_or_cstring> || - std::same_as>)&&detail::var_io_legal_or_dynamic>; +concept var_io_legal_type = var_io_legal_type_aux> || std::same_as || + (std::ranges::forward_range && (var_io_legal_type_aux>> || + (std::ranges::forward_range> && + std::same_as const &>))); //!\endcond -/*!\interface bio::detail::genotype_bcf_style_writer_concept <> +/*!\interface bio::detail::var_io_legal_vector_type <> * \tparam t The type to check. - * \brief Types "similar" to bio::var_io::genotype_element / bio::var_io::genotype_element_bcf. + * \brief A type that is similar to one of the alternatives of bio::var_io::info_element_value_type */ //!\cond CONCEPT_DEF template -concept genotype_bcf_style_writer_concept = detail::decomposable_into_two && - (detail::char_range_or_cstring> || - std::same_as>)&&detail::var_io_vector_legal_or_dynamic>; +concept var_io_legal_vector_type = + std::ranges::forward_range && var_io_legal_type> && + !std::same_as>; +//!\endcond + +/*!\interface bio::detail::var_io_legal_or_dynamic <> + * \tparam t The type to check. 
+ * \brief A type that is similar to one of the alternatives of bio::var_io::info_element_value_type + */ +//!\cond CONCEPT_DEF +template +concept var_io_legal_or_dynamic = var_io_legal_type || is_info_element_value_type; //!\endcond +/*!\interface bio::detail::var_io_vector_legal_or_dynamic <> + * \tparam t The type to check. + * \brief A type that is similar to one of the alternatives of bio::var_io::info_element_value_type + */ +//!\cond CONCEPT_DEF template -concept genotypes_vcf_style_format_writer_concept = - std::ranges::forward_range && detail::char_range_or_cstring>; +concept var_io_vector_legal_or_dynamic = var_io_legal_vector_type || is_genotype_element_value_type; +//!\endcond +/*!\interface bio::detail::info_element_writer_concept <> + * \tparam t The type to check. + * \brief Types "similar" to bio::var_io::info_element / bio::var_io::info_element_bcf. + */ +//!\cond CONCEPT_DEF template -concept genotypes_vcf_style_onesample_writer_concept = - std::ranges::forward_range && detail::var_io_legal_or_dynamic>; +concept info_element_writer_concept = detail::decomposable_into_two && + (detail::char_range_or_cstring> || + std::same_as>)&&detail::var_io_legal_or_dynamic>; +//!\endcond -/*!\interface bio::detail::genotypes_vcf_style_writer_concept <> +/*!\interface bio::detail::genotype_writer_concept <> * \tparam t The type to check. - * \brief Types "similar" to bio::var_io::genotypes_vcf_style + * \brief Types "similar" to bio::var_io::genotype_element / bio::var_io::genotype_element_bcf. */ //!\cond CONCEPT_DEF template -concept genotypes_vcf_style_writer_concept = detail::decomposable_into_two && - genotypes_vcf_style_format_writer_concept> && - ((std::ranges::forward_range> && - genotypes_vcf_style_onesample_writer_concept>>) || - requires // a tuple whose elements satisfy genotypes_vcf_style_onesample_writer_concept - { - requires decltype(std::apply( - [](elem_t...) 
- -> std::bool_constant && ...)*/> { return {}; }, - std::declval>()))::value; - }); +concept genotype_writer_concept = detail::decomposable_into_two && + (detail::char_range_or_cstring> || + std::same_as>)&&detail::var_io_vector_legal_or_dynamic>; //!\endcond } // namespace bio::detail diff --git a/test/format/CMakeLists.txt b/test/format/CMakeLists.txt index ef6df34..d7c883e 100644 --- a/test/format/CMakeLists.txt +++ b/test/format/CMakeLists.txt @@ -8,19 +8,19 @@ if (CLANG_FORMAT STREQUAL "CLANG_FORMAT-NOTFOUND") endif() if (NOT CLANG_FORMAT STREQUAL "CLANG_FORMAT-NOTFOUND") - add_custom_target (check_library "find" "${CMAKE_CURRENT_SOURCE_DIR}/../../include/bio/" "-name" "'*.hpp'" "-exec" + add_custom_target (check_library "find" "${CMAKE_CURRENT_SOURCE_DIR}/../../include/bio/" "-name" "'*.[ch]pp'" "-exec" ${CLANG_FORMAT} "-style=file" "-n" "-Werror" "{}" "+" COMMENT "Checking the library with clang-format.") - add_custom_target (check_unit_tests "find" "${CMAKE_CURRENT_SOURCE_DIR}/../unit/" "-name" "'*.cpp'" "-exec" + add_custom_target (check_unit_tests "find" "${CMAKE_CURRENT_SOURCE_DIR}/../unit/" "-name" "'*.[ch]pp'" "-exec" ${CLANG_FORMAT} "-style=file" "-n" "-Werror" "{}" "+" COMMENT "Checking the unit tests with clang-format.") - add_custom_target (format_library "find" "${CMAKE_CURRENT_SOURCE_DIR}/../../include/bio/" "-name" "'*.hpp'" "-exec" + add_custom_target (format_library "find" "${CMAKE_CURRENT_SOURCE_DIR}/../../include/bio/" "-name" "'*.[ch]pp'" "-exec" ${CLANG_FORMAT} "-style=file" "-i" "{}" "+" COMMENT "Format the library with clang-format.") - add_custom_target (format_unit_tests "find" "${CMAKE_CURRENT_SOURCE_DIR}/../unit/" "-name" "'*.cpp'" "-exec" + add_custom_target (format_unit_tests "find" "${CMAKE_CURRENT_SOURCE_DIR}/../unit/" "-name" "'*.[ch]pp'" "-exec" ${CLANG_FORMAT} "-style=file" "-i" "{}" "+" COMMENT "Format the unit tests with clang-format.") else () diff --git a/test/snippet/var_io/var_io_reader_options.cpp 
b/test/snippet/var_io/var_io_reader_options.cpp index 72fc10f..04f7eb8 100644 --- a/test/snippet/var_io/var_io_reader_options.cpp +++ b/test/snippet/var_io/var_io_reader_options.cpp @@ -19,31 +19,17 @@ int main() //================= SNIPPETS ====================== -{ -//![field_types_vcf_only] -// Only allow VCF-format (not BCF) and use full VCF style representation -bio::var_io::reader_options options{ - .field_types = bio::var_io::field_types_vcf_style<>, - .formats = bio::ttag -}; - -bio::var_io::reader reader{"example.vcf", options}; - -/*...*/ -//![field_types_vcf_only] -} - { //![field_types_deep] // this results in the records becoming "copyable" bio::var_io::reader_options options{ .field_types = bio::var_io::field_types }; -bio::var_io::reader reader{"example.vcf", options}; +// read the entire file, copy all records into a vector; immediately closes file again +std::vector records = bio::var_io::reader{"example.vcf", options} | seqan3::views::to; -// read the entire file, copy all records into a vector -std::vector records = reader | seqan3::views::to; +/* do something else */ -// process them later-on +// process the records later-on for (auto & rec : records) { seqan3::debug_stream << rec.chrom() << ':' diff --git a/test/snippet/var_io/var_io_writer.cpp b/test/snippet/var_io/var_io_writer.cpp index dd800c3..d830334 100644 --- a/test/snippet/var_io/var_io_writer.cpp +++ b/test/snippet/var_io/var_io_writer.cpp @@ -50,7 +50,7 @@ rec.info().push_back({.id = "AF", .value = std::vector{0.5f}}); // AF is vec /* genotypes is vector over bio::var_io::genotype_element */ rec.genotypes().push_back({ .id = "GT", .value = std::vector{"0|0"s, "1|0"s, "1/1"s}}); -// value in genotype is always a vector of size == number of samples; see bio::var_io::dynamic_vector_type +// value in genotype is always a vector of size == number of samples; see bio::var_io::genotype_element_value_type writer.push_back(rec); diff --git a/test/unit/format/bcf_input_test.cpp 
b/test/unit/format/bcf_input_test.cpp index 26378c1..87448b7 100644 --- a/test/unit/format/bcf_input_test.cpp +++ b/test/unit/format/bcf_input_test.cpp @@ -101,7 +101,6 @@ TEST(bcf, iterator_underflow) enum class style { def, - vcf, bcf }; @@ -117,9 +116,7 @@ void field_types() using fields_t = std::conditional_t), - std::conditional_t), - decltype(bio::var_io::field_types_bcf_style)>>; + decltype(bio::var_io::field_types_bcf_style)>; using record_t = bio::record; using int_t = int8_t; @@ -129,24 +126,13 @@ void field_types() if constexpr (s == style::def) recs = example_records_default_style(); - else if constexpr (s == style::vcf) - recs = example_records_vcf_style(); else recs = example_records_bcf_style(); // this workaround is pending clarification in https://github.com/samtools/hts-specs/issues/593 - if constexpr (s == style::vcf) - { - bio::detail::get_second(recs[1].genotypes()).back().push_back(std::vector{mv}); - bio::detail::get_second(recs[2].genotypes()).back().push_back(std::vector{mv}); - bio::detail::get_second(recs[3].genotypes()).back().push_back(std::vector{mv}); - } - else - { - std::get>>(bio::detail::get_second(recs[1].genotypes().back())).push_back({mv}); - std::get>>(bio::detail::get_second(recs[2].genotypes().back())).push_back({mv}); - std::get>>(bio::detail::get_second(recs[3].genotypes().back())).push_back({mv}); - } + std::get>>(bio::detail::get_second(recs[1].genotypes().back())).push_back({mv}); + std::get>>(bio::detail::get_second(recs[2].genotypes().back())).push_back({mv}); + std::get>>(bio::detail::get_second(recs[3].genotypes().back())).push_back({mv}); for (auto & rec : recs) get(rec) = priv; @@ -179,16 +165,6 @@ TEST(bcf, field_types_default_style_deep) field_types(); } -TEST(bcf, field_types_vcf_style_shallow) -{ - field_types(); -} - -TEST(bcf, field_types_vcf_style_deep) -{ - field_types(); -} - TEST(bcf, field_types_bcf_style_shallow) { field_types(); diff --git a/test/unit/format/bcf_output_test.cpp 
b/test/unit/format/bcf_output_test.cpp index 6dfde91..f037c40 100644 --- a/test/unit/format/bcf_output_test.cpp +++ b/test/unit/format/bcf_output_test.cpp @@ -39,8 +39,6 @@ void field_types() { if constexpr (s == style::def) return example_records_default_style(); - else if constexpr (s == style::vcf) - return example_records_vcf_style(); else return example_records_bcf_style(); }(); @@ -62,17 +60,6 @@ TEST(bcf_output, default_style_deep) field_types(); } -// TODO no VCF-style, yet -// TEST(bcf_output, vcf_style_shallow) -// { -// field_types(); -// } -// -// TEST(bcf_output, vcf_style_deep) -// { -// field_types(); -// } - TEST(bcf_output, bcf_style_shallow) { field_types(); diff --git a/test/unit/format/vcf_data.hpp b/test/unit/format/vcf_data.hpp index a14ea90..36e5ad7 100644 --- a/test/unit/format/vcf_data.hpp +++ b/test/unit/format/vcf_data.hpp @@ -9,8 +9,8 @@ #include #include -#include #include +#include #include #include @@ -22,7 +22,7 @@ // https://samtools.github.io/hts-specs/VCFv4.3.pdf inline std::string const example_from_spec_records = -R"(20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. + R"(20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. 20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3 20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4 20 1230237 . T . 
47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 @@ -30,7 +30,7 @@ R"(20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:5 )"; inline std::string const example_from_spec_header = -R"(##fileformat=VCFv4.3 + R"(##fileformat=VCFv4.3 ##fileDate=20090805 ##source=myImputationProgramV3.1 ##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta @@ -54,7 +54,7 @@ R"(##fileformat=VCFv4.3 inline std::string const example_from_spec = example_from_spec_header + example_from_spec_records; inline std::string const example_from_spec_header_regenerated = -R"(##fileformat=VCFv4.3 + R"(##fileformat=VCFv4.3 ##FILTER= ##FILTER= ##FILTER= @@ -77,7 +77,7 @@ R"(##fileformat=VCFv4.3 )"; inline std::string const example_from_spec_header_regenerated_no_IDX = -R"(##fileformat=VCFv4.3 + R"(##fileformat=VCFv4.3 ##FILTER= ##FILTER= ##FILTER= @@ -100,58 +100,59 @@ R"(##fileformat=VCFv4.3 )"; inline std::string const example_from_spec_bgzipped{ -"\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00\x36\x03\x8d" -"\x93\x5b\x73\xda\x3a\x14\x85\x9f\x95\x5f\xe1\x49\xe6\xbc\xf9\x18\x5d\x2c\x19" -"\xc4\x71\x67\x9c\xb8\x06\x66\x12\xc2\xc5\xa7\x7d\x16\x20\x83\x67\x7c\x8b\x25" -"\xda\x32\xd3\x1f\x5f\xd9\x86\x34\x38\x33\x6d\x5e\xb0\x25\x6b\x7d\x6b\xef\xb5" -"\xc5\xdd\x5d\x92\x66\x32\x29\xeb\x5c\x68\xff\xcb\x43\xf4\xcd\x75\xc8\xcd\x5d" -"\xbb\x19\x0a\x2d\x7d\x0c\xe1\x08\x0e\x21\x35\x7b\xaa\x3c\xd6\x5b\xe9\xe7\xa7" -"\x59\x5e\x1d\xb5\xd0\x69\x59\x2c\xea\x72\x5f\x8b\xfc\x0b\x71\x90\x39\x50\xcb" -"\x44\xd6\xb2\x30\x67\x1a\x39\x1f\x0c\x06\x4a\xbe\x0c\x5e\x77\xd5\x00\x41\x08" -"\x27\xb2\x28\x73\xa9\x16\x69\x56\xea\x7f\xe7\x0f\xf7\x33\xc2\x9c\x44\x28\x2d" -"\x0c\x60\x5b\x16\x3a\xdd\xfb\xff\xcd\x42\xe3\x6b\x67\xb2\xd8\xeb\x83\xcf\xb0" -"\x4b\xe8\x88\xb9\xb6\x50\x4a\xe6\x9b\xec\xe4\xdf\x13\x66\xe7\x3b\xea\x27\x08" -"\xb3\xed\x2e\x19\x0a\x26\xe1\xd6\x4b\x88\x37\xda\x31\x34\x4c\x12\xc6\x36\x72" 
-"\x83\x77\xc2\x56\x95\xdc\xa6\x52\xf9\xb7\xd3\x32\x2f\x2d\x25\xaa\x54\x16\xea" -"\xd6\xd6\xe2\x47\x69\x8a\x38\xf9\x3f\x3e\x19\xd3\xea\x20\x54\x5a\xec\xfd\x4a" -"\xd4\x3a\x15\x99\xd9\x99\xcd\xa3\xe7\xb6\x88\xf9\xda\x9e\x1f\xf3\x8d\xac\x7d" -"\x64\xc7\xa7\x4a\xfa\xb3\x42\xcb\xbd\xac\xed\x50\xaa\x6d\x9d\x56\x4d\x04\xfe" -"\x6d\x77\xc4\x2a\x13\x6b\x2d\xf2\x2a\x93\xca\xfa\x9a\xea\x83\x65\xe2\x13\xb7" -"\x9f\xde\xe2\xc2\xc5\x47\x70\x71\xa9\x45\x66\x85\xb2\xd2\x87\x6b\x79\x10\x5d" -"\xe4\x41\x27\x8f\xb2\x52\xe8\x6b\x71\x90\x65\x32\x93\x56\x54\xcb\x97\xa3\xc9" -"\xfc\xd4\x23\x04\xbd\x02\xd6\xba\x36\xad\xf7\x10\xcd\xa8\x74\x6d\x6a\xe8\x60" -"\xbd\x1e\xee\x2f\x08\x78\x29\x42\xf4\x00\xbb\xcd\x7a\xbe\xb0\x72\xd9\x9c\x52" -"\x87\xb4\xb2\xad\xcd\x31\xcd\x76\x16\xc2\xa3\x6b\xd6\x14\xff\x95\x35\x15\xd5" -"\x93\xa8\xf0\x1b\x5a\x8b\x88\x66\x8f\xf1\xe7\x55\x0b\x79\x41\xf0\x5a\xb2\x3c" -"\x8a\x2c\xd5\x27\x6b\x23\xb3\xf2\xbb\x85\x60\x5f\xa0\x68\x4f\xf0\x28\x95\xb2" -"\xf4\x41\x14\x16\x85\xff\x34\x63\x54\xe7\x31\x1e\xc4\x37\x69\xed\x2e\x63\x8c" -"\x9e\x57\x4f\x41\xdc\x22\x26\xf1\x07\x72\x6c\x2e\xba\x36\x1f\xdf\x89\x97\x1f" -"\xb9\x05\x17\xb5\x75\x6e\xa7\x4f\xf9\xd8\x5d\x5a\x49\xb1\x7b\x73\x95\xde\xe8" -"\xa7\xaf\x55\xe0\x3f\xe8\x4d\xfc\xd9\xbb\x32\x1e\xa6\xab\xe7\x27\xb0\x78\x5e" -"\x83\x59\x08\x56\x9f\x23\x10\x3c\xc6\x60\xf9\x7f\xf0\x08\xba\x94\x41\x33\x60" -"\xd0\x79\x81\x79\x60\xfe\xf3\x10\x9d\x9f\xf8\xfc\x24\x37\x18\x02\xe4\x12\x0f" -"\x82\x5a\x31\x48\x5d\x4c\x3d\x30\x01\x01\xc0\x23\xb0\x08\xd6\x6b\x30\x5f\xfb" -"\x64\x1c\x2e\x7c\xe4\x8e\x83\xc8\x87\x0e\x1d\x87\xf7\xe3\x29\x06\x93\x98\x4f" -"\x96\x3c\x5c\xf0\xe9\x12\xc0\x9f\x90\xbb\x43\x8e\x38\x45\x36\x45\x00\x75\xcb" -"\xe1\x65\x39\x40\xdc\x25\x9c\x72\xc7\x76\x5a\x37\x8f\x10\x08\x1c\x10\x1b\x17" -"\x02\xcc\xad\xf9\xed\x81\x3a\x0f\x88\xbc\xf7\xfc\x11\x37\x8c\xa1\x4d\xa1\x59" -"\xa2\xe6\x9d\x33\x6a\x13\x00\x07\xe6\x9b\x59\xb7\x64\x84\x20\x1b\xb1\xb6\x13" -"\x17\x12\x4a\x8d\xc3\xc4\x8e\x01\xf3\x5e\x7b\xc1\xad\x0f\xec\x7c\x08\x21\x36" 
-"\x74\x18\xf3\xc6\x41\xe0\xc7\xa6\xb1\x2b\x57\xf4\x13\x73\x8c\x38\xe3\x98\xd8" -"\xd8\x03\xd8\xb8\x62\x0e\x39\x1a\xda\x18\xe0\x01\xe6\x84\x72\xb7\x75\xc5\x04" -"\x62\xe2\xb5\x1d\x39\xc0\xf5\x7a\xb9\x91\x16\xfe\xae\x1f\xea\x72\x8f\x53\x66" -"\x33\x78\x89\xcf\x3d\xe7\xd5\x74\xc4\x8c\xd7\x99\xed\x52\x53\x7e\x9e\x6e\xeb" -"\x52\x09\x8d\x0c\xe7\xc1\xf4\x64\x7e\x63\x60\xa2\xb8\xb2\x1a\x35\x4e\x93\x57" -"\x27\x03\x42\x6d\x91\xe6\x05\x73\xe4\x71\xdc\x8d\x02\x72\x72\xf3\x0b\xb2\x85" -"\x75\xde\x6d\x06\x00\x00\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42" -"\x43\x02\x00\x1b\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00", 851}; + "\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00\x36\x03\x8d" + "\x93\x5b\x73\xda\x3a\x14\x85\x9f\x95\x5f\xe1\x49\xe6\xbc\xf9\x18\x5d\x2c\x19" + "\xc4\x71\x67\x9c\xb8\x06\x66\x12\xc2\xc5\xa7\x7d\x16\x20\x83\x67\x7c\x8b\x25" + "\xda\x32\xd3\x1f\x5f\xd9\x86\x34\x38\x33\x6d\x5e\xb0\x25\x6b\x7d\x6b\xef\xb5" + "\xc5\xdd\x5d\x92\x66\x32\x29\xeb\x5c\x68\xff\xcb\x43\xf4\xcd\x75\xc8\xcd\x5d" + "\xbb\x19\x0a\x2d\x7d\x0c\xe1\x08\x0e\x21\x35\x7b\xaa\x3c\xd6\x5b\xe9\xe7\xa7" + "\x59\x5e\x1d\xb5\xd0\x69\x59\x2c\xea\x72\x5f\x8b\xfc\x0b\x71\x90\x39\x50\xcb" + "\x44\xd6\xb2\x30\x67\x1a\x39\x1f\x0c\x06\x4a\xbe\x0c\x5e\x77\xd5\x00\x41\x08" + "\x27\xb2\x28\x73\xa9\x16\x69\x56\xea\x7f\xe7\x0f\xf7\x33\xc2\x9c\x44\x28\x2d" + "\x0c\x60\x5b\x16\x3a\xdd\xfb\xff\xcd\x42\xe3\x6b\x67\xb2\xd8\xeb\x83\xcf\xb0" + "\x4b\xe8\x88\xb9\xb6\x50\x4a\xe6\x9b\xec\xe4\xdf\x13\x66\xe7\x3b\xea\x27\x08" + "\xb3\xed\x2e\x19\x0a\x26\xe1\xd6\x4b\x88\x37\xda\x31\x34\x4c\x12\xc6\x36\x72" + "\x83\x77\xc2\x56\x95\xdc\xa6\x52\xf9\xb7\xd3\x32\x2f\x2d\x25\xaa\x54\x16\xea" + "\xd6\xd6\xe2\x47\x69\x8a\x38\xf9\x3f\x3e\x19\xd3\xea\x20\x54\x5a\xec\xfd\x4a" + "\xd4\x3a\x15\x99\xd9\x99\xcd\xa3\xe7\xb6\x88\xf9\xda\x9e\x1f\xf3\x8d\xac\x7d" + "\x64\xc7\xa7\x4a\xfa\xb3\x42\xcb\xbd\xac\xed\x50\xaa\x6d\x9d\x56\x4d\x04\xfe" + 
"\x6d\x77\xc4\x2a\x13\x6b\x2d\xf2\x2a\x93\xca\xfa\x9a\xea\x83\x65\xe2\x13\xb7" + "\x9f\xde\xe2\xc2\xc5\x47\x70\x71\xa9\x45\x66\x85\xb2\xd2\x87\x6b\x79\x10\x5d" + "\xe4\x41\x27\x8f\xb2\x52\xe8\x6b\x71\x90\x65\x32\x93\x56\x54\xcb\x97\xa3\xc9" + "\xfc\xd4\x23\x04\xbd\x02\xd6\xba\x36\xad\xf7\x10\xcd\xa8\x74\x6d\x6a\xe8\x60" + "\xbd\x1e\xee\x2f\x08\x78\x29\x42\xf4\x00\xbb\xcd\x7a\xbe\xb0\x72\xd9\x9c\x52" + "\x87\xb4\xb2\xad\xcd\x31\xcd\x76\x16\xc2\xa3\x6b\xd6\x14\xff\x95\x35\x15\xd5" + "\x93\xa8\xf0\x1b\x5a\x8b\x88\x66\x8f\xf1\xe7\x55\x0b\x79\x41\xf0\x5a\xb2\x3c" + "\x8a\x2c\xd5\x27\x6b\x23\xb3\xf2\xbb\x85\x60\x5f\xa0\x68\x4f\xf0\x28\x95\xb2" + "\xf4\x41\x14\x16\x85\xff\x34\x63\x54\xe7\x31\x1e\xc4\x37\x69\xed\x2e\x63\x8c" + "\x9e\x57\x4f\x41\xdc\x22\x26\xf1\x07\x72\x6c\x2e\xba\x36\x1f\xdf\x89\x97\x1f" + "\xb9\x05\x17\xb5\x75\x6e\xa7\x4f\xf9\xd8\x5d\x5a\x49\xb1\x7b\x73\x95\xde\xe8" + "\xa7\xaf\x55\xe0\x3f\xe8\x4d\xfc\xd9\xbb\x32\x1e\xa6\xab\xe7\x27\xb0\x78\x5e" + "\x83\x59\x08\x56\x9f\x23\x10\x3c\xc6\x60\xf9\x7f\xf0\x08\xba\x94\x41\x33\x60" + "\xd0\x79\x81\x79\x60\xfe\xf3\x10\x9d\x9f\xf8\xfc\x24\x37\x18\x02\xe4\x12\x0f" + "\x82\x5a\x31\x48\x5d\x4c\x3d\x30\x01\x01\xc0\x23\xb0\x08\xd6\x6b\x30\x5f\xfb" + "\x64\x1c\x2e\x7c\xe4\x8e\x83\xc8\x87\x0e\x1d\x87\xf7\xe3\x29\x06\x93\x98\x4f" + "\x96\x3c\x5c\xf0\xe9\x12\xc0\x9f\x90\xbb\x43\x8e\x38\x45\x36\x45\x00\x75\xcb" + "\xe1\x65\x39\x40\xdc\x25\x9c\x72\xc7\x76\x5a\x37\x8f\x10\x08\x1c\x10\x1b\x17" + "\x02\xcc\xad\xf9\xed\x81\x3a\x0f\x88\xbc\xf7\xfc\x11\x37\x8c\xa1\x4d\xa1\x59" + "\xa2\xe6\x9d\x33\x6a\x13\x00\x07\xe6\x9b\x59\xb7\x64\x84\x20\x1b\xb1\xb6\x13" + "\x17\x12\x4a\x8d\xc3\xc4\x8e\x01\xf3\x5e\x7b\xc1\xad\x0f\xec\x7c\x08\x21\x36" + "\x74\x18\xf3\xc6\x41\xe0\xc7\xa6\xb1\x2b\x57\xf4\x13\x73\x8c\x38\xe3\x98\xd8" + "\xd8\x03\xd8\xb8\x62\x0e\x39\x1a\xda\x18\xe0\x01\xe6\x84\x72\xb7\x75\xc5\x04" + "\x62\xe2\xb5\x1d\x39\xc0\xf5\x7a\xb9\x91\x16\xfe\xae\x1f\xea\x72\x8f\x53\x66" + 
"\x33\x78\x89\xcf\x3d\xe7\xd5\x74\xc4\x8c\xd7\x99\xed\x52\x53\x7e\x9e\x6e\xeb" + "\x52\x09\x8d\x0c\xe7\xc1\xf4\x64\x7e\x63\x60\xa2\xb8\xb2\x1a\x35\x4e\x93\x57" + "\x27\x03\x42\x6d\x91\xe6\x05\x73\xe4\x71\xdc\x8d\x02\x72\x72\xf3\x0b\xb2\x85" + "\x75\xde\x6d\x06\x00\x00\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42" + "\x43\x02\x00\x1b\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00", + 851}; //============================================================================= // minimal fields //============================================================================= inline std::string const minimal_field_rows = -R"(20 14370 . G . . . . . . . . + R"(20 14370 . G . . . . . . . . 20 17330 . T . . . . . . . . 20 1110696 . A . . . . . . . . 20 1230237 . T . . . . . . . . @@ -163,7 +164,7 @@ R"(20 14370 . G . . . . . . . . //============================================================================= inline std::string const incomplete_header_before = -R"(##fileformat=VCFv4.3 + R"(##fileformat=VCFv4.3 ##FILTER= ##INFO= ##FORMAT= @@ -175,7 +176,7 @@ R"(##fileformat=VCFv4.3 )"; inline std::string const incomplete_header_after = -R"(##fileformat=VCFv4.3 + R"(##fileformat=VCFv4.3 ##FILTER= ##FILTER= ##INFO= @@ -196,7 +197,6 @@ R"(##fileformat=VCFv4.3 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 )"; - //============================================================================= // records //============================================================================= @@ -213,7 +213,7 @@ namespace seqan3 { template -//!\cond + //!\cond requires std::same_as, bio::var_io::record_private_data> //!\endcond inline debug_stream_type & operator<<(debug_stream_type & s, byte_type &&) @@ -221,7 +221,6 @@ inline debug_stream_type & operator<<(debug_stream_type & s, byt return s; } - template requires bio::detail::aggregate_of_two> inline debug_stream_type & operator<<(debug_stream_type & s, agg_t && agg) @@ -230,14 +229,16 @@ inline debug_stream_type & 
operator<<(debug_stream_type & s, agg return s; } - -} // namespace bio +} // namespace seqan3 template auto make_ref(std::string_view const str) { if constexpr (own == bio::ownership::shallow) - return tf_view{{str, {}}, {}}; + return tf_view{ + {str, {}}, + { } + }; else return str | seqan3::views::char_strictly_to | seqan3::views::to; } @@ -245,17 +246,15 @@ auto make_ref(std::string_view const str) template auto example_records_default_style() { - using record_t = bio::record)>; + using record_t = bio::record)>; bio::var_io::record_private_data priv{}; - constexpr int_t mv = bio::var_io::missing_value; - using ivec = std::vector; - using ivecvec = std::vector>; - using fvec = std::vector; - using svec = std::conditional_t, - std::vector>; + constexpr int_t mv = bio::var_io::missing_value; + using ivec = std::vector; + using ivecvec = std::vector>; + using fvec = std::vector; + using svec = + std::conditional_t, std::vector>; // clang-format off std::vector recs{ @@ -267,48 +266,22 @@ auto example_records_default_style() }; // clang-format on - return recs; - -} - -template -auto example_records_vcf_style() -{ - using record_t = bio::record)>; - - bio::var_io::record_private_data priv{}; - constexpr int_t mv = bio::var_io::missing_value; - using ivec = std::vector; - using fvec = std::vector; - - // clang-format off - std::vector recs{ - {"20", 14370, "rs6054257", make_ref("G"), {"A"}, 29, {"PASS"}, {{"NS",(int_t)3}, {"DP", (int_t)14}, {"AF", fvec{0.5f}}, {"DB", true}, {"H2", true} }, { {"GT", "GQ", "DP", "HQ"}, {{"0|0", (int_t)48,(int_t)1,ivec{51,51}}, {"1|0",(int_t)48,(int_t)8,ivec{51,51}}, {"1/1",(int_t)43,(int_t)5,ivec{mv,mv}}}}, priv}, - {"20", 17330, ".", make_ref("T"), {"A"}, 3, {"q10"}, {{"NS",(int_t)3}, {"DP", (int_t)11}, {"AF", fvec{0.017f}} }, { {"GT", "GQ", "DP", "HQ"}, {{"0|0", (int_t)49,(int_t)3,ivec{58,50}}, {"0|1",(int_t) 3,(int_t)5,ivec{65, 3}}, {"0/0",(int_t)41,(int_t)3 }}}, priv}, - {"20", 1110696, "rs6040355", make_ref("A"), {"G","T"}, 67, 
{"PASS"}, {{"NS",(int_t)2}, {"DP", (int_t)10}, {"AF", fvec{0.333f,0.667f}}, {"AA", "T"}, {"DB", true}}, { {"GT", "GQ", "DP", "HQ"}, {{"1|2", (int_t)21,(int_t)6,ivec{23,27}}, {"2|1",(int_t) 2,(int_t)0,ivec{18, 2}}, {"2/2",(int_t)35,(int_t)4 }}}, priv}, - {"20", 1230237, ".", make_ref("T"), {}, 47, {"PASS"}, {{"NS",(int_t)3}, {"DP", (int_t)13}, {"AA", "T"} }, { {"GT", "GQ", "DP", "HQ"}, {{"0|0", (int_t)54,(int_t)7,ivec{56,60}}, {"0|0",(int_t)48,(int_t)4,ivec{51,51}}, {"0/0",(int_t)61,(int_t)2 }}}, priv}, - {"20", 1234567, "microsat1", make_ref("GTC"), {"G","GTCT"}, 50, {"PASS"}, {{"NS",(int_t)3}, {"DP", (int_t)9 }, {"AA", "G"} }, { {"GT", "GQ", "DP" }, {{"0/1", (int_t)35,(int_t)4 }, {"0/2",(int_t)17,(int_t)2 }, {"1/1",(int_t)40,(int_t)3 }}}, priv} - }; - // clang-format on - return recs; } template auto example_records_bcf_style() { - using record_t = bio::record)>; + using record_t = + bio::record)>; bio::var_io::record_private_data priv{}; - constexpr int_t mv = bio::var_io::missing_value; - using ivec = std::vector; - using ivecvec = std::vector>; - using fvec = std::vector; - using svec = std::conditional_t, - std::vector>; + constexpr int_t mv = bio::var_io::missing_value; + using ivec = std::vector; + using ivecvec = std::vector>; + using fvec = std::vector; + using svec = + std::conditional_t, std::vector>; // clang-format off std::vector recs{ @@ -328,12 +301,13 @@ auto example_records_novariant() using namespace std::string_view_literals; bio::var_io::record_private_data priv{}; - constexpr int32_t mv = bio::var_io::missing_value; - using ivec = std::vector; - using ivecvec = std::vector>; - using fvec = std::vector; - using svec = std::vector; + constexpr int32_t mv = bio::var_io::missing_value; + using ivec = std::vector; + using ivecvec = std::vector>; + using fvec = std::vector; + using svec = std::vector; + // clang-format off auto rec0 = bio::make_record(bio::var_io::default_field_ids, "20", 14370, @@ -350,7 +324,7 @@ auto example_records_novariant() 
std::tuple{std::pair{"GT", svec{"0|0", "1|0", "1/1"}}, std::pair{"GQ", ivec{48, 48, 43}}, std::pair{"DP", ivec{1, 8, 5}}, - std::pair{"HQ", ivecvec{{51,51}, {51,51}, {mv,mv} }}}, + std::pair{"HQ", ivecvec{{51, 51}, {51, 51}, {mv, mv}}}}, priv); auto rec1 = bio::make_record(bio::var_io::default_field_ids, @@ -367,7 +341,7 @@ auto example_records_novariant() std::tuple{std::pair{"GT", svec{"0|0", "0|1", "0/0"}}, std::pair{"GQ", ivec{49, 3, 41}}, std::pair{"DP", ivec{3, 5, 3}}, - std::pair{"HQ", ivecvec{{58,50}, {65, 3}}}}, + std::pair{"HQ", ivecvec{{58, 50}, {65, 3}}}}, priv); auto rec2 = bio::make_record(bio::var_io::default_field_ids, @@ -386,7 +360,7 @@ auto example_records_novariant() std::tuple{std::pair{"GT", svec{"1|2", "2|1", "2/2"}}, std::pair{"GQ", ivec{21, 2, 35}}, std::pair{"DP", ivec{6, 0, 4}}, - std::pair{"HQ", ivecvec{{23,27}, {18, 2}}}}, + std::pair{"HQ", ivecvec{{23, 27}, {18, 2}}}}, priv); auto rec3 = bio::make_record(bio::var_io::default_field_ids, @@ -403,8 +377,9 @@ auto example_records_novariant() std::tuple{std::pair{"GT", svec{"0|0", "0|0", "0/0"}}, std::pair{"GQ", ivec{54, 48, 61}}, std::pair{"DP", ivec{7, 4, 2}}, - std::pair{"HQ", ivecvec{{56,60}, {51,51}}}}, + std::pair{"HQ", ivecvec{{56, 60}, {51, 51}}}}, priv); + auto rec4 = bio::make_record(bio::var_io::default_field_ids, "20", 1234567, @@ -420,107 +395,6 @@ auto example_records_novariant() std::pair{"GQ", ivec{35, 17, 40}}, std::pair{"DP", ivec{4, 2, 3}}}, priv); - - return std::tuple{rec0, rec1, rec2, rec3, rec4}; -} - -auto example_records_novariant_vcf_style_genotypes() -{ - using namespace std::string_view_literals; - - bio::var_io::record_private_data priv{}; - constexpr int32_t mv = bio::var_io::missing_value; - using ivec = std::vector; - using fvec = std::vector; - using svec = std::vector; - - auto rec0 = bio::make_record(bio::var_io::default_field_ids, - "20", - 14370, - "rs6054257", - "G", - svec{"A"}, - 29.0, - svec{"PASS"}, - std::tuple{std::pair{"NS", 3}, - std::pair{"DP", 
14}, - std::pair{"AF", fvec{0.5f}}, - std::pair{"DB", true}, - std::pair{"H2", true}}, - std::pair{svec{"GT", "GQ", "DP", "HQ"}, - std::tuple{std::tuple{"0|0", 48,1,ivec{51,51}}, - std::tuple{"1|0",48,8,ivec{51,51}}, - std::tuple{"1/1",43,5,ivec{mv,mv}}}}, - priv); - - auto rec1 = bio::make_record(bio::var_io::default_field_ids, - "20", - 17330, - ".", - "T", - svec{"A"}, - 3.0, - svec{"q10"}, - std::tuple{std::pair{"NS", 3}, - std::pair{"DP", 11}, - std::pair{"AF", fvec{0.017f}}}, - std::pair{svec{"GT", "GQ", "DP", "HQ"}, - std::tuple{std::tuple{"0|0",49,3,ivec{58,50}}, - std::tuple{"0|1", 3,5,ivec{65, 3}}, - std::tuple{"0/0",41,3}}}, - priv); - - auto rec2 = bio::make_record(bio::var_io::default_field_ids, - "20", - 1110696, - "rs6040355", - "A", - svec{"G","T"}, - 67, - svec{"PASS"}, - std::tuple{std::pair{"NS",2}, - std::pair{"DP", 10}, - std::pair{"AF", fvec{0.333f,0.667f}}, - std::pair{"AA", "T"}, - std::pair{"DB", true}}, - std::pair{svec{"GT", "GQ", "DP", "HQ"}, - std::tuple{std::tuple{"1|2",21,6,ivec{23,27}}, - std::tuple{"2|1", 2,0,ivec{18, 2}}, - std::tuple{"2/2",35,4}}}, - priv); - - auto rec3 = bio::make_record(bio::var_io::default_field_ids, - "20", - 1230237, - ".", - "T", - svec{}, - 47, - svec{"PASS"}, - std::tuple{std::pair{"NS",3}, - std::pair{"DP", 13}, - std::pair{"AA", "T"} }, - std::pair{svec{"GT", "GQ", "DP", "HQ"}, - std::tuple{std::tuple{"0|0",54,7,ivec{56,60}}, - std::tuple{"0|0",48,4,ivec{51,51}}, - std::tuple{"0/0",61,2}}}, - priv); - auto rec4 = bio::make_record(bio::var_io::default_field_ids, - "20", - 1234567, - "microsat1", - "GTC", - svec{"G","GTCT"}, - 50, - svec{"PASS"}, - std::tuple{std::pair{"NS",3}, - std::pair{"DP", 9 }, - std::pair{"AA", "G"}}, - std::pair{svec{"GT", "GQ", "DP"}, - std::tuple{std::tuple{"0/1",35,4}, - std::tuple{"0/2",17,2}, - std::tuple{"1/1",40,3}}}, - priv); - + // clang-format on return std::tuple{rec0, rec1, rec2, rec3, rec4}; } diff --git a/test/unit/format/vcf_input_test.cpp 
b/test/unit/format/vcf_input_test.cpp index 887be62..a521ac9 100644 --- a/test/unit/format/vcf_input_test.cpp +++ b/test/unit/format/vcf_input_test.cpp @@ -33,17 +33,13 @@ void field_types() using fields_t = std::conditional_t), - std::conditional_t), - decltype(bio::var_io::field_types_bcf_style)>>; + decltype(bio::var_io::field_types_bcf_style)>; using record_t = bio::record; std::vector recs; if constexpr (s == style::def) recs = example_records_default_style(); - else if constexpr (s == style::vcf) - recs = example_records_vcf_style(); else recs = example_records_bcf_style(); @@ -78,16 +74,6 @@ TEST(vcf, field_types_default_style_deep) field_types(); } -TEST(vcf, field_types_vcf_style_shallow) -{ - field_types(); -} - -TEST(vcf, field_types_vcf_style_deep) -{ - field_types(); -} - TEST(vcf, field_types_bcf_style_shallow) { field_types(); @@ -106,8 +92,7 @@ TEST(vcf, incomplete_header) std::istringstream istr{incomplete_header_before + example_from_spec_records}; - using record_t = - bio::record)>; + using record_t = bio::record)>; bio::format_input_handler handler{istr, bio::var_io::reader_options{.print_warnings = false}}; @@ -115,7 +100,7 @@ TEST(vcf, incomplete_header) bio::var_io::header const & hdr = handler.get_header(); - auto recs = example_records_vcf_style(); + auto recs = example_records_default_style(); for (auto & rec : recs) get(rec) = priv; diff --git a/test/unit/format/vcf_output_test.cpp b/test/unit/format/vcf_output_test.cpp index aa36d22..45b8ea1 100644 --- a/test/unit/format/vcf_output_test.cpp +++ b/test/unit/format/vcf_output_test.cpp @@ -18,7 +18,6 @@ enum class style { def, - vcf, bcf }; @@ -38,8 +37,6 @@ void field_types() { if constexpr (s == style::def) return example_records_default_style(); - else if constexpr (s == style::vcf) - return example_records_vcf_style(); else return example_records_bcf_style(); }(); @@ -61,16 +58,6 @@ TEST(vcf_output, default_style_deep) field_types(); } -TEST(vcf_output, vcf_style_shallow) -{ - 
field_types(); -} - -TEST(vcf_output, vcf_style_deep) -{ - field_types(); -} - TEST(vcf_output, bcf_style_shallow) { field_types(); @@ -99,22 +86,3 @@ TEST(vcf_output, novariant) EXPECT_EQ(ostr.str(), example_from_spec_header_regenerated_no_IDX + example_from_spec_records); } - -TEST(vcf_output, novariant_vcf_style_genotypes) -{ - std::ostringstream ostr{}; - - { - bio::format_output_handler handler{ostr, bio::var_io::writer_options{}}; - - bio::var_io::header hdr{example_from_spec_header}; - hdr.add_missing(); - handler.set_header(std::move(hdr)); - - auto records = example_records_novariant_vcf_style_genotypes(); // < records is a tuple here - - std::apply([&](auto &... recs) { (handler.write_record(recs), ...); }, records); - } - - EXPECT_EQ(ostr.str(), example_from_spec_header_regenerated_no_IDX + example_from_spec_records); -} diff --git a/test/unit/stream/data.hpp b/test/unit/stream/data.hpp index 1e0db22..a1f3679 100644 --- a/test/unit/stream/data.hpp +++ b/test/unit/stream/data.hpp @@ -18,29 +18,33 @@ template inline constexpr std::string_view compressed = ""; template <> -inline constexpr std::string_view compressed = "The quick brown fox jumps over the lazy dog"; +inline constexpr std::string_view compressed = + "The quick brown fox jumps over the lazy dog"; template <> inline constexpr std::string_view compressed{ -"\x1F\x8B\x08\x04\x00\x00\x00\x00\x00\x00\x06\x00\x42\x43" -"\x02\x00\x45\x00\x0B\xC9\x48\x55\x28\x2C\xCD\x4C\xCE\x56" -"\x48\x2A\xCA\x2F\xCF\x53\x48\xCB\xAF\x50\xC8\x2A\xCD\x2D" -"\x28\x56\xC8\x2F\x4B\x2D\x52\x28\x01\x4A\xE7\x24\x56\x55" -"\x2A\xA4\xE4\xA7\x03\x00\x39\xA3\x4F\x41\x2B\x00\x00\x00" -"\x1F\x8B\x08\x04\x00\x00\x00\x00\x00\xFF\x06\x00\x42\x43" -"\x02\x00\x1B\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00", 98}; + "\x1F\x8B\x08\x04\x00\x00\x00\x00\x00\x00\x06\x00\x42\x43" + "\x02\x00\x45\x00\x0B\xC9\x48\x55\x28\x2C\xCD\x4C\xCE\x56" + "\x48\x2A\xCA\x2F\xCF\x53\x48\xCB\xAF\x50\xC8\x2A\xCD\x2D" + 
"\x28\x56\xC8\x2F\x4B\x2D\x52\x28\x01\x4A\xE7\x24\x56\x55" + "\x2A\xA4\xE4\xA7\x03\x00\x39\xA3\x4F\x41\x2B\x00\x00\x00" + "\x1F\x8B\x08\x04\x00\x00\x00\x00\x00\xFF\x06\x00\x42\x43" + "\x02\x00\x1B\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00", + 98}; template <> inline constexpr std::string_view compressed{ -"\x42\x5A\x68\x39\x31\x41\x59\x26\x53\x59\x45\x9D\xEE\x61\x00\x00" -"\x04\x13\x80\x40\x00\x04\x00\x3F\xFF\xFF\xF0\x20\x00\x31\x46\x86" -"\x80\x00\x00\x31\xE9\xA9\xA6\x4C\x86\x11\xB4\x6D\x47\x62\x62\x08" -"\x49\xED\x7A\xA1\x53\x65\x65\xB1\x25\xE3\xE2\x60\xB1\xF8\x98\x39" -"\xDD\x4C\x09\x6F\x9C\xE8\x5D\xC9\x14\xE1\x42\x41\x16\x77\xB9\x84", 80}; + "\x42\x5A\x68\x39\x31\x41\x59\x26\x53\x59\x45\x9D\xEE\x61\x00\x00" + "\x04\x13\x80\x40\x00\x04\x00\x3F\xFF\xFF\xF0\x20\x00\x31\x46\x86" + "\x80\x00\x00\x31\xE9\xA9\xA6\x4C\x86\x11\xB4\x6D\x47\x62\x62\x08" + "\x49\xED\x7A\xA1\x53\x65\x65\xB1\x25\xE3\xE2\x60\xB1\xF8\x98\x39" + "\xDD\x4C\x09\x6F\x9C\xE8\x5D\xC9\x14\xE1\x42\x41\x16\x77\xB9\x84", + 80}; template <> inline constexpr std::string_view compressed{ -"\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x03\x0b\xc9\x48\x55\x28\x2c" -"\xcd\x4c\xce\x56\x48\x2a\xca\x2f\xcf\x53\x48\xcb\xaf\x50\xc8\x2a" -"\xcd\x2d\x28\x56\xc8\x2f\x4b\x2d\x52\x28\x01\x4a\xe7\x24\x56\x55" -"\x2a\xa4\xe4\xa7\x03\x00\x39\xa3\x4f\x41\x2b\x00\x00\x00", 62}; + "\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x03\x0b\xc9\x48\x55\x28\x2c" + "\xcd\x4c\xce\x56\x48\x2a\xca\x2f\xcf\x53\x48\xcb\xaf\x50\xc8\x2a" + "\xcd\x2d\x28\x56\xc8\x2f\x4b\x2d\x52\x28\x01\x4a\xe7\x24\x56\x55" + "\x2a\xa4\xe4\xa7\x03\x00\x39\xa3\x4f\x41\x2b\x00\x00\x00", + 62}; diff --git a/test/unit/stream/istream_test_template.hpp b/test/unit/stream/istream_test_template.hpp index b9cd92c..1f03e7c 100644 --- a/test/unit/stream/istream_test_template.hpp +++ b/test/unit/stream/istream_test_template.hpp @@ -11,8 +11,8 @@ #include #include -#include #include +#include #include #include @@ -31,8 +31,8 @@ void regular() } std::ifstream fi{filename.get_path(), 
std::ios::binary}; - stream_t comp{fi}; - std::string buffer{std::istreambuf_iterator{comp}, std::istreambuf_iterator{}}; + stream_t comp{fi}; + std::string buffer{std::istreambuf_iterator{comp}, std::istreambuf_iterator{}}; EXPECT_EQ(buffer, uncompressed); } @@ -48,9 +48,9 @@ void type_erased() fi << compressed; } - std::ifstream fi{filename.get_path(), std::ios::binary}; + std::ifstream fi{filename.get_path(), std::ios::binary}; std::unique_ptr comp{new stream_t{fi}}; - std::string buffer{std::istreambuf_iterator{*comp}, std::istreambuf_iterator{}}; + std::string buffer{std::istreambuf_iterator{*comp}, std::istreambuf_iterator{}}; EXPECT_EQ(buffer, uncompressed); } diff --git a/test/unit/stream/ostream_test_template.hpp b/test/unit/stream/ostream_test_template.hpp index 2d2eb66..55d5c85 100644 --- a/test/unit/stream/ostream_test_template.hpp +++ b/test/unit/stream/ostream_test_template.hpp @@ -11,8 +11,8 @@ #include #include -#include #include +#include #include #include @@ -40,7 +40,7 @@ void regular() } std::ifstream fi{filename.get_path(), std::ios::binary}; - std::string buffer{std::istreambuf_iterator{fi}, std::istreambuf_iterator{}}; + std::string buffer{std::istreambuf_iterator{fi}, std::istreambuf_iterator{}}; if constexpr (f == bio::compression_format::bgzf) buffer[9] = '\x00'; // zero-out the OS byte. @@ -58,7 +58,9 @@ void type_erased() if constexpr (std::same_as) { - std::unique_ptr ogzf{new stream_t{of, {.compression = f}}}; + std::unique_ptr ogzf{ + new stream_t{of, {.compression = f}} + }; *ogzf << uncompressed << std::flush; } else @@ -69,11 +71,10 @@ void type_erased() } std::ifstream fi{filename.get_path(), std::ios::binary}; - std::string buffer{std::istreambuf_iterator{fi}, std::istreambuf_iterator{}}; + std::string buffer{std::istreambuf_iterator{fi}, std::istreambuf_iterator{}}; if constexpr (f == bio::compression_format::bgzf) buffer[9] = '\x00'; // zero-out the OS byte. 
EXPECT_EQ(buffer, compressed); } - diff --git a/test/unit/var_io/var_io_header_test.cpp b/test/unit/var_io/var_io_header_test.cpp index 431d788..41f5d39 100644 --- a/test/unit/var_io/var_io_header_test.cpp +++ b/test/unit/var_io/var_io_header_test.cpp @@ -51,7 +51,7 @@ TEST(var_io_header, spec_from_text) // info 0 EXPECT_EQ(hdr.infos[0].id, "NS"); EXPECT_EQ(hdr.infos[0].number, 1); - EXPECT_EQ(hdr.infos[0].type, bio::var_io::dynamic_type_id::int32); + EXPECT_EQ(hdr.infos[0].type, bio::var_io::value_type_id::int32); EXPECT_EQ(hdr.infos[0].description, "\"Number of Samples With Data\""); EXPECT_EQ(hdr.infos[0].other_fields.size(), 0); EXPECT_EQ(hdr.infos[0].idx, 1); @@ -59,7 +59,7 @@ TEST(var_io_header, spec_from_text) // info 1 EXPECT_EQ(hdr.infos[1].id, "DP"); EXPECT_EQ(hdr.infos[1].number, 1); - EXPECT_EQ(hdr.infos[1].type, bio::var_io::dynamic_type_id::int32); + EXPECT_EQ(hdr.infos[1].type, bio::var_io::value_type_id::int32); EXPECT_EQ(hdr.infos[1].description, "\"Total Depth\""); EXPECT_EQ(hdr.infos[1].other_fields.size(), 0); EXPECT_EQ(hdr.infos[1].idx, 2); @@ -67,7 +67,7 @@ TEST(var_io_header, spec_from_text) // info 2 EXPECT_EQ(hdr.infos[2].id, "AF"); EXPECT_EQ(hdr.infos[2].number, bio::var_io::header_number::A); - EXPECT_EQ(hdr.infos[2].type, bio::var_io::dynamic_type_id::vector_of_float32); + EXPECT_EQ(hdr.infos[2].type, bio::var_io::value_type_id::vector_of_float32); EXPECT_EQ(hdr.infos[2].description, "\"Allele Frequency\""); EXPECT_EQ(hdr.infos[2].other_fields.size(), 0); EXPECT_EQ(hdr.infos[2].idx, 3); @@ -75,7 +75,7 @@ TEST(var_io_header, spec_from_text) // info 3 EXPECT_EQ(hdr.infos[3].id, "AA"); EXPECT_EQ(hdr.infos[3].number, 1); - EXPECT_EQ(hdr.infos[3].type, bio::var_io::dynamic_type_id::string); + EXPECT_EQ(hdr.infos[3].type, bio::var_io::value_type_id::string); EXPECT_EQ(hdr.infos[3].description, "\"Ancestral Allele\""); EXPECT_EQ(hdr.infos[3].other_fields.size(), 0); EXPECT_EQ(hdr.infos[3].idx, 4); @@ -83,7 +83,7 @@ TEST(var_io_header, 
spec_from_text) // info 4 EXPECT_EQ(hdr.infos[4].id, "DB"); EXPECT_EQ(hdr.infos[4].number, 0); - EXPECT_EQ(hdr.infos[4].type, bio::var_io::dynamic_type_id::flag); + EXPECT_EQ(hdr.infos[4].type, bio::var_io::value_type_id::flag); EXPECT_EQ(hdr.infos[4].description, "\"dbSNP membership, build 129\""); EXPECT_EQ(hdr.infos[4].other_fields.size(), 0); EXPECT_EQ(hdr.infos[4].idx, 5); @@ -91,7 +91,7 @@ TEST(var_io_header, spec_from_text) // info 5 EXPECT_EQ(hdr.infos[5].id, "H2"); EXPECT_EQ(hdr.infos[5].number, 0); - EXPECT_EQ(hdr.infos[5].type, bio::var_io::dynamic_type_id::flag); + EXPECT_EQ(hdr.infos[5].type, bio::var_io::value_type_id::flag); EXPECT_EQ(hdr.infos[5].description, "\"HapMap2 membership\""); EXPECT_EQ(hdr.infos[5].other_fields.size(), 0); EXPECT_EQ(hdr.infos[5].idx, 6); @@ -102,7 +102,7 @@ TEST(var_io_header, spec_from_text) // format 0 EXPECT_EQ(hdr.formats[0].id, "GT"); EXPECT_EQ(hdr.formats[0].number, 1); - EXPECT_EQ(hdr.formats[0].type, bio::var_io::dynamic_type_id::string); + EXPECT_EQ(hdr.formats[0].type, bio::var_io::value_type_id::string); EXPECT_EQ(hdr.formats[0].description, "\"Genotype\""); EXPECT_EQ(hdr.formats[0].other_fields.size(), 0); EXPECT_EQ(hdr.formats[0].idx, 9); @@ -110,7 +110,7 @@ TEST(var_io_header, spec_from_text) // format 1 EXPECT_EQ(hdr.formats[1].id, "GQ"); EXPECT_EQ(hdr.formats[1].number, 1); - EXPECT_EQ(hdr.formats[1].type, bio::var_io::dynamic_type_id::int32); + EXPECT_EQ(hdr.formats[1].type, bio::var_io::value_type_id::int32); EXPECT_EQ(hdr.formats[1].description, "\"Genotype Quality\""); EXPECT_EQ(hdr.formats[1].other_fields.size(), 0); EXPECT_EQ(hdr.formats[1].idx, 10); @@ -118,7 +118,7 @@ TEST(var_io_header, spec_from_text) // format 2 EXPECT_EQ(hdr.formats[2].id, "DP"); EXPECT_EQ(hdr.formats[2].number, 1); - EXPECT_EQ(hdr.formats[2].type, bio::var_io::dynamic_type_id::int32); + EXPECT_EQ(hdr.formats[2].type, bio::var_io::value_type_id::int32); EXPECT_EQ(hdr.formats[2].description, "\"Read Depth\""); 
EXPECT_EQ(hdr.formats[2].other_fields.size(), 0); EXPECT_EQ(hdr.formats[2].idx, 2); @@ -126,7 +126,7 @@ TEST(var_io_header, spec_from_text) // format 3 EXPECT_EQ(hdr.formats[3].id, "HQ"); EXPECT_EQ(hdr.formats[3].number, 2); - EXPECT_EQ(hdr.formats[3].type, bio::var_io::dynamic_type_id::vector_of_int32); + EXPECT_EQ(hdr.formats[3].type, bio::var_io::value_type_id::vector_of_int32); EXPECT_EQ(hdr.formats[3].description, "\"Haplotype Quality\""); EXPECT_EQ(hdr.formats[3].other_fields.size(), 0); EXPECT_EQ(hdr.formats[3].idx, 11);