SeqAn3 3.3.0-rc.1
The Modern C++ library for sequence analysis.
format_bam.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <bit>
16#include <iterator>
17#include <ranges>
18#include <string>
19#include <vector>
20
33
34namespace seqan3
35{
36
49class format_bam : private detail::format_sam_base
50{
51public:
55 // string_buffer is of type std::string and has some problems with pre-C++11 ABI
56 format_bam() = default;
57 format_bam(format_bam const &) = default;
58 format_bam & operator=(format_bam const &) = default;
59 format_bam(format_bam &&) = default;
61 ~format_bam() = default;
62
64
67
68protected:
69 template <typename stream_type, // constraints checked by file
70 typename seq_legal_alph_type,
71 typename ref_seqs_type,
72 typename ref_ids_type,
73 typename stream_pos_type,
74 typename seq_type,
75 typename id_type,
76 typename offset_type,
77 typename ref_seq_type,
78 typename ref_id_type,
79 typename ref_offset_type,
80 typename align_type,
81 typename cigar_type,
82 typename flag_type,
83 typename mapq_type,
84 typename qual_type,
85 typename mate_type,
86 typename tag_dict_type,
87 typename e_value_type,
88 typename bit_score_type>
89 void read_alignment_record(stream_type & stream,
90 sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
91 ref_seqs_type & ref_seqs,
93 stream_pos_type & position_buffer,
94 seq_type & seq,
95 qual_type & qual,
96 id_type & id,
97 offset_type & offset,
98 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
99 ref_id_type & ref_id,
100 ref_offset_type & ref_offset,
101 align_type & align,
102 cigar_type & cigar_vector,
103 flag_type & flag,
104 mapq_type & mapq,
105 mate_type & mate,
106 tag_dict_type & tag_dict,
107 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
108 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
109
110 template <typename stream_type,
111 typename header_type,
112 typename seq_type,
113 typename id_type,
114 typename ref_seq_type,
115 typename ref_id_type,
116 typename align_type,
117 typename cigar_type,
118 typename qual_type,
119 typename mate_type,
120 typename tag_dict_type>
121 void write_alignment_record([[maybe_unused]] stream_type & stream,
122 [[maybe_unused]] sam_file_output_options const & options,
123 [[maybe_unused]] header_type && header,
124 [[maybe_unused]] seq_type && seq,
125 [[maybe_unused]] qual_type && qual,
126 [[maybe_unused]] id_type && id,
127 [[maybe_unused]] int32_t const offset,
128 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
129 [[maybe_unused]] ref_id_type && ref_id,
130 [[maybe_unused]] std::optional<int32_t> ref_offset,
131 [[maybe_unused]] align_type && align,
132 [[maybe_unused]] cigar_type && cigar_vector,
133 [[maybe_unused]] sam_flag const flag,
134 [[maybe_unused]] uint8_t const mapq,
135 [[maybe_unused]] mate_type && mate,
136 [[maybe_unused]] tag_dict_type && tag_dict,
137 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
138 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score));
139
140private:
142 bool header_was_read{false};
143
145 std::string string_buffer{};
146
148 struct alignment_record_core
149 { // naming corresponds to official SAM/BAM specifications
150 int32_t block_size;
151 int32_t refID;
152 int32_t pos;
153 uint32_t l_read_name : 8;
154 uint32_t mapq : 8;
155 uint32_t bin : 16;
156 uint32_t n_cigar_op : 16;
157 sam_flag flag;
158 int32_t l_seq;
159 int32_t next_refID;
160 int32_t next_pos;
161 int32_t tlen;
162 };
163
164 // clang-format off
166 static constexpr std::array<uint8_t, 256> char_to_sam_rank
167 {
168 []() constexpr {
170
171 using index_t = std::make_unsigned_t<char>;
172
173 // ret['M'] = 0; set anyway by initialization
174 ret[static_cast<index_t>('I')] = 1;
175 ret[static_cast<index_t>('D')] = 2;
176 ret[static_cast<index_t>('N')] = 3;
177 ret[static_cast<index_t>('S')] = 4;
178 ret[static_cast<index_t>('H')] = 5;
179 ret[static_cast<index_t>('P')] = 6;
180 ret[static_cast<index_t>('=')] = 7;
181 ret[static_cast<index_t>('X')] = 8;
182
183 return ret;
184 }()
185 };
186 // clang-format on
187
189 static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
190 {
191 --end;
192 if (beg >> 14 == end >> 14)
193 return ((1 << 15) - 1) / 7 + (beg >> 14);
194 if (beg >> 17 == end >> 17)
195 return ((1 << 12) - 1) / 7 + (beg >> 17);
196 if (beg >> 20 == end >> 20)
197 return ((1 << 9) - 1) / 7 + (beg >> 20);
198 if (beg >> 23 == end >> 23)
199 return ((1 << 6) - 1) / 7 + (beg >> 23);
200 if (beg >> 26 == end >> 26)
201 return ((1 << 3) - 1) / 7 + (beg >> 26);
202 return 0;
203 }
204
211 template <typename stream_view_type, std::integral number_type>
212 void read_integral_byte_field(stream_view_type && stream_view, number_type & target)
213 {
214 std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(target), reinterpret_cast<char *>(&target));
215 }
216
222 template <typename stream_view_type>
223 void read_float_byte_field(stream_view_type && stream_view, float & target)
224 {
225 std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(int32_t), reinterpret_cast<char *>(&target));
226 }
227
228 template <typename stream_view_type, typename value_type>
229 void read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
230 stream_view_type && stream_view,
231 value_type const & SEQAN3_DOXYGEN_ONLY(value));
232
233 template <typename stream_view_type>
234 void read_sam_dict_field(stream_view_type && stream_view, sam_tag_dictionary & target);
235
236 template <typename cigar_input_type>
237 auto parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const;
238
239 static std::string get_tag_dict_str(sam_tag_dictionary const & tag_dict);
240};
241
243template <typename stream_type, // constraints checked by file
244 typename seq_legal_alph_type,
245 typename ref_seqs_type,
246 typename ref_ids_type,
247 typename stream_pos_type,
248 typename seq_type,
249 typename id_type,
250 typename offset_type,
251 typename ref_seq_type,
252 typename ref_id_type,
253 typename ref_offset_type,
254 typename align_type,
255 typename cigar_type,
256 typename flag_type,
257 typename mapq_type,
258 typename qual_type,
259 typename mate_type,
260 typename tag_dict_type,
261 typename e_value_type,
262 typename bit_score_type>
263inline void
265 sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
266 ref_seqs_type & ref_seqs,
268 stream_pos_type & position_buffer,
269 seq_type & seq,
270 qual_type & qual,
271 id_type & id,
272 offset_type & offset,
273 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
274 ref_id_type & ref_id,
275 ref_offset_type & ref_offset,
276 align_type & align,
277 cigar_type & cigar_vector,
278 flag_type & flag,
279 mapq_type & mapq,
280 mate_type & mate,
281 tag_dict_type & tag_dict,
282 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
283 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
284{
285 static_assert(detail::decays_to_ignore_v<ref_offset_type>
286 || detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
287 "The ref_offset must be a specialisation of std::optional.");
288
289 static_assert(detail::decays_to_ignore_v<mapq_type> || std::same_as<mapq_type, uint8_t>,
290 "The type of field::mapq must be uint8_t.");
291
292 static_assert(detail::decays_to_ignore_v<flag_type> || std::same_as<flag_type, sam_flag>,
293 "The type of field::flag must be seqan3::sam_flag.");
294
295 auto stream_view = seqan3::detail::istreambuf(stream);
296
297 // these variables need to be stored to compute the ALIGNMENT
298 [[maybe_unused]] int32_t offset_tmp{};
299 [[maybe_unused]] int32_t soft_clipping_end{};
300 [[maybe_unused]] std::vector<cigar> tmp_cigar_vector{};
301 [[maybe_unused]] int32_t ref_length{0}, seq_length{0}; // length of aligned part for ref and query
302
303 // Header
304 // -------------------------------------------------------------------------------------------------------------
305 if (!header_was_read)
306 {
307 // magic BAM string
308 if (!std::ranges::equal(stream_view | detail::take_exactly_or_throw(4), std::string_view{"BAM\1"}))
309 throw format_error{"File is not in BAM format."};
310
311 int32_t l_text{}; // length of header text including \0 character
312 int32_t n_ref{}; // number of reference sequences
313 int32_t l_name{}; // 1 + length of reference name including \0 character
314 int32_t l_ref{}; // length of reference sequence
315
316 read_integral_byte_field(stream_view, l_text);
317
318 if (l_text > 0) // header text is present
319 read_header(stream_view | detail::take_exactly_or_throw(l_text), header, ref_seqs);
320
321 read_integral_byte_field(stream_view, n_ref);
322
323 for (int32_t ref_idx = 0; ref_idx < n_ref; ++ref_idx)
324 {
325 read_integral_byte_field(stream_view, l_name);
326
327 string_buffer.resize(l_name - 1);
329 l_name - 1,
330 string_buffer.data()); // copy without \0 character
331 ++std::ranges::begin(stream_view); // skip \0 character
332
333 read_integral_byte_field(stream_view, l_ref);
334
335 if constexpr (detail::decays_to_ignore_v<ref_seqs_type>) // no reference information given
336 {
337 // If there was no header text, we parse reference sequences block as header information
338 if (l_text == 0)
339 {
340 auto & reference_ids = header.ref_ids();
341 // put the length of the reference sequence into ref_id_info
342 header.ref_id_info.emplace_back(l_ref, "");
343 // put the reference name into reference_ids
344 reference_ids.push_back(string_buffer);
345 // assign the reference name an ascending reference id (starts at index 0).
346 header.ref_dict.emplace(reference_ids.back(), reference_ids.size() - 1);
347 continue;
348 }
349 }
350
351 auto id_it = header.ref_dict.find(string_buffer);
352
353 // sanity checks of reference information to existing header object:
354 if (id_it == header.ref_dict.end()) // [unlikely]
355 {
356 throw format_error{detail::to_string("Unknown reference name '" + string_buffer
357 + "' found in BAM file header (header.ref_ids():",
358 header.ref_ids(),
359 ").")};
360 }
361 else if (id_it->second != ref_idx) // [unlikely]
362 {
363 throw format_error{detail::to_string("Reference id '",
364 string_buffer,
365 "' at position ",
366 ref_idx,
367 " does not correspond to the position ",
368 id_it->second,
369 " in the header (header.ref_ids():",
370 header.ref_ids(),
371 ").")};
372 }
373 else if (std::get<0>(header.ref_id_info[id_it->second]) != l_ref) // [unlikely]
374 {
375 throw format_error{"Provided reference has unequal length as specified in the header."};
376 }
377 }
378
379 header_was_read = true;
380
381 if (std::ranges::begin(stream_view) == std::ranges::end(stream_view)) // no records follow
382 return;
383 }
384
385 // read alignment record into buffer
386 // -------------------------------------------------------------------------------------------------------------
387 position_buffer = stream.tellg();
388 alignment_record_core core;
389 std::ranges::copy(stream_view | detail::take_exactly_or_throw(sizeof(core)), reinterpret_cast<char *>(&core));
390
391 if (core.refID >= static_cast<int32_t>(header.ref_ids().size()) || core.refID < -1) // [[unlikely]]
392 {
393 throw format_error{detail::to_string("Reference id index '",
394 core.refID,
395 "' is not in range of ",
396 "header.ref_ids(), which has size ",
397 header.ref_ids().size(),
398 ".")};
399 }
400 else if (core.refID > -1) // not unmapped
401 {
402 ref_id = core.refID; // field::ref_id
403 }
404
405 flag = core.flag; // field::flag
406 mapq = core.mapq; // field::mapq
407
408 if (core.pos > -1) // [[likely]]
409 ref_offset = core.pos; // field::ref_offset
410
411 if constexpr (!detail::decays_to_ignore_v<mate_type>) // field::mate
412 {
413 if (core.next_refID > -1)
414 get<0>(mate) = core.next_refID;
415
416 if (core.next_pos > -1) // [[likely]]
417 get<1>(mate) = core.next_pos;
418
419 get<2>(mate) = core.tlen;
420 }
421
422 // read id
423 // -------------------------------------------------------------------------------------------------------------
424 auto id_view = stream_view | detail::take_exactly_or_throw(core.l_read_name - 1);
425 if constexpr (!detail::decays_to_ignore_v<id_type>)
426 read_forward_range_field(id_view, id); // field::id
427 else
428 detail::consume(id_view);
429 ++std::ranges::begin(stream_view); // skip '\0'
430
431 // read cigar string
432 // -------------------------------------------------------------------------------------------------------------
433 if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
434 {
435 std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_binary_cigar(stream_view, core.n_cigar_op);
436 transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
437 // the actual cigar_vector is swapped with tmp_cigar_vector at the end to avoid copying
438 }
439 else
440 {
441 detail::consume(stream_view | detail::take_exactly_or_throw(core.n_cigar_op * 4));
442 }
443
444 offset = offset_tmp;
445
446 // read sequence
447 // -------------------------------------------------------------------------------------------------------------
448 if (core.l_seq > 0) // sequence information is given
449 {
450 auto seq_stream = stream_view | detail::take_exactly_or_throw(core.l_seq / 2) // one too short if uneven
453 {
454 return {dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(c) >> 4)),
455 dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(c) & 0x0f))};
456 });
457
458 if constexpr (detail::decays_to_ignore_v<seq_type>)
459 {
460 auto skip_sequence_bytes = [&]()
461 {
462 detail::consume(seq_stream);
463 if (core.l_seq & 1)
464 ++std::ranges::begin(stream_view);
465 };
466
467 if constexpr (!detail::decays_to_ignore_v<align_type>)
468 {
470 "If you want to read ALIGNMENT but not SEQ, the alignment"
471 " object must store a sequence container at the second (query) position.");
472
473 if (!tmp_cigar_vector.empty()) // only parse alignment if cigar information was given
474 {
475 assert(core.l_seq == (seq_length + offset_tmp + soft_clipping_end)); // sanity check
476 using alph_t = std::ranges::range_value_t<decltype(get<1>(align))>;
477 constexpr auto from_dna16 = detail::convert_through_char_representation<dna16sam, alph_t>;
478
479 get<1>(align).reserve(seq_length);
480
481 auto tmp_iter = std::ranges::begin(seq_stream);
482 std::ranges::advance(tmp_iter, offset_tmp / 2); // skip soft clipped bases at the beginning
483
484 if (offset_tmp & 1)
485 {
486 get<1>(align).push_back(from_dna16[to_rank(get<1>(*tmp_iter))]);
487 ++tmp_iter;
488 --seq_length;
489 }
490
491 for (size_t i = (seq_length / 2); i > 0; --i)
492 {
493 get<1>(align).push_back(from_dna16[to_rank(get<0>(*tmp_iter))]);
494 get<1>(align).push_back(from_dna16[to_rank(get<1>(*tmp_iter))]);
495 ++tmp_iter;
496 }
497
498 if (seq_length & 1)
499 {
500 get<1>(align).push_back(from_dna16[to_rank(get<0>(*tmp_iter))]);
501 ++tmp_iter;
502 --soft_clipping_end;
503 }
504
505 std::ranges::advance(tmp_iter, (soft_clipping_end + !(seq_length & 1)) / 2);
506 }
507 else
508 {
509 skip_sequence_bytes();
510 get<1>(align) = std::remove_reference_t<decltype(get<1>(align))>{}; // assign empty container
511 }
512 }
513 else
514 {
515 skip_sequence_bytes();
516 }
517 }
518 else
519 {
520 using alph_t = std::ranges::range_value_t<decltype(seq)>;
521 constexpr auto from_dna16 = detail::convert_through_char_representation<dna16sam, alph_t>;
522
523 for (auto [d1, d2] : seq_stream)
524 {
525 seq.push_back(from_dna16[to_rank(d1)]);
526 seq.push_back(from_dna16[to_rank(d2)]);
527 }
528
529 if (core.l_seq & 1)
530 {
531 dna16sam d =
532 dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(*std::ranges::begin(stream_view)) >> 4));
533 seq.push_back(from_dna16[to_rank(d)]);
534 ++std::ranges::begin(stream_view);
535 }
536
537 if constexpr (!detail::decays_to_ignore_v<align_type>)
538 {
539 assign_unaligned(get<1>(align),
540 seq
541 | views::slice(static_cast<std::ranges::range_difference_t<seq_type>>(offset_tmp),
542 std::ranges::distance(seq) - soft_clipping_end));
543 }
544 }
545 }
546
547 // read qual string
548 // -------------------------------------------------------------------------------------------------------------
549 auto qual_view = stream_view | detail::take_exactly_or_throw(core.l_seq)
551 [](char chr)
552 {
553 return static_cast<char>(chr + 33);
554 });
555 if constexpr (!detail::decays_to_ignore_v<qual_type>)
556 read_forward_range_field(qual_view, qual);
557 else
558 detail::consume(qual_view);
559
560 // All remaining optional fields if any: SAM tags dictionary
561 // -------------------------------------------------------------------------------------------------------------
562 int32_t remaining_bytes = core.block_size - (sizeof(alignment_record_core) - 4 /*block_size excluded*/)
563 - core.l_read_name - core.n_cigar_op * 4 - (core.l_seq + 1) / 2 - core.l_seq;
564 assert(remaining_bytes >= 0);
565 auto tags_view = stream_view | detail::take_exactly_or_throw(remaining_bytes);
566
567 while (tags_view.size() > 0)
568 {
569 if constexpr (!detail::decays_to_ignore_v<tag_dict_type>)
570 read_sam_dict_field(tags_view, tag_dict);
571 else
572 detail::consume(tags_view);
573 }
574
575 // DONE READING - wrap up
576 // -------------------------------------------------------------------------------------------------------------
577 if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
578 {
579 // Check cigar, if it matches ‘kSmN’, where ‘k’ equals lseq, ‘m’ is the reference sequence length in the
580 // alignment, and ‘S’ and ‘N’ are the soft-clipping and reference-clip, then the cigar string was larger
581 // than 65535 operations and is stored in the sam_tag_dictionary (tag GC).
582 if (core.l_seq != 0 && offset_tmp == core.l_seq)
583 {
584 if constexpr (detail::decays_to_ignore_v<tag_dict_type> | detail::decays_to_ignore_v<seq_type>)
585 { // maybe only throw in debug mode and otherwise return an empty alignment?
586 throw format_error{
587 detail::to_string("The cigar string '",
588 offset_tmp,
589 "S",
590 ref_length,
591 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
592 "stored in the optional field CG. You need to read in the field::tags and "
593 "field::seq in order to access this information.")};
594 }
595 else
596 {
597 auto it = tag_dict.find("CG"_tag);
598
599 if (it == tag_dict.end())
600 throw format_error{detail::to_string(
601 "The cigar string '",
602 offset_tmp,
603 "S",
604 ref_length,
605 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
606 "stored in the optional field CG but this tag is not present in the given ",
607 "record.")};
608
609 auto cigar_view = std::views::all(std::get<std::string>(it->second));
610 std::tie(tmp_cigar_vector, ref_length, seq_length) = detail::parse_cigar(cigar_view);
611 offset_tmp = soft_clipping_end = 0;
612 transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
613 tag_dict.erase(it); // remove redundant information
614
615 if constexpr (!detail::decays_to_ignore_v<align_type>)
616 {
617 assign_unaligned(
618 get<1>(align),
619 seq
620 | views::slice(static_cast<std::ranges::range_difference_t<seq_type>>(offset_tmp),
621 std::ranges::distance(seq) - soft_clipping_end));
622 }
623 }
624 }
625 }
626
627 // Alignment object construction
628 if constexpr (!detail::decays_to_ignore_v<align_type>)
629 construct_alignment(align, tmp_cigar_vector, core.refID, ref_seqs, core.pos, ref_length); // inherited from SAM
630
631 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
632 std::swap(cigar_vector, tmp_cigar_vector);
633}
634
636template <typename stream_type,
637 typename header_type,
638 typename seq_type,
639 typename id_type,
640 typename ref_seq_type,
641 typename ref_id_type,
642 typename align_type,
643 typename cigar_type,
644 typename qual_type,
645 typename mate_type,
646 typename tag_dict_type>
647inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & stream,
648 [[maybe_unused]] sam_file_output_options const & options,
649 [[maybe_unused]] header_type && header,
650 [[maybe_unused]] seq_type && seq,
651 [[maybe_unused]] qual_type && qual,
652 [[maybe_unused]] id_type && id,
653 [[maybe_unused]] int32_t const offset,
654 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
655 [[maybe_unused]] ref_id_type && ref_id,
656 [[maybe_unused]] std::optional<int32_t> ref_offset,
657 [[maybe_unused]] align_type && align,
658 [[maybe_unused]] cigar_type && cigar_vector,
659 [[maybe_unused]] sam_flag const flag,
660 [[maybe_unused]] uint8_t const mapq,
661 [[maybe_unused]] mate_type && mate,
662 [[maybe_unused]] tag_dict_type && tag_dict,
663 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
664 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score))
665{
666 // ---------------------------------------------------------------------
667 // Type Requirements (as static asserts for user friendliness)
668 // ---------------------------------------------------------------------
669 static_assert((std::ranges::forward_range<seq_type> && alphabet<std::ranges::range_reference_t<seq_type>>),
670 "The seq object must be a std::ranges::forward_range over "
671 "letters that model seqan3::alphabet.");
672
673 static_assert((std::ranges::forward_range<id_type> && alphabet<std::ranges::range_reference_t<id_type>>),
674 "The id object must be a std::ranges::forward_range over "
675 "letters that model seqan3::alphabet.");
676
677 static_assert((std::ranges::forward_range<ref_seq_type> && alphabet<std::ranges::range_reference_t<ref_seq_type>>),
678 "The ref_seq object must be a std::ranges::forward_range "
679 "over letters that model seqan3::alphabet.");
680
681 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
682 {
683 static_assert((std::ranges::forward_range<ref_id_type> || std::integral<std::remove_reference_t<ref_id_type>>
684 || detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>),
685 "The ref_id object must be a std::ranges::forward_range "
686 "over letters that model seqan3::alphabet or an integral or a std::optional<integral>.");
687 }
688
690 "The align object must be a std::pair of two ranges whose "
691 "value_type is comparable to seqan3::gap");
692
693 static_assert((std::tuple_size_v<std::remove_cvref_t<align_type>> == 2
694 && std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<0>(align))>>
695 && std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<1>(align))>>),
696 "The align object must be a std::pair of two ranges whose "
697 "value_type is comparable to seqan3::gap");
698
699 static_assert((std::ranges::forward_range<qual_type> && alphabet<std::ranges::range_reference_t<qual_type>>),
700 "The qual object must be a std::ranges::forward_range "
701 "over letters that model seqan3::alphabet.");
702
704 "The mate object must be a std::tuple of size 3 with "
705 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
706 "2) a std::integral or std::optional<std::integral>, and "
707 "3) a std::integral.");
708
709 static_assert(
710 ((std::ranges::forward_range<decltype(std::get<0>(mate))>
711 || std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>>
712 || detail::is_type_specialisation_of_v<
713 std::remove_cvref_t<decltype(std::get<0>(mate))>,
714 std::optional>)&&(std::integral<std::remove_cvref_t<decltype(std::get<1>(mate))>>
715 || detail::is_type_specialisation_of_v<
716 std::remove_cvref_t<decltype(std::get<1>(mate))>,
717 std::optional>)&&std::integral<std::remove_cvref_t<decltype(std::get<2>(mate))>>),
718 "The mate object must be a std::tuple of size 3 with "
719 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
720 "2) a std::integral or std::optional<std::integral>, and "
721 "3) a std::integral.");
722
723 static_assert(std::same_as<std::remove_cvref_t<tag_dict_type>, sam_tag_dictionary>,
724 "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
725
726 if constexpr (detail::decays_to_ignore_v<header_type>)
727 {
728 throw format_error{"BAM can only be written with a header but you did not provide enough information! "
729 "You can either construct the output file with ref_ids and ref_seqs information and "
730 "the header will be created for you, or you can access the `header` member directly."};
731 }
732 else
733 {
734 // ---------------------------------------------------------------------
735 // logical Requirements
736 // ---------------------------------------------------------------------
737
738 if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
739 throw format_error{detail::to_string("The ref_offset object must be >= -1 but is: ", ref_offset)};
740
741 detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
742
743 // ---------------------------------------------------------------------
744 // Writing the BAM Header on first call
745 // ---------------------------------------------------------------------
746 if (!header_was_written)
747 {
748 stream << "BAM\1";
750 write_header(os, options, header); // write SAM header to temporary stream to query the size.
751 int32_t l_text{static_cast<int32_t>(os.str().size())};
752 std::ranges::copy_n(reinterpret_cast<char *>(&l_text), 4, stream_it); // write read id
753
754 stream << os.str();
755
756 int32_t n_ref{static_cast<int32_t>(header.ref_ids().size())};
757 std::ranges::copy_n(reinterpret_cast<char *>(&n_ref), 4, stream_it); // write read id
758
759 for (int32_t ridx = 0; ridx < n_ref; ++ridx)
760 {
761 int32_t l_name{static_cast<int32_t>(header.ref_ids()[ridx].size()) + 1}; // plus null character
762 std::ranges::copy_n(reinterpret_cast<char *>(&l_name), 4, stream_it); // write l_name
763 // write reference name:
764 std::ranges::copy(header.ref_ids()[ridx].begin(), header.ref_ids()[ridx].end(), stream_it);
765 stream_it = '\0';
766 // write reference sequence length:
767 std::ranges::copy_n(reinterpret_cast<char *>(&get<0>(header.ref_id_info[ridx])), 4, stream_it);
768 }
769
770 header_was_written = true;
771 }
772
773 // ---------------------------------------------------------------------
774 // Writing the Record
775 // ---------------------------------------------------------------------
776 int32_t ref_length{};
777
778 // if alignment is non-empty, replace cigar_vector.
779 // else, compute the ref_length from given cigar_vector which is needed to fill field `bin`.
780 if (!std::ranges::empty(cigar_vector))
781 {
782 int32_t dummy_seq_length{};
783 for (auto & [count, operation] : cigar_vector)
784 detail::update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(), count);
785 }
786 else if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
787 {
788 ref_length = std::ranges::distance(get<1>(align));
789
790 // compute possible distance from alignment end to sequence end
791 // which indicates soft clipping at the end.
792 // This should be replaced by a free count_gaps function for
793 // aligned sequences which is more efficient if possible.
794 int32_t off_end{static_cast<int32_t>(std::ranges::distance(seq)) - offset};
795
796 for (auto chr : get<1>(align))
797 if (chr == gap{})
798 ++off_end;
799
800 off_end -= ref_length;
801 cigar_vector = detail::get_cigar_vector(align, offset, off_end);
802 }
803
804 if (cigar_vector.size() >= (1 << 16)) // must be written into the sam tag CG
805 {
806 tag_dict["CG"_tag] = detail::get_cigar_string(cigar_vector);
807 cigar_vector.resize(2);
808 cigar_vector[0] = cigar{static_cast<uint32_t>(std::ranges::distance(seq)), 'S'_cigar_operation};
809 cigar_vector[1] = cigar{static_cast<uint32_t>(std::ranges::distance(get<1>(align))), 'N'_cigar_operation};
810 }
811
812 std::string tag_dict_binary_str = get_tag_dict_str(tag_dict);
813
814 // Compute the value for the l_read_name field for the bam record.
815 // This value is stored including a trailing `0`, so at most 254 characters of the id can be stored, since
816 // the data type to store the value is uint8_t and 255 is the maximal size.
817 // If the id is empty a '*' is written instead, i.e. the written id is never an empty string and stores at least
818 // 2 bytes.
819 uint8_t read_name_size = std::min<uint8_t>(std::ranges::distance(id), 254) + 1;
820 read_name_size += static_cast<uint8_t>(read_name_size == 1); // need size two since empty id is stored as '*'.
821
822 alignment_record_core core{/* block_size */ 0, // will be initialised right after
823 /* refID */ -1, // will be initialised right after
824 /* pos */ ref_offset.value_or(-1),
825 /* l_read_name */ read_name_size,
826 /* mapq */ mapq,
827 /* bin */ reg2bin(ref_offset.value_or(-1), ref_length),
828 /* n_cigar_op */ static_cast<uint16_t>(cigar_vector.size()),
829 /* flag */ flag,
830 /* l_seq */ static_cast<int32_t>(std::ranges::distance(seq)),
831 /* next_refId */ -1, // will be initialised right after
832 /* next_pos */ get<1>(mate).value_or(-1),
833 /* tlen */ get<2>(mate)};
834
835 auto check_and_assign_id_to = [&header]([[maybe_unused]] auto & id_source, [[maybe_unused]] auto & id_target)
836 {
837 using id_t = std::remove_reference_t<decltype(id_source)>;
838
839 if constexpr (!detail::decays_to_ignore_v<id_t>)
840 {
841 if constexpr (std::integral<id_t>)
842 {
843 id_target = id_source;
844 }
845 else if constexpr (detail::is_type_specialisation_of_v<id_t, std::optional>)
846 {
847 id_target = id_source.value_or(-1);
848 }
849 else
850 {
851 if (!std::ranges::empty(id_source)) // otherwise default will remain (-1)
852 {
853 auto id_it = header.ref_dict.end();
854
855 if constexpr (std::ranges::contiguous_range<decltype(id_source)>
856 && std::ranges::sized_range<decltype(id_source)>
857 && std::ranges::borrowed_range<decltype(id_source)>)
858 {
859 id_it = header.ref_dict.find(
860 std::span{std::ranges::data(id_source), std::ranges::size(id_source)});
861 }
862 else
863 {
864 using header_ref_id_type = std::remove_reference_t<decltype(header.ref_ids()[0])>;
865
866 static_assert(
867 implicitly_convertible_to<decltype(id_source), header_ref_id_type>,
868 "The ref_id type is not convertible to the reference id information stored in the "
869 "reference dictionary of the header object.");
870
871 id_it = header.ref_dict.find(id_source);
872 }
873
874 if (id_it == header.ref_dict.end())
875 {
876 throw format_error{detail::to_string("Unknown reference name '",
877 id_source,
878 "' could "
879 "not be found in BAM header ref_dict: ",
880 header.ref_dict,
881 ".")};
882 }
883
884 id_target = id_it->second;
885 }
886 }
887 }
888 };
889
890 // initialise core.refID
891 check_and_assign_id_to(ref_id, core.refID);
892
893 // initialise core.next_refID
894 check_and_assign_id_to(get<0>(mate), core.next_refID);
895
896 // initialise core.block_size
897 core.block_size = sizeof(core) - 4 /*block_size excluded*/ + core.l_read_name + core.n_cigar_op * 4
898 + // each int32_t has 4 bytes
899 (core.l_seq + 1) / 2 + // bitcompressed seq
900 core.l_seq + // quality string
901 tag_dict_binary_str.size();
902
903 std::ranges::copy_n(reinterpret_cast<char *>(&core), sizeof(core), stream_it); // write core
904
905 if (std::ranges::empty(id)) // empty id is represented as * for backward compatibility
906 stream_it = '*';
907 else
908 std::ranges::copy_n(std::ranges::begin(id), core.l_read_name - 1, stream_it); // write read id
909 stream_it = '\0';
910
911 // write cigar
912 for (auto [cigar_count, op] : cigar_vector)
913 {
914 cigar_count = cigar_count << 4;
915 cigar_count |= static_cast<int32_t>(char_to_sam_rank[op.to_char()]);
916 std::ranges::copy_n(reinterpret_cast<char *>(&cigar_count), 4, stream_it);
917 }
918
919 // write seq (bit-compressed: dna16sam characters go into one byte)
920 using alph_t = std::ranges::range_value_t<seq_type>;
921 constexpr auto to_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
922
923 auto sit = std::ranges::begin(seq);
924 for (int32_t sidx = 0; sidx < ((core.l_seq & 1) ? core.l_seq - 1 : core.l_seq); ++sidx, ++sit)
925 {
926 uint8_t compressed_chr = to_rank(to_dna16[to_rank(*sit)]) << 4;
927 ++sidx, ++sit;
928 compressed_chr |= to_rank(to_dna16[to_rank(*sit)]);
929 stream_it = static_cast<char>(compressed_chr);
930 }
931
932 if (core.l_seq & 1) // write one more
933 stream_it = static_cast<char>(to_rank(to_dna16[to_rank(*sit)]) << 4);
934
935 // write qual
936 if (std::ranges::empty(qual))
937 {
938 auto v = views::repeat_n(static_cast<char>(255), core.l_seq);
939 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
940 }
941 else
942 {
943 if (std::ranges::distance(qual) != core.l_seq)
944 throw format_error{detail::to_string("Expected quality of same length as sequence with size ",
945 core.l_seq,
946 ". Got quality with size ",
947 std::ranges::distance(qual),
948 " instead.")};
949
950 auto v = qual
952 [](auto chr)
953 {
954 return static_cast<char>(to_rank(chr));
955 });
956 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
957 }
958
959 // write optional fields
960 stream << tag_dict_binary_str;
961 } // if constexpr (!detail::decays_to_ignore_v<header_type>)
962}
963
965template <typename stream_view_type, typename value_type>
966inline void format_bam::read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
967 stream_view_type && stream_view,
968 value_type const & SEQAN3_DOXYGEN_ONLY(value))
969{
970 int32_t count;
971 read_integral_byte_field(stream_view, count); // read length of vector
972 std::vector<value_type> tmp_vector;
973 tmp_vector.reserve(count);
974
975 while (count > 0)
976 {
977 value_type tmp{};
978 if constexpr (std::integral<value_type>)
979 {
980 read_integral_byte_field(stream_view, tmp);
981 }
982 else if constexpr (std::same_as<value_type, float>)
983 {
984 read_float_byte_field(stream_view, tmp);
985 }
986 else
987 {
988 constexpr bool always_false = std::is_same_v<value_type, void>;
989 static_assert(always_false, "format_bam::read_sam_dict_vector: unsupported value_type");
990 }
991 tmp_vector.push_back(std::move(tmp));
992 --count;
993 }
994 variant = std::move(tmp_vector);
995}
996
1014template <typename stream_view_type>
1015inline void format_bam::read_sam_dict_field(stream_view_type && stream_view, sam_tag_dictionary & target)
1016{
1017 /* Every BA< tag has the format "[TAG][TYPE_ID][VALUE]", where TAG is a two letter
1018 name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f]
1019 describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of
1020 VALUE's and the inner value type is identified by the next character, one of [cCsSiIf], followed
1021 by the length (int32_t) of the array, followed by the values.
1022 */
1023 auto it = std::ranges::begin(stream_view);
1024 uint16_t tag = static_cast<uint16_t>(*it) << 8;
1025 ++it; // skip char read before
1026
1027 tag += static_cast<uint16_t>(*it);
1028 ++it; // skip char read before
1029
1030 char type_id = *it;
1031 ++it; // skip char read before
1032
1033 switch (type_id)
1034 {
1035 case 'A': // char
1036 {
1037 target[tag] = *it;
1038 ++it; // skip char that has been read
1039 break;
1040 }
1041 // all integer sizes are possible
1042 case 'c': // int8_t
1043 {
1044 int8_t tmp;
1045 read_integral_byte_field(stream_view, tmp);
1046 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1047 break;
1048 }
1049 case 'C': // uint8_t
1050 {
1051 uint8_t tmp;
1052 read_integral_byte_field(stream_view, tmp);
1053 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1054 break;
1055 }
1056 case 's': // int16_t
1057 {
1058 int16_t tmp;
1059 read_integral_byte_field(stream_view, tmp);
1060 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1061 break;
1062 }
1063 case 'S': // uint16_t
1064 {
1065 uint16_t tmp;
1066 read_integral_byte_field(stream_view, tmp);
1067 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1068 break;
1069 }
1070 case 'i': // int32_t
1071 {
1072 int32_t tmp;
1073 read_integral_byte_field(stream_view, tmp);
1074 target[tag] = std::move(tmp); // readable sam format only allows int32_t
1075 break;
1076 }
1077 case 'I': // uint32_t
1078 {
1079 uint32_t tmp;
1080 read_integral_byte_field(stream_view, tmp);
1081 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1082 break;
1083 }
1084 case 'f': // float
1085 {
1086 float tmp;
1087 read_float_byte_field(stream_view, tmp);
1088 target[tag] = tmp;
1089 break;
1090 }
1091 case 'Z': // string
1092 {
1093 string_buffer.clear();
1094 while (!is_char<'\0'>(*it))
1095 {
1096 string_buffer.push_back(*it);
1097 ++it;
1098 }
1099 ++it; // skip \0
1100 target[tag] = string_buffer;
1101 break;
1102 }
1103 case 'H': // byte array, represented as null-terminated string; specification requires even number of bytes
1104 {
1105 std::vector<std::byte> byte_array;
1106 std::byte value;
1107 while (!is_char<'\0'>(*it))
1108 {
1109 string_buffer.clear();
1110 string_buffer.push_back(*it);
1111 ++it;
1112
1113 if (*it == '\0')
1114 throw format_error{"Hexadecimal tag has an uneven number of digits!"};
1115
1116 string_buffer.push_back(*it);
1117 ++it;
1118 read_byte_field(string_buffer, value);
1119 byte_array.push_back(value);
1120 }
1121 ++it; // skip \0
1122 target[tag] = byte_array;
1123 break;
1124 }
1125 case 'B': // Array. Value type depends on second char [cCsSiIf]
1126 {
1127 char array_value_type_id = *it;
1128 ++it; // skip char read before
1129
1130 switch (array_value_type_id)
1131 {
1132 case 'c': // int8_t
1133 read_sam_dict_vector(target[tag], stream_view, int8_t{});
1134 break;
1135 case 'C': // uint8_t
1136 read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1137 break;
1138 case 's': // int16_t
1139 read_sam_dict_vector(target[tag], stream_view, int16_t{});
1140 break;
1141 case 'S': // uint16_t
1142 read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1143 break;
1144 case 'i': // int32_t
1145 read_sam_dict_vector(target[tag], stream_view, int32_t{});
1146 break;
1147 case 'I': // uint32_t
1148 read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1149 break;
1150 case 'f': // float
1151 read_sam_dict_vector(target[tag], stream_view, float{});
1152 break;
1153 default:
1154 throw format_error{detail::to_string("The first character in the numerical id of a SAM tag ",
1155 "must be one of [cCsSiIf] but '",
1156 array_value_type_id,
1157 "' was given.")};
1158 }
1159 break;
1160 }
1161 default:
1162 throw format_error{detail::to_string("The second character in the numerical id of a "
1163 "SAM tag must be one of [A,i,Z,H,B,f] but '",
1164 type_id,
1165 "' was given.")};
1166 }
1167}
1168
1183template <typename cigar_input_type>
1184inline auto format_bam::parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const
1185{
1186 std::vector<cigar> operations{};
1187 char operation{'\0'};
1188 uint32_t count{};
1189 int32_t ref_length{}, seq_length{};
1190 uint32_t operation_and_count{}; // in BAM operation and count values are stored within one 32 bit integer
1191 constexpr char const * cigar_mapping = "MIDNSHP=X*******";
1192 constexpr uint32_t cigar_mask = 0x0f; // 0000000000001111
1193
1194 if (n_cigar_op == 0) // [[unlikely]]
1195 return std::tuple{operations, ref_length, seq_length};
1196
1197 // parse the rest of the cigar
1198 // -------------------------------------------------------------------------------------------------------------
1199 while (n_cigar_op > 0) // until stream is not empty
1200 {
1202 sizeof(operation_and_count),
1203 reinterpret_cast<char *>(&operation_and_count));
1204 operation = cigar_mapping[operation_and_count & cigar_mask];
1205 count = operation_and_count >> 4;
1206
1207 detail::update_alignment_lengths(ref_length, seq_length, operation, count);
1208 operations.emplace_back(count, cigar::operation{}.assign_char(operation));
1209 --n_cigar_op;
1210 }
1211
1212 return std::tuple{operations, ref_length, seq_length};
1213}
1214
1218inline std::string format_bam::get_tag_dict_str(sam_tag_dictionary const & tag_dict)
1219{
1220 std::string result{};
1221
1222 auto stream_variant_fn = [&result](auto && arg) // helper to print a std::variant
1223 {
1224 // T is either char, int32_t, float, std::string, or a std::vector<some int>
1225 using T = std::remove_cvref_t<decltype(arg)>;
1226
1227 if constexpr (std::same_as<T, int32_t>)
1228 {
1229 // always choose the smallest possible representation [cCsSiI]
1230 size_t const absolute_arg = std::abs(arg);
1231 auto n = std::countr_zero(std::bit_ceil(absolute_arg + 1u) >> 1u) / 8u;
1232 bool const negative = arg < 0;
1233 n = n * n + 2 * negative; // for switch case order
1234
1235 switch (n)
1236 {
1237 case 0:
1238 {
1239 result[result.size() - 1] = 'C';
1240 result.append(reinterpret_cast<char const *>(&arg), 1);
1241 break;
1242 }
1243 case 1:
1244 {
1245 result[result.size() - 1] = 'S';
1246 result.append(reinterpret_cast<char const *>(&arg), 2);
1247 break;
1248 }
1249 case 2:
1250 {
1251 result[result.size() - 1] = 'c';
1252 int8_t tmp = static_cast<int8_t>(arg);
1253 result.append(reinterpret_cast<char const *>(&tmp), 1);
1254 break;
1255 }
1256 case 3:
1257 {
1258 result[result.size() - 1] = 's';
1259 int16_t tmp = static_cast<int16_t>(arg);
1260 result.append(reinterpret_cast<char const *>(&tmp), 2);
1261 break;
1262 }
1263 default:
1264 {
1265 result.append(reinterpret_cast<char const *>(&arg), 4); // always i
1266 break;
1267 }
1268 }
1269 }
1270 else if constexpr (std::same_as<T, std::string>)
1271 {
1272 result.append(reinterpret_cast<char const *>(arg.data()), arg.size() + 1 /*+ null character*/);
1273 }
1274 else if constexpr (!std::ranges::range<T>) // char, float
1275 {
1276 result.append(reinterpret_cast<char const *>(&arg), sizeof(arg));
1277 }
1278 else // std::vector of some arithmetic_type type
1279 {
1280 int32_t sz{static_cast<int32_t>(arg.size())};
1281 result.append(reinterpret_cast<char *>(&sz), 4);
1282 result.append(reinterpret_cast<char const *>(arg.data()),
1283 arg.size() * sizeof(std::ranges::range_value_t<T>));
1284 }
1285 };
1286
1287 for (auto & [tag, variant] : tag_dict)
1288 {
1289 result.push_back(static_cast<char>(tag / 256));
1290 result.push_back(static_cast<char>(tag % 256));
1291
1292 result.push_back(detail::sam_tag_type_char[variant.index()]);
1293
1294 if (!is_char<'\0'>(detail::sam_tag_type_char_extra[variant.index()]))
1295 result.push_back(detail::sam_tag_type_char_extra[variant.index()]);
1296
1297 std::visit(stream_variant_fn, variant);
1298 }
1299
1300 return result;
1301}
1302
1303} // namespace seqan3
T begin(T... args)
T bit_ceil(T... args)
constexpr derived_type & assign_char(char_type const chr) noexcept
Assign from a character, implicitly converts invalid characters.
Definition: alphabet_base.hpp:163
constexpr derived_type & assign_rank(rank_type const c) noexcept
Assign from a numeric value.
Definition: alphabet_base.hpp:187
The seqan3::cigar semialphabet pairs a counter with a seqan3::cigar::operation letter.
Definition: alphabet/cigar/cigar.hpp:60
exposition_only::cigar_operation operation
The (extended) cigar operation alphabet of M,D,I,H,N,P,S,X,=.
Definition: alphabet/cigar/cigar.hpp:98
A 16 letter DNA alphabet, containing all IUPAC symbols minus the gap and plus an equality sign ('=')....
Definition: dna16sam.hpp:48
The BAM format.
Definition: format_bam.hpp:50
format_bam()=default
Defaulted.
void write_alignment_record(stream_type &stream, sam_file_output_options const &options, header_type &&header, seq_type &&seq, qual_type &&qual, id_type &&id, int32_t const offset, ref_seq_type &&ref_seq, ref_id_type &&ref_id, std::optional< int32_t > ref_offset, align_type &&align, cigar_type &&cigar_vector, sam_flag const flag, uint8_t const mapq, mate_type &&mate, tag_dict_type &&tag_dict, double e_value, double bit_score)
Write the given fields to the specified stream.
Definition: format_bam.hpp:647
format_bam & operator=(format_bam &&)=default
Defaulted.
format_bam & operator=(format_bam const &)=default
Defaulted.
format_bam(format_bam &&)=default
Defaulted.
~format_bam()=default
Defaulted.
format_bam(format_bam const &)=default
Defaulted.
void read_alignment_record(stream_type &stream, sam_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, sam_file_header< ref_ids_type > &header, stream_pos_type &position_buffer, seq_type &seq, qual_type &qual, id_type &id, offset_type &offset, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, align_type &align, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_bam.hpp:264
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_bam.hpp:66
The alphabet of a gap character '-'.
Definition: gap.hpp:39
Stores the header information of alignment files.
Definition: header.hpp:34
ref_ids_type & ref_ids()
The range of reference ids.
Definition: header.hpp:143
std::unordered_map< key_type, int32_t, key_hasher, detail::view_equality_fn > ref_dict
The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
Definition: header.hpp:182
std::vector< std::tuple< int32_t, std::string > > ref_id_info
The reference information. (used by the SAM/BAM format)
Definition: header.hpp:179
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:343
T clear(T... args)
T copy(T... args)
T copy_n(T... args)
T countr_zero(T... args)
T data(T... args)
Provides seqan3::dna16sam.
T emplace_back(T... args)
T end(T... args)
T equal(T... args)
Provides seqan3::detail::fast_ostreambuf_iterator.
Provides the seqan3::format_sam_base that can be inherited from.
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: alphabet/concept.hpp:155
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: sam_flag.hpp:76
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: type_list/traits.hpp:470
constexpr ptrdiff_t count
Count the occurrences of a type in a pack.
Definition: type_pack/traits.hpp:164
constexpr size_t size
The size of a type pack.
Definition: type_pack/traits.hpp:146
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:178
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition: repeat_n.hpp:91
Provides the seqan3::sam_file_header class.
T index(T... args)
The generic alphabet concept that covers most data types used in ranges.
Checks whether from can be implicityly converted to to.
A more refined container concept than seqan3::container.
Whether a type behaves like a tuple.
Auxiliary functions for the alignment IO.
Provides seqan3::detail::istreambuf.
T min(T... args)
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
Provides seqan3::debug_stream and related types.
T push_back(T... args)
T reserve(T... args)
T resize(T... args)
Provides seqan3::sam_file_input_options.
Provides helper data structures for the seqan3::sam_file_output.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
T size(T... args)
Provides seqan3::views::slice.
T str(T... args)
Thrown if information given to output format didn't match expectations.
Definition: io/exception.hpp:91
The options type defines various option members that influence the behaviour of all or some formats.
Definition: sam_file/input_options.hpp:27
The options type defines various option members that influence the behavior of all or some formats.
Definition: sam_file/output_options.hpp:26
T swap(T... args)
Provides seqan3::views::take_exactly and seqan3::views::take_exactly_or_throw.
T tie(T... args)
T visit(T... args)