SeqAn3 3.3.0-rc.1
The Modern C++ library for sequence analysis.
format_fastq.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <algorithm>
16#include <iterator>
17#include <ranges>
18#include <string>
19#include <string_view>
20#include <vector>
21
44
45namespace seqan3
46{
47
80{
81public:
//!\name Constructors, destructor and assignment
//! All six special member functions are explicitly defaulted and declared noexcept.
//! Default construction.
85 format_fastq() noexcept = default;
//! Copy construction.
86 format_fastq(format_fastq const &) noexcept = default;
//! Copy assignment.
87 format_fastq & operator=(format_fastq const &) noexcept = default;
//! Move construction.
88 format_fastq(format_fastq &&) noexcept = default;
//! Move assignment.
89 format_fastq & operator=(format_fastq &&) noexcept = default;
//! Destruction.
90 ~format_fastq() noexcept = default;
91
93
//! The valid file extensions for this format ("fastq" and "fq").
//! The member is static and non-const, so the list can be modified at run time.
95 static inline std::vector<std::string> file_extensions{{"fastq"}, {"fq"}};
96
97protected:
//!\brief Read one FASTQ record from the stream and back-insert into the given field buffers.
//!
//! NOTE(review): this listing is an extract with fused line numbers and several lines missing
//! (see the gaps in the numbering). In particular, listing line 106 -- the `options` parameter --
//! is absent from the signature below; the member index of this page gives it as
//! `sequence_file_input_options<seq_legal_alph_type> const & options`.
//!
//!\param[in,out] stream          The input stream; it is read through detail::istreambuf.
//!\param[in]     options         Only `options.truncate_ids` is read in this function.
//!\param[out]    position_buffer Receives `stream.tellg()`, taken before the record is parsed.
//!\param[out]    sequence        Sequence characters are appended, unless seq_type decays to ignore.
//!\param[out]    id              ID characters (without the leading '@') are appended, unless id_type decays to ignore.
//!\param[out]    qualities       Quality characters are appended, unless qual_type decays to ignore.
//!
//!\throws parse_error             In the workaround branch: if the record does not start with '@', a sequence
//!                                letter is not valid for seq_legal_alph_type, the second ID line does not start
//!                                with '+', a quality letter is invalid, or more qualities than sequence letters
//!                                follow.
//!\throws unexpected_end_of_input In the workaround branch: if the input ends inside the record.
//!
//! The `#else` branch presumably raises the same kinds of errors through the `*_or_throw` view adaptors -- confirm
//! against the adaptor documentation.
99 template <typename stream_type, // constraints checked by file
100 typename seq_legal_alph_type,
101 typename stream_pos_type,
102 typename seq_type, // other constraints checked inside function
103 typename id_type,
104 typename qual_type>
105 void read_sequence_record(stream_type & stream,
107 stream_pos_type & position_buffer,
108 seq_type & sequence,
109 id_type & id,
110 qual_type & qualities)
111 {
112 auto stream_view = detail::istreambuf(stream);
113 auto stream_it = std::ranges::begin(stream_view);
114
// The number of sequence characters belonging to this record is
// sequence_size_after - sequence_size_before; the quality section must have exactly that many.
115 // cache the begin position so we write quals to the same position as seq in seq_qual case
116 size_t sequence_size_before = 0;
117 size_t sequence_size_after = 0;
118 if constexpr (!detail::decays_to_ignore_v<seq_type>)
119 sequence_size_before = size(sequence);
// Store the stream position reported by tellg() before the parsing below.
120 position_buffer = stream.tellg();
121
122 /* ID */
// NOTE(review): stream_it is dereferenced here without a prior end-of-stream check --
// presumably the caller guarantees the stream is not at EOF; confirm.
123 if (*stream_it != '@') // [[unlikely]]
124 {
125 throw parse_error{std::string{"Expected '@' on beginning of ID line, got: "}
126 + detail::make_printable(*stream_it)};
127 }
128 ++stream_it; // skip '@'
129
130#if SEQAN3_WORKAROUND_VIEW_PERFORMANCE // can't have nice things :'(
// Workaround branch: explicit iterator loops. The #else branch further down performs the
// ID / sequence / qualities steps with view pipelines instead.
131 auto e = std::ranges::end(stream_view);
132 if constexpr (!detail::decays_to_ignore_v<id_type>)
133 {
134 if (options.truncate_ids)
135 {
// Copy the ID up to the first control or blank character ...
136 for (; (stream_it != e) && (!(is_cntrl || is_blank))(*stream_it); ++stream_it)
137 {
// NOTE(review): listing line 138 is missing here -- the `else` below has no visible `if`.
139 id.push_back(*stream_it);
140 else
141 id.push_back(assign_char_to(*stream_it, std::ranges::range_value_t<id_type>{}));
142 }
// ... then skip whatever remains of the ID line.
143 for (; (stream_it != e) && (!is_char<'\n'>)(*stream_it); ++stream_it)
144 {}
145 }
146 else
147 {
// Copy the complete ID line, up to but not including the newline.
148 for (; (stream_it != e) && (!is_char<'\n'>)(*stream_it); ++stream_it)
149 {
// NOTE(review): listing line 150 is missing here -- the `else` below has no visible `if`.
151 id.push_back(*stream_it);
152 else
153 id.push_back(assign_char_to(*stream_it, std::ranges::range_value_t<id_type>{}));
154 }
155 }
156 }
157 else
158 {
// The ID is ignored: only advance to the end of the line.
159 for (; (stream_it != e) && (!is_char<'\n'>)(*stream_it); ++stream_it)
160 {}
161 }
162
163 if (stream_it == e)
164 {
165 throw unexpected_end_of_input{"Expected end of ID-line, got end-of-file."};
166 }
167 ++stream_it; // skip newline
168
169 /* Sequence */
// Read until the '+' that starts the second ID line. Whitespace (which includes line breaks)
// is skipped rather than stored, so a sequence spread over several lines is accepted.
170 if constexpr (!detail::decays_to_ignore_v<seq_type>)
171 {
172 for (; (stream_it != e) && (!is_char<'+'>)(*stream_it); ++stream_it)
173 {
174 if ((!is_space)(*stream_it))
175 {
// NOTE(review): listing line 176 is missing here -- the block below is the true-branch of an `if` that is not shown.
177 {
178 sequence.push_back(*stream_it);
179 }
180 else
181 {
// Reject characters that are not valid for seq_legal_alph_type before converting to the target alphabet.
182 if (!char_is_valid_for<seq_legal_alph_type>(*stream_it))
183 {
184 throw parse_error{std::string{"Encountered bad letter for seq: "}
185 + detail::make_printable(*stream_it)};
186 }
187 sequence.push_back(assign_char_to(*stream_it, std::ranges::range_value_t<seq_type>{}));
188 }
189 }
190 }
191 sequence_size_after = size(sequence);
192 }
193 else // consume, but count
194 {
// sequence_size_before stays 0 in this case, so sequence_size_after alone is the record's sequence length.
195 for (; (stream_it != e) && (!is_char<'+'>)(*stream_it); ++stream_it)
196 if ((!is_space)(*stream_it))
197 ++sequence_size_after;
198 }
199
200 /* 2nd ID line */
201 if (stream_it == e)
202 throw unexpected_end_of_input{"Expected second ID-line, got end-of-file."};
203
204 if (*stream_it != '+')
205 {
206 throw parse_error{std::string{"Expected '+' on beginning of 2nd ID line, got: "}
207 + detail::make_printable(*stream_it)};
208 }
209
// Skip the rest of the '+' line; its content is not compared with the first ID.
210 for (; (stream_it != e) && (!is_char<'\n'>)(*stream_it); ++stream_it)
211 {}
212
213 if (stream_it == e)
214 throw unexpected_end_of_input{"Expected end of second ID-line, got end-of-file."};
215
216 ++stream_it;
217
218 /* Qualities */
// Read exactly as many non-whitespace characters as sequence characters were read (or counted) above;
// sequence_size_after is counted down until it meets sequence_size_before.
219 if constexpr (!detail::decays_to_ignore_v<qual_type>)
220 {
221 while (sequence_size_after > sequence_size_before)
222 {
223 if (stream_it == e)
224 throw unexpected_end_of_input{"Expected qualities, got end-of-file."};
225
226 if ((!is_space)(*stream_it))
227 {
228 --sequence_size_after;
// NOTE(review): listing line 229 is missing here -- the block below is the true-branch of an `if` that is not shown.
230 {
231 qualities.push_back(*stream_it);
232 }
233 else
234 {
235 if (!char_is_valid_for<std::ranges::range_value_t<qual_type>>(*stream_it))
236 {
237 throw parse_error{std::string{"Encountered bad letter for qual: "}
238 + detail::make_printable(*stream_it)};
239 }
240 qualities.push_back(assign_char_to(*stream_it, std::ranges::range_value_t<qual_type>{}));
241 }
242 }
243 ++stream_it;
244 }
245 }
246 else // consume
247 {
248 while (sequence_size_after > sequence_size_before)
249 {
250 if (stream_it == e)
251 throw unexpected_end_of_input{"File ended before expected number of qualities could be read."};
252
253 if ((!is_space)(*stream_it))
254 --sequence_size_after;
255 ++stream_it;
256 }
257 }
258
// After the expected number of qualities, the next character (if any) must be a newline; it is consumed.
// Anything else means the quality string is longer than the sequence.
// NOTE(review): the message below misspells "Qualities".
259 if (stream_it != e)
260 {
261 if ((!is_char<'\n'>)(*stream_it))
262 throw parse_error{"Qualitites longer than sequence."};
263 else
264 ++stream_it;
265 }
266
267#else // ↑↑↑ WORKAROUND | ORIGINAL ↓↓↓
268
// View-based branch.
// NOTE(review): listing lines 273, 275-276, 280, 282 and 287 are missing from the ID part below; of each
// statement only the `views::char_to` pipeline stage survives, and the ignore-branch body is empty.
269 if constexpr (!detail::decays_to_ignore_v<id_type>)
270 {
271 if (options.truncate_ids)
272 {
274 | views::char_to<std::ranges::range_value_t<id_type>>,
277 }
278 else
279 {
281 | views::char_to<std::ranges::range_value_t<id_type>>,
283 }
284 }
285 else
286 {
288 }
289
290 /* Sequence */
291 auto seq_view = stream_view | detail::take_until_or_throw(is_char<'+'>) // until 2nd ID line
292 | std::views::filter(!is_space); // ignore whitespace
293 if constexpr (!detail::decays_to_ignore_v<seq_type>)
294 {
295 constexpr auto is_legal_alph = char_is_valid_for<seq_legal_alph_type>;
// NOTE(review): listing lines 296, 298 and 311 are missing -- the call that consumes `seq_view`
// and the adaptor that wraps the lambda below are not shown.
297 seq_view
299 [is_legal_alph](char const c) // enforce legal alphabet
300 {
301 if (!is_legal_alph(c))
302 {
303 throw parse_error{std::string{"Encountered an unexpected letter: "}
304 + "char_is_valid_for<"
305 + detail::type_name_as_string<seq_legal_alph_type>
306 + "> evaluated to false on " + detail::make_printable(c)};
307 }
308 return c;
309 })
310 | views::char_to<std::ranges::range_value_t<seq_type>>, // convert to actual target alphabet
312 sequence_size_after = size(sequence);
313 }
314 else // consume, but count
315 {
316 auto it = begin(seq_view);
317 auto it_end = end(seq_view);
318 while (it != it_end)
319 {
320 ++it;
321 ++sequence_size_after;
322 }
323 }
324
// NOTE(review): listing line 325 is missing here; presumably it handles the second ID line -- confirm.
326
327 /* Qualities */
// Take exactly as many non-whitespace characters as sequence characters were read (or counted) above.
328 auto qview = stream_view | std::views::filter(!is_space) // this consumes trailing newline
329 | detail::take_exactly_or_throw(sequence_size_after - sequence_size_before);
330 if constexpr (!detail::decays_to_ignore_v<qual_type>)
331 {
332 std::ranges::copy(qview | views::char_to<std::ranges::range_value_t<qual_type>>,
333 std::back_inserter(qualities));
334 }
335 else
336 {
337 detail::consume(qview);
338 }
339#endif
340 }
341
343 template <typename stream_type, // constraints checked by file
344 typename seq_type, // other constraints checked inside function
345 typename id_type,
346 typename qual_type>
347 void write_sequence_record(stream_type & stream,
348 sequence_file_output_options const & options,
349 seq_type && sequence,
350 id_type && id,
351 qual_type && qualities)
352 {
353 seqan3::detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
354
355 // ID
356 if constexpr (detail::decays_to_ignore_v<id_type>)
357 {
358 throw std::logic_error{"The ID field may not be set to ignore when writing FASTQ files."};
359 }
360 else
361 {
362 if (std::ranges::empty(id)) //[[unlikely]]
363 throw std::runtime_error{"The ID field may not be empty when writing FASTQ files."};
364
365 stream_it = '@';
366 stream_it.write_range(id);
367 stream_it.write_end_of_line(options.add_carriage_return);
368 }
369
370 // Sequence
371 if constexpr (detail::decays_to_ignore_v<seq_type>)
372 {
373 throw std::logic_error{
374 "The SEQ and SEQ_QUAL fields may not both be set to ignore when writing FASTQ files."};
375 }
376 else
377 {
378 if (std::ranges::empty(sequence)) //[[unlikely]]
379 throw std::runtime_error{"The SEQ field may not be empty when writing FASTQ files."};
380
381 stream_it.write_range(sequence | views::to_char);
382 stream_it.write_end_of_line(options.add_carriage_return);
383 }
384
385 // 2nd ID-line
386 if constexpr (!detail::decays_to_ignore_v<id_type>)
387 {
388 stream_it = '+';
389
390 if (options.fastq_double_id)
391 stream_it.write_range(id);
392
393 stream_it.write_end_of_line(options.add_carriage_return);
394 }
395
396 // Quality line
397 if constexpr (detail::decays_to_ignore_v<qual_type>)
398 {
399 throw std::logic_error{
400 "The QUAL and SEQ_QUAL fields may not both be set to ignore when writing FASTQ files."};
401 }
402 else
403 {
404 if (std::ranges::empty(qualities)) //[[unlikely]]
405 throw std::runtime_error{"The SEQ field may not be empty when writing FASTQ files."};
406
407 if constexpr (std::ranges::sized_range<seq_type> && std::ranges::sized_range<qual_type>)
408 {
409 assert(std::ranges::size(sequence) == std::ranges::size(qualities));
410 }
411
412 stream_it.write_range(qualities | views::to_char);
413 stream_it.write_end_of_line(options.add_carriage_return);
414 }
415 }
416};
417
418} // namespace seqan3
Provides aliases for qualified.
Core alphabet concept and free function/type trait wrappers.
T back_inserter(T... args)
T begin(T... args)
Provides alphabet adaptations for standard char types.
Provides seqan3::views::char_to.
Functionally the same as std::ostreambuf_iterator, but offers writing a range more efficiently.
Definition: fast_ostreambuf_iterator.hpp:40
The FASTQ format.
Definition: format_fastq.hpp:80
void read_sequence_record(stream_type &stream, sequence_file_input_options< seq_legal_alph_type > const &options, stream_pos_type &position_buffer, seq_type &sequence, id_type &id, qual_type &qualities)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_fastq.hpp:105
format_fastq() noexcept=default
Defaulted.
void write_sequence_record(stream_type &stream, sequence_file_output_options const &options, seq_type &&sequence, id_type &&id, qual_type &&qualities)
Write the given fields to the specified stream.
Definition: format_fastq.hpp:347
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_fastq.hpp:95
T copy(T... args)
Provides various utility functions.
Provides various transformation traits used by the range module.
Provides seqan3::dna5, container aliases and string literals.
Provides seqan3::detail::fast_ostreambuf_iterator.
auto const to_char
A view that calls seqan3::to_char() on each element in the input range.
Definition: to_char.hpp:63
auto const char_to
A view over an alphabet, given a range of characters.
Definition: char_to.hpp:67
constexpr auto assign_char_to
Assign a character to an alphabet object.
Definition: alphabet/concept.hpp:524
constexpr auto char_is_valid_for
Returns whether a character is in the valid set of a seqan3::alphabet (usually implies a bijective ma...
Definition: alphabet/concept.hpp:670
constexpr void consume(rng_t &&rng)
Iterate over a range (consumes single-pass input ranges).
Definition: core/range/detail/misc.hpp:28
constexpr auto take_exactly_or_throw
A view adaptor that returns the first size elements from the underlying range and also exposes size i...
Definition: take_exactly_view.hpp:590
constexpr auto take_until_or_throw
A view adaptor that returns elements from the underlying range until the functor evaluates to true (t...
Definition: take_until_view.hpp:574
constexpr auto istreambuf
A view factory that returns a view over the stream buffer of an input stream.
Definition: istreambuf_view.hpp:107
constexpr auto take_line_or_throw
A view adaptor that returns a single line from the underlying range (throws if there is no end-of-line).
Definition: take_line_view.hpp:85
constexpr auto is_blank
Checks whether c is a blank character.
Definition: predicate.hpp:142
std::string make_printable(char const c)
Returns a printable value for the given character c.
Definition: pretty_print.hpp:48
constexpr auto is_space
Checks whether c is a space character.
Definition: predicate.hpp:125
constexpr auto is_cntrl
Checks whether c is a control character.
Definition: predicate.hpp:90
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: type_list/traits.hpp:470
constexpr size_t size
The size of a type pack.
Definition: type_pack/traits.hpp:146
Provides seqan3::detail::ignore_output_iterator for writing to null stream.
This concept encompasses exactly the types char, signed char, unsigned char, wchar_t,...
The generic concept for a (biological) sequence.
Provides various utility functions.
Provides seqan3::detail::istreambuf.
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
SeqAn specific customisations in the standard namespace.
Provides character predicates for tokenisation.
T push_back(T... args)
Provides seqan3::sequence_file_input_format and auxiliary classes.
Provides seqan3::sequence_file_input_options.
Provides seqan3::sequence_file_output_format and auxiliary classes.
Provides seqan3::sequence_file_output_options.
Thrown if there is a parse error, such as reading an unexpected character from an input stream.
Definition: io/exception.hpp:48
The options type defines various option members that influence the behaviour of all or some formats.
Definition: sequence_file/input_options.hpp:27
bool truncate_ids
Read the ID string only up until the first whitespace character.
Definition: sequence_file/input_options.hpp:29
The options type defines various option members that influence the behaviour of all or some formats.
Definition: sequence_file/output_options.hpp:26
bool add_carriage_return
The default plain text line-ending is "\n", but on Windows an additional carriage return is recommended.
Definition: sequence_file/output_options.hpp:42
bool fastq_double_id
Whether to write the ID only '@' or also after '+' line.
Definition: sequence_file/output_options.hpp:37
Thrown if I/O was expecting more input (e.g. a delimiter or a new line), but the end of input was reached.
Definition: io/exception.hpp:78
Provides seqan3::views::take_exactly and seqan3::views::take_exactly_or_throw.
Provides seqan3::detail::take_line and seqan3::detail::take_line_or_throw.
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
Provides seqan3::views::to_char.
Provides traits to inspect some information of a type, for example its name.
Provides concepts that do not have equivalents in C++20.