]>
Commit | Line | Data |
---|---|---|
dfeec247 XL |
1 | /*! |
2 | Utilities for working with I/O using byte strings. | |
3 | ||
4 | This module currently only exports a single trait, `BufReadExt`, which provides | |
5 | facilities for conveniently and efficiently working with lines as byte strings. | |
6 | ||
7 | More APIs may be added in the future. | |
8 | */ | |
9 | ||
10 | use std::io; | |
11 | ||
f035d41b XL |
12 | use ext_slice::ByteSlice; |
13 | use ext_vec::ByteVec; | |
dfeec247 XL |
14 | |
15 | /// An extention trait for | |
16 | /// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html) | |
17 | /// which provides convenience APIs for dealing with byte strings. | |
18 | pub trait BufReadExt: io::BufRead { | |
19 | /// Returns an iterator over the lines of this reader, where each line | |
20 | /// is represented as a byte string. | |
21 | /// | |
f035d41b | 22 | /// Each item yielded by this iterator is a `io::Result<Vec<u8>>`, where |
dfeec247 XL |
23 | /// an error is yielded if there was a problem reading from the underlying |
24 | /// reader. | |
25 | /// | |
26 | /// On success, the next line in the iterator is returned. The line does | |
27 | /// *not* contain a trailing `\n` or `\r\n`. | |
28 | /// | |
29 | /// # Examples | |
30 | /// | |
31 | /// Basic usage: | |
32 | /// | |
33 | /// ``` | |
34 | /// use std::io; | |
35 | /// | |
36 | /// use bstr::io::BufReadExt; | |
37 | /// | |
38 | /// # fn example() -> Result<(), io::Error> { | |
39 | /// let cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor"); | |
40 | /// | |
41 | /// let mut lines = vec![]; | |
42 | /// for result in cursor.byte_lines() { | |
43 | /// let line = result?; | |
44 | /// lines.push(line); | |
45 | /// } | |
46 | /// assert_eq!(lines.len(), 3); | |
f035d41b XL |
47 | /// assert_eq!(lines[0], "lorem".as_bytes()); |
48 | /// assert_eq!(lines[1], "ipsum".as_bytes()); | |
49 | /// assert_eq!(lines[2], "dolor".as_bytes()); | |
dfeec247 XL |
50 | /// # Ok(()) }; example().unwrap() |
51 | /// ``` | |
f035d41b XL |
52 | fn byte_lines(self) -> ByteLines<Self> |
53 | where | |
54 | Self: Sized, | |
55 | { | |
dfeec247 XL |
56 | ByteLines { buf: self } |
57 | } | |
58 | ||
f035d41b XL |
59 | /// Returns an iterator over byte-terminated records of this reader, where |
60 | /// each record is represented as a byte string. | |
61 | /// | |
62 | /// Each item yielded by this iterator is a `io::Result<Vec<u8>>`, where | |
63 | /// an error is yielded if there was a problem reading from the underlying | |
64 | /// reader. | |
65 | /// | |
66 | /// On success, the next record in the iterator is returned. The record | |
67 | /// does *not* contain its trailing terminator. | |
68 | /// | |
69 | /// Note that calling `byte_records(b'\n')` differs from `byte_lines()` in | |
70 | /// that it has no special handling for `\r`. | |
71 | /// | |
72 | /// # Examples | |
73 | /// | |
74 | /// Basic usage: | |
75 | /// | |
76 | /// ``` | |
77 | /// use std::io; | |
78 | /// | |
79 | /// use bstr::io::BufReadExt; | |
80 | /// | |
81 | /// # fn example() -> Result<(), io::Error> { | |
82 | /// let cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor"); | |
83 | /// | |
84 | /// let mut records = vec![]; | |
85 | /// for result in cursor.byte_records(b'\x00') { | |
86 | /// let record = result?; | |
87 | /// records.push(record); | |
88 | /// } | |
89 | /// assert_eq!(records.len(), 3); | |
90 | /// assert_eq!(records[0], "lorem".as_bytes()); | |
91 | /// assert_eq!(records[1], "ipsum".as_bytes()); | |
92 | /// assert_eq!(records[2], "dolor".as_bytes()); | |
93 | /// # Ok(()) }; example().unwrap() | |
94 | /// ``` | |
95 | fn byte_records(self, terminator: u8) -> ByteRecords<Self> | |
96 | where | |
97 | Self: Sized, | |
98 | { | |
99 | ByteRecords { terminator, buf: self } | |
100 | } | |
101 | ||
dfeec247 XL |
102 | /// Executes the given closure on each line in the underlying reader. |
103 | /// | |
104 | /// If the closure returns an error (or if the underlying reader returns an | |
105 | /// error), then iteration is stopped and the error is returned. If false | |
106 | /// is returned, then iteration is stopped and no error is returned. | |
107 | /// | |
108 | /// The closure given is called on exactly the same values as yielded by | |
109 | /// the [`byte_lines`](trait.BufReadExt.html#method.byte_lines) | |
110 | /// iterator. Namely, lines do _not_ contain trailing `\n` or `\r\n` bytes. | |
111 | /// | |
112 | /// This routine is useful for iterating over lines as quickly as | |
113 | /// possible. Namely, a single allocation is reused for each line. | |
114 | /// | |
115 | /// # Examples | |
116 | /// | |
117 | /// Basic usage: | |
118 | /// | |
119 | /// ``` | |
120 | /// use std::io; | |
121 | /// | |
122 | /// use bstr::io::BufReadExt; | |
123 | /// | |
124 | /// # fn example() -> Result<(), io::Error> { | |
125 | /// let cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor"); | |
126 | /// | |
127 | /// let mut lines = vec![]; | |
128 | /// cursor.for_byte_line(|line| { | |
f035d41b | 129 | /// lines.push(line.to_vec()); |
dfeec247 XL |
130 | /// Ok(true) |
131 | /// })?; | |
132 | /// assert_eq!(lines.len(), 3); | |
f035d41b XL |
133 | /// assert_eq!(lines[0], "lorem".as_bytes()); |
134 | /// assert_eq!(lines[1], "ipsum".as_bytes()); | |
135 | /// assert_eq!(lines[2], "dolor".as_bytes()); | |
dfeec247 XL |
136 | /// # Ok(()) }; example().unwrap() |
137 | /// ``` | |
f035d41b XL |
138 | fn for_byte_line<F>(self, mut for_each_line: F) -> io::Result<()> |
139 | where | |
140 | Self: Sized, | |
141 | F: FnMut(&[u8]) -> io::Result<bool>, | |
142 | { | |
143 | self.for_byte_line_with_terminator(|line| { | |
144 | for_each_line(&trim_line_slice(&line)) | |
145 | }) | |
146 | } | |
147 | ||
148 | /// Executes the given closure on each byte-terminated record in the | |
149 | /// underlying reader. | |
150 | /// | |
151 | /// If the closure returns an error (or if the underlying reader returns an | |
152 | /// error), then iteration is stopped and the error is returned. If false | |
153 | /// is returned, then iteration is stopped and no error is returned. | |
154 | /// | |
155 | /// The closure given is called on exactly the same values as yielded by | |
156 | /// the [`byte_records`](trait.BufReadExt.html#method.byte_records) | |
157 | /// iterator. Namely, records do _not_ contain a trailing terminator byte. | |
158 | /// | |
159 | /// This routine is useful for iterating over records as quickly as | |
160 | /// possible. Namely, a single allocation is reused for each record. | |
161 | /// | |
162 | /// # Examples | |
163 | /// | |
164 | /// Basic usage: | |
165 | /// | |
166 | /// ``` | |
167 | /// use std::io; | |
168 | /// | |
169 | /// use bstr::io::BufReadExt; | |
170 | /// | |
171 | /// # fn example() -> Result<(), io::Error> { | |
172 | /// let cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor"); | |
173 | /// | |
174 | /// let mut records = vec![]; | |
175 | /// cursor.for_byte_record(b'\x00', |record| { | |
176 | /// records.push(record.to_vec()); | |
177 | /// Ok(true) | |
178 | /// })?; | |
179 | /// assert_eq!(records.len(), 3); | |
180 | /// assert_eq!(records[0], "lorem".as_bytes()); | |
181 | /// assert_eq!(records[1], "ipsum".as_bytes()); | |
182 | /// assert_eq!(records[2], "dolor".as_bytes()); | |
183 | /// # Ok(()) }; example().unwrap() | |
184 | /// ``` | |
185 | fn for_byte_record<F>( | |
186 | self, | |
187 | terminator: u8, | |
188 | mut for_each_record: F, | |
dfeec247 | 189 | ) -> io::Result<()> |
f035d41b XL |
190 | where |
191 | Self: Sized, | |
192 | F: FnMut(&[u8]) -> io::Result<bool>, | |
dfeec247 | 193 | { |
f035d41b XL |
194 | self.for_byte_record_with_terminator(terminator, |chunk| { |
195 | for_each_record(&trim_record_slice(&chunk, terminator)) | |
196 | }) | |
dfeec247 XL |
197 | } |
198 | ||
199 | /// Executes the given closure on each line in the underlying reader. | |
200 | /// | |
201 | /// If the closure returns an error (or if the underlying reader returns an | |
202 | /// error), then iteration is stopped and the error is returned. If false | |
203 | /// is returned, then iteration is stopped and no error is returned. | |
204 | /// | |
205 | /// Unlike | |
206 | /// [`for_byte_line`](trait.BufReadExt.html#method.for_byte_line), | |
207 | /// the lines given to the closure *do* include the line terminator, if one | |
208 | /// exists. | |
209 | /// | |
210 | /// This routine is useful for iterating over lines as quickly as | |
211 | /// possible. Namely, a single allocation is reused for each line. | |
212 | /// | |
f035d41b XL |
213 | /// This is identical to `for_byte_record_with_terminator` with a |
214 | /// terminator of `\n`. | |
215 | /// | |
dfeec247 XL |
216 | /// # Examples |
217 | /// | |
218 | /// Basic usage: | |
219 | /// | |
220 | /// ``` | |
221 | /// use std::io; | |
222 | /// | |
223 | /// use bstr::io::BufReadExt; | |
224 | /// | |
225 | /// # fn example() -> Result<(), io::Error> { | |
226 | /// let cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor"); | |
227 | /// | |
228 | /// let mut lines = vec![]; | |
229 | /// cursor.for_byte_line_with_terminator(|line| { | |
f035d41b | 230 | /// lines.push(line.to_vec()); |
dfeec247 XL |
231 | /// Ok(true) |
232 | /// })?; | |
233 | /// assert_eq!(lines.len(), 3); | |
f035d41b XL |
234 | /// assert_eq!(lines[0], "lorem\n".as_bytes()); |
235 | /// assert_eq!(lines[1], "ipsum\r\n".as_bytes()); | |
236 | /// assert_eq!(lines[2], "dolor".as_bytes()); | |
dfeec247 XL |
237 | /// # Ok(()) }; example().unwrap() |
238 | /// ``` | |
239 | fn for_byte_line_with_terminator<F>( | |
f035d41b XL |
240 | self, |
241 | for_each_line: F, | |
242 | ) -> io::Result<()> | |
243 | where | |
244 | Self: Sized, | |
245 | F: FnMut(&[u8]) -> io::Result<bool>, | |
246 | { | |
247 | self.for_byte_record_with_terminator(b'\n', for_each_line) | |
248 | } | |
249 | ||
250 | /// Executes the given closure on each byte-terminated record in the | |
251 | /// underlying reader. | |
252 | /// | |
253 | /// If the closure returns an error (or if the underlying reader returns an | |
254 | /// error), then iteration is stopped and the error is returned. If false | |
255 | /// is returned, then iteration is stopped and no error is returned. | |
256 | /// | |
257 | /// Unlike | |
258 | /// [`for_byte_record`](trait.BufReadExt.html#method.for_byte_record), | |
259 | /// the lines given to the closure *do* include the record terminator, if | |
260 | /// one exists. | |
261 | /// | |
262 | /// This routine is useful for iterating over records as quickly as | |
263 | /// possible. Namely, a single allocation is reused for each record. | |
264 | /// | |
265 | /// # Examples | |
266 | /// | |
267 | /// Basic usage: | |
268 | /// | |
269 | /// ``` | |
270 | /// use std::io; | |
271 | /// | |
272 | /// use bstr::B; | |
273 | /// use bstr::io::BufReadExt; | |
274 | /// | |
275 | /// # fn example() -> Result<(), io::Error> { | |
276 | /// let cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor"); | |
277 | /// | |
278 | /// let mut records = vec![]; | |
279 | /// cursor.for_byte_record_with_terminator(b'\x00', |record| { | |
280 | /// records.push(record.to_vec()); | |
281 | /// Ok(true) | |
282 | /// })?; | |
283 | /// assert_eq!(records.len(), 3); | |
284 | /// assert_eq!(records[0], B(b"lorem\x00")); | |
285 | /// assert_eq!(records[1], B("ipsum\x00")); | |
286 | /// assert_eq!(records[2], B("dolor")); | |
287 | /// # Ok(()) }; example().unwrap() | |
288 | /// ``` | |
289 | fn for_byte_record_with_terminator<F>( | |
dfeec247 | 290 | mut self, |
f035d41b XL |
291 | terminator: u8, |
292 | mut for_each_record: F, | |
dfeec247 | 293 | ) -> io::Result<()> |
f035d41b XL |
294 | where |
295 | Self: Sized, | |
296 | F: FnMut(&[u8]) -> io::Result<bool>, | |
dfeec247 | 297 | { |
f035d41b XL |
298 | let mut bytes = vec![]; |
299 | let mut res = Ok(()); | |
300 | let mut consumed = 0; | |
301 | 'outer: loop { | |
302 | // Lend out complete record slices from our buffer | |
303 | { | |
304 | let mut buf = self.fill_buf()?; | |
305 | while let Some(index) = buf.find_byte(terminator) { | |
306 | let (record, rest) = buf.split_at(index + 1); | |
307 | buf = rest; | |
308 | consumed += record.len(); | |
309 | match for_each_record(&record) { | |
310 | Ok(false) => break 'outer, | |
311 | Err(err) => { | |
312 | res = Err(err); | |
313 | break 'outer; | |
314 | } | |
315 | _ => (), | |
316 | } | |
317 | } | |
318 | ||
319 | // Copy the final record fragment to our local buffer. This | |
320 | // saves read_until() from re-scanning a buffer we know | |
321 | // contains no remaining terminators. | |
322 | bytes.extend_from_slice(&buf); | |
323 | consumed += buf.len(); | |
324 | } | |
325 | ||
326 | self.consume(consumed); | |
327 | consumed = 0; | |
328 | ||
329 | // N.B. read_until uses a different version of memchr that may | |
330 | // be slower than the memchr crate that bstr uses. However, this | |
331 | // should only run for a fairly small number of records, assuming a | |
332 | // decent buffer size. | |
333 | self.read_until(terminator, &mut bytes)?; | |
334 | if bytes.is_empty() || !for_each_record(&bytes)? { | |
dfeec247 XL |
335 | break; |
336 | } | |
337 | bytes.clear(); | |
338 | } | |
f035d41b XL |
339 | self.consume(consumed); |
340 | res | |
dfeec247 XL |
341 | } |
342 | } | |
343 | ||
344 | impl<B: io::BufRead> BufReadExt for B {} | |
345 | ||
/// An iterator over lines from an instance of
/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html).
///
/// This iterator is generally created by calling the
/// [`byte_lines`](trait.BufReadExt.html#method.byte_lines)
/// method on the
/// [`BufReadExt`](trait.BufReadExt.html)
/// trait.
#[derive(Debug)]
pub struct ByteLines<B> {
    /// The wrapped reader that lines are pulled from.
    buf: B,
}
358 | ||
f035d41b XL |
/// An iterator over records from an instance of
/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html).
///
/// A byte record is any sequence of bytes terminated by a particular byte
/// chosen by the caller. For example, NUL separated byte strings are said to
/// be NUL-terminated byte records.
///
/// This iterator is generally created by calling the
/// [`byte_records`](trait.BufReadExt.html#method.byte_records)
/// method on the
/// [`BufReadExt`](trait.BufReadExt.html)
/// trait.
#[derive(Debug)]
pub struct ByteRecords<B> {
    /// The wrapped reader that records are pulled from.
    buf: B,
    /// The byte that marks the end of each record.
    terminator: u8,
}
376 | ||
dfeec247 | 377 | impl<B: io::BufRead> Iterator for ByteLines<B> { |
f035d41b | 378 | type Item = io::Result<Vec<u8>>; |
dfeec247 | 379 | |
f035d41b XL |
380 | fn next(&mut self) -> Option<io::Result<Vec<u8>>> { |
381 | let mut bytes = vec![]; | |
382 | match self.buf.read_until(b'\n', &mut bytes) { | |
dfeec247 XL |
383 | Err(e) => Some(Err(e)), |
384 | Ok(0) => None, | |
385 | Ok(_) => { | |
386 | trim_line(&mut bytes); | |
387 | Some(Ok(bytes)) | |
388 | } | |
389 | } | |
390 | } | |
391 | } | |
392 | ||
f035d41b XL |
393 | impl<B: io::BufRead> Iterator for ByteRecords<B> { |
394 | type Item = io::Result<Vec<u8>>; | |
395 | ||
396 | fn next(&mut self) -> Option<io::Result<Vec<u8>>> { | |
397 | let mut bytes = vec![]; | |
398 | match self.buf.read_until(self.terminator, &mut bytes) { | |
399 | Err(e) => Some(Err(e)), | |
400 | Ok(0) => None, | |
401 | Ok(_) => { | |
402 | trim_record(&mut bytes, self.terminator); | |
403 | Some(Ok(bytes)) | |
404 | } | |
405 | } | |
406 | } | |
407 | } | |
408 | ||
409 | fn trim_line(line: &mut Vec<u8>) { | |
410 | if line.last_byte() == Some(b'\n') { | |
dfeec247 | 411 | line.pop_byte(); |
f035d41b | 412 | if line.last_byte() == Some(b'\r') { |
dfeec247 XL |
413 | line.pop_byte(); |
414 | } | |
415 | } | |
416 | } | |
f035d41b XL |
417 | |
/// Returns `line` without one trailing line terminator (`\n` or `\r\n`).
///
/// A `\r` is only dropped when it directly precedes a dropped `\n`; a lone
/// trailing `\r` is kept, and a slice without a terminator is returned
/// unchanged.
fn trim_line_slice(line: &[u8]) -> &[u8] {
    match line.split_last() {
        Some((&b'\n', head)) => match head.split_last() {
            Some((&b'\r', rest)) => rest,
            _ => head,
        },
        _ => line,
    }
}
427 | ||
428 | fn trim_record(record: &mut Vec<u8>, terminator: u8) { | |
429 | if record.last_byte() == Some(terminator) { | |
430 | record.pop_byte(); | |
431 | } | |
432 | } | |
433 | ||
/// Returns `record` without a single trailing `terminator` byte, if the
/// record ends with one. At most one byte is dropped; otherwise the slice is
/// returned unchanged.
fn trim_record_slice(record: &[u8], terminator: u8) -> &[u8] {
    match record.split_last() {
        Some((&last, head)) if last == terminator => head,
        _ => record,
    }
}
440 | ||
#[cfg(test)]
mod tests {
    use super::BufReadExt;
    use bstring::BString;

    /// Collects every line passed to the `for_byte_line` closure, i.e. with
    /// line terminators already stripped.
    fn collect_lines<B: AsRef<[u8]>>(slice: B) -> Vec<BString> {
        let mut lines = vec![];
        slice
            .as_ref()
            .for_byte_line(|line| {
                lines.push(BString::from(line.to_vec()));
                Ok(true)
            })
            .unwrap();
        lines
    }

    /// Collects every line passed to the `for_byte_line_with_terminator`
    /// closure, i.e. with line terminators still attached.
    fn collect_lines_term<B: AsRef<[u8]>>(slice: B) -> Vec<BString> {
        let mut lines = vec![];
        slice
            .as_ref()
            .for_byte_line_with_terminator(|line| {
                lines.push(BString::from(line.to_vec()));
                Ok(true)
            })
            .unwrap();
        lines
    }

    #[test]
    fn lines_without_terminator() {
        assert_eq!(collect_lines(""), Vec::<BString>::new());

        assert_eq!(collect_lines("\n"), vec![""]);
        assert_eq!(collect_lines("\n\n"), vec!["", ""]);
        assert_eq!(collect_lines("a\nb\n"), vec!["a", "b"]);
        assert_eq!(collect_lines("a\nb"), vec!["a", "b"]);
        assert_eq!(collect_lines("abc\nxyz\n"), vec!["abc", "xyz"]);
        assert_eq!(collect_lines("abc\nxyz"), vec!["abc", "xyz"]);

        assert_eq!(collect_lines("\r\n"), vec![""]);
        assert_eq!(collect_lines("\r\n\r\n"), vec!["", ""]);
        assert_eq!(collect_lines("a\r\nb\r\n"), vec!["a", "b"]);
        assert_eq!(collect_lines("a\r\nb"), vec!["a", "b"]);
        assert_eq!(collect_lines("abc\r\nxyz\r\n"), vec!["abc", "xyz"]);
        assert_eq!(collect_lines("abc\r\nxyz"), vec!["abc", "xyz"]);

        // A bare `\r` is not a line terminator.
        assert_eq!(collect_lines("abc\rxyz"), vec!["abc\rxyz"]);
    }

    #[test]
    fn lines_with_terminator() {
        assert_eq!(collect_lines_term(""), Vec::<BString>::new());

        assert_eq!(collect_lines_term("\n"), vec!["\n"]);
        assert_eq!(collect_lines_term("\n\n"), vec!["\n", "\n"]);
        assert_eq!(collect_lines_term("a\nb\n"), vec!["a\n", "b\n"]);
        assert_eq!(collect_lines_term("a\nb"), vec!["a\n", "b"]);
        assert_eq!(collect_lines_term("abc\nxyz\n"), vec!["abc\n", "xyz\n"]);
        assert_eq!(collect_lines_term("abc\nxyz"), vec!["abc\n", "xyz"]);

        assert_eq!(collect_lines_term("\r\n"), vec!["\r\n"]);
        assert_eq!(collect_lines_term("\r\n\r\n"), vec!["\r\n", "\r\n"]);
        assert_eq!(collect_lines_term("a\r\nb\r\n"), vec!["a\r\n", "b\r\n"]);
        assert_eq!(collect_lines_term("a\r\nb"), vec!["a\r\n", "b"]);
        assert_eq!(
            collect_lines_term("abc\r\nxyz\r\n"),
            vec!["abc\r\n", "xyz\r\n"]
        );
        assert_eq!(collect_lines_term("abc\r\nxyz"), vec!["abc\r\n", "xyz"]);

        // A bare `\r` is not a line terminator.
        assert_eq!(collect_lines_term("abc\rxyz"), vec!["abc\rxyz"]);
    }
}