//! Random access for PXAR files.

use std::ffi::{OsStr, OsString};
use std::io;
use std::mem::{self, size_of, size_of_val, MaybeUninit};
use std::ops::Range;
use std::os::unix::ffi::{OsStrExt, OsStringExt};
use std::path::{Path, PathBuf};
use std::pin::Pin;
use std::sync::Arc;
use std::task::{Context, Poll};

use endian_trait::Endian;

use crate::binary_tree_array;
use crate::decoder::{self, DecoderImpl};
use crate::format::{self, GoodbyeItem};
use crate::poll_fn::poll_fn;
use crate::util;
use crate::{Entry, EntryKind};

#[doc(hidden)]
pub mod aio;
pub mod cache;
pub mod sync;

#[doc(inline)]
pub use sync::{Accessor, DirEntry, Directory, FileEntry, ReadDir};

use cache::Cache;

/// Random access read implementation.
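///
/// # Example
///
/// A minimal sketch of an implementation over an in-memory buffer (`SliceReader` is illustrative
/// and not part of this crate); the data is always available, so the task context is ignored:
///
/// ```ignore
/// struct SliceReader(Vec<u8>);
///
/// impl ReadAt for SliceReader {
///     fn poll_read_at(
///         self: Pin<&Self>,
///         _cx: &mut Context,
///         buf: &mut [u8],
///         offset: u64,
///     ) -> Poll<io::Result<usize>> {
///         // Clamp the requested range to the buffer and copy out what fits.
///         let data = &self.0;
///         let start = data.len().min(offset as usize);
///         let len = buf.len().min(data.len() - start);
///         buf[..len].copy_from_slice(&data[start..start + len]);
///         Poll::Ready(Ok(len))
///     }
/// }
/// ```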
pub trait ReadAt {
    fn poll_read_at(
        self: Pin<&Self>,
        cx: &mut Context,
        buf: &mut [u8],
        offset: u64,
    ) -> Poll<io::Result<usize>>;
}

/// We do not want to bother with actual polling, so we implement `async fn` variants of the above
/// on `dyn ReadAt`.
///
/// The reason why this is not an internal `ReadAtExt` trait like `AsyncReadExt` is simply that
/// we'd then need to define all the `Future` types they return manually and explicitly. Since we
/// have no use for them, all we want is the ability to use `async fn`...
///
/// The downside is that we need some `(&self.input as &dyn ReadAt)` casts in the code below, but
/// that's fine.
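///
/// For instance, reading a header and its payload from a generic `input: T` where `T: ReadAt`
/// looks roughly like this (a sketch of the pattern used by `read_filename_entry` further down):
///
/// ```ignore
/// let head: format::Header = (&self.input as &dyn ReadAt).read_entry_at(file_ofs).await?;
/// let payload = (&self.input as &dyn ReadAt)
///     .read_exact_data_at(head.content_size() as usize, file_ofs + size_of_val(&head) as u64)
///     .await?;
/// ```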
impl<'a> dyn ReadAt + 'a {
    /// Awaitable version of `poll_read_at`.
    async fn read_at(&self, buf: &mut [u8], offset: u64) -> io::Result<usize> {
        poll_fn(|cx| unsafe { Pin::new_unchecked(self).poll_read_at(cx, buf, offset) }).await
    }

    /// `read_exact_at` - since that's what we _actually_ want most of the time.
    async fn read_exact_at(&self, mut buf: &mut [u8], mut offset: u64) -> io::Result<()> {
        while !buf.is_empty() {
            match self.read_at(buf, offset).await? {
                0 => io_bail!("unexpected EOF"),
                got => {
                    buf = &mut buf[got..];
                    offset += got as u64;
                }
            }
        }
        Ok(())
    }

    /// Helper to read into an `Endian`-implementing `struct`.
    async fn read_entry_at<T: Endian>(&self, offset: u64) -> io::Result<T> {
        let mut data = MaybeUninit::<T>::uninit();
        let buf =
            unsafe { std::slice::from_raw_parts_mut(data.as_mut_ptr() as *mut u8, size_of::<T>()) };
        self.read_exact_at(buf, offset).await?;
        Ok(unsafe { data.assume_init().from_le() })
    }

    /// Helper to read into an allocated byte vector.
    async fn read_exact_data_at(&self, size: usize, offset: u64) -> io::Result<Vec<u8>> {
        let mut data = util::vec_new(size);
        self.read_exact_at(&mut data[..], offset).await?;
        Ok(data)
    }
}

/// Allow using trait objects for `T: ReadAt`.
impl<'a> ReadAt for &(dyn ReadAt + 'a) {
    fn poll_read_at(
        self: Pin<&Self>,
        cx: &mut Context,
        buf: &mut [u8],
        offset: u64,
    ) -> Poll<io::Result<usize>> {
        unsafe {
            Pin::new_unchecked(&**self).poll_read_at(cx, buf, offset)
        }
    }
}

#[derive(Clone)]
struct Caches {
    /// The goodbye table cache maps goodbye table offsets to cache entries.
    gbt_cache: Option<Arc<dyn Cache<u64, [GoodbyeItem]> + Send + Sync>>,
}

impl Default for Caches {
    fn default() -> Self {
        Self { gbt_cache: None }
    }
}

/// The random access state machine implementation.
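///
/// Opening an archive and looking up entries goes through this type; the public `sync` and `aio`
/// accessors build on it. A rough sketch of the flow, given some `input` and its byte `size`:
///
/// ```ignore
/// let accessor = AccessorImpl::new(input, size).await?;
/// let root = accessor.open_root_ref().await?;
/// let entry = root.lookup(Path::new("some/file.txt")).await?;
/// ```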
pub(crate) struct AccessorImpl<T> {
    input: T,
    size: u64,
    caches: Arc<Caches>,
}

impl<T: ReadAt> AccessorImpl<T> {
    pub async fn new(input: T, size: u64) -> io::Result<Self> {
        if size < (size_of::<GoodbyeItem>() as u64) {
            io_bail!("too small to contain a pxar archive");
        }

        Ok(Self {
            input,
            size,
            caches: Arc::new(Caches::default()),
        })
    }

    pub async fn open_root_ref<'a>(&'a self) -> io::Result<DirectoryImpl<&'a dyn ReadAt>> {
        DirectoryImpl::open_at_end(
            &self.input as &dyn ReadAt,
            self.size,
            "/".into(),
            Arc::clone(&self.caches),
        )
        .await
    }

    pub fn set_goodbye_table_cache(
        &mut self,
        cache: Option<Arc<dyn Cache<u64, [GoodbyeItem]> + Send + Sync>>,
    ) {
        let new_caches = Arc::new(Caches {
            gbt_cache: cache,
            ..*self.caches
        });
        self.caches = new_caches;
    }
}

impl<T: Clone + ReadAt> AccessorImpl<T> {
    pub async fn open_root(&self) -> io::Result<DirectoryImpl<T>> {
        DirectoryImpl::open_at_end(
            self.input.clone(),
            self.size,
            "/".into(),
            Arc::clone(&self.caches),
        )
        .await
    }

    /// Allow opening a directory at a specified end offset (unsafe: the caller must ensure the
    /// offset actually points at the end of a directory entry in this archive).
    pub async unsafe fn open_dir_at_end(&self, offset: u64) -> io::Result<DirectoryImpl<T>> {
        DirectoryImpl::open_at_end(
            self.input.clone(),
            offset,
            "/".into(),
            Arc::clone(&self.caches),
        )
        .await
    }
}

/// The directory random-access state machine implementation.
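///
/// Within the archive, a directory occupies the byte range `entry_ofs..entry_ofs + size`. Derived
/// from the offset math in `open_at_end`, the layout is roughly:
///
/// ```text
/// entry_ofs               goodbye_ofs                                       end_offset
/// |  directory ENTRY ...  |  GOODBYE header | goodbye items ... | tail item |
/// ```
///
/// with `tail.offset == goodbye_ofs - entry_ofs` and `tail.size == end_offset - goodbye_ofs`.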
pub(crate) struct DirectoryImpl<T> {
    input: T,
    entry_ofs: u64,
    goodbye_ofs: u64,
    size: u64,
    table: Arc<[GoodbyeItem]>,
    path: PathBuf,
    caches: Arc<Caches>,
}

impl<T: Clone + ReadAt> DirectoryImpl<T> {
    /// Open a directory ending at the specified position.
    async fn open_at_end(
        input: T,
        end_offset: u64,
        path: PathBuf,
        caches: Arc<Caches>,
    ) -> io::Result<DirectoryImpl<T>> {
        let tail = Self::read_tail_entry(&input, end_offset).await?;

        if end_offset < tail.size {
            io_bail!("goodbye tail size out of range");
        }

        let goodbye_ofs = end_offset - tail.size;

        if goodbye_ofs < tail.offset {
            io_bail!("goodbye offset out of range");
        }

        let entry_ofs = goodbye_ofs - tail.offset;
        let size = end_offset - entry_ofs;

        let table: Option<Arc<[GoodbyeItem]>> = caches
            .gbt_cache
            .as_ref()
            .and_then(|cache| cache.fetch(goodbye_ofs));

        let mut this = Self {
            input,
            entry_ofs,
            goodbye_ofs,
            size,
            table: table.as_ref().map_or_else(|| Arc::new([]), Arc::clone),
            path,
            caches,
        };

        // sanity check:
        if this.table_size() % (size_of::<GoodbyeItem>() as u64) != 0 {
            io_bail!("invalid goodbye table size: {}", this.table_size());
        }

        if table.is_none() {
            this.table = this.load_table().await?;
            if let Some(ref cache) = this.caches.gbt_cache {
                cache.insert(goodbye_ofs, Arc::clone(&this.table));
            }
        }

        Ok(this)
    }

    /// Load the entire goodbye table.
    async fn load_table(&self) -> io::Result<Arc<[GoodbyeItem]>> {
        let len = self.len();
        let mut data = Vec::with_capacity(len);
        unsafe {
            data.set_len(len);
            let slice = std::slice::from_raw_parts_mut(
                data.as_mut_ptr() as *mut u8,
                len * size_of::<GoodbyeItem>(),
            );
            (&self.input as &dyn ReadAt)
                .read_exact_at(slice, self.table_offset())
                .await?;
            drop(slice);
        }
        Ok(Arc::from(data))
    }

    #[inline]
    fn end_offset(&self) -> u64 {
        self.entry_ofs + self.size
    }

    #[inline]
    fn entry_range(&self) -> Range<u64> {
        self.entry_ofs..self.end_offset()
    }

    #[inline]
    fn table_size(&self) -> u64 {
        (self.end_offset() - self.goodbye_ofs) - (size_of::<format::Header>() as u64)
    }

    #[inline]
    fn table_offset(&self) -> u64 {
        self.goodbye_ofs + (size_of::<format::Header>() as u64)
    }

    /// Length *excluding* the tail marker!
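    ///
    /// For example, assuming a 24-byte `GoodbyeItem`: a 96-byte table holds four items, the last
    /// of which is the tail marker, so `len()` is 3.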
    #[inline]
    fn len(&self) -> usize {
        (self.table_size() / (size_of::<GoodbyeItem>() as u64)) as usize - 1
    }

    /// Read the goodbye tail and perform some sanity checks.
    async fn read_tail_entry(input: &'_ dyn ReadAt, end_offset: u64) -> io::Result<GoodbyeItem> {
        if end_offset < (size_of::<GoodbyeItem>() as u64) {
            io_bail!("goodbye tail does not fit");
        }

        let tail_offset = end_offset - (size_of::<GoodbyeItem>() as u64);
        let tail: GoodbyeItem = input.read_entry_at(tail_offset).await?;

        if tail.hash != format::PXAR_GOODBYE_TAIL_MARKER {
            io_bail!("no goodbye tail marker found");
        }

        Ok(tail)
    }

    /// Get a decoder for the directory contents.
    pub(crate) async fn decode_full(&self) -> io::Result<DecoderImpl<SeqReadAtAdapter<T>>> {
        let (dir, decoder) = self.decode_one_entry(self.entry_range(), None).await?;
        if !dir.is_dir() {
            io_bail!("decoded entry does not seem to be a directory");
        }
        Ok(decoder)
    }

    async fn get_decoder(
        &self,
        entry_range: Range<u64>,
        file_name: Option<&Path>,
    ) -> io::Result<DecoderImpl<SeqReadAtAdapter<T>>> {
        Ok(DecoderImpl::new_full(
            SeqReadAtAdapter::new(self.input.clone(), entry_range),
            match file_name {
                None => self.path.clone(),
                Some(file) => self.path.join(file),
            },
        )
        .await?)
    }

    async fn decode_one_entry(
        &self,
        entry_range: Range<u64>,
        file_name: Option<&Path>,
    ) -> io::Result<(Entry, DecoderImpl<SeqReadAtAdapter<T>>)> {
        let mut decoder = self.get_decoder(entry_range, file_name).await?;
        let entry = decoder
            .next()
            .await
            .ok_or_else(|| io_format_err!("unexpected EOF while decoding directory entry"))??;
        Ok((entry, decoder))
    }

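    /// Find the position of `hash` in the goodbye table, which is stored in binary search tree
    /// array order (cf. `binary_tree_array`). `start` is the index to begin searching at and
    /// `skip` the number of matching entries to skip over; `lookup_component` uses the latter to
    /// step past hash collisions.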
    fn lookup_hash_position(&self, hash: u64, start: usize, skip: usize) -> Option<usize> {
        binary_tree_array::search_by(&self.table, start, skip, |i| hash.cmp(&i.hash))
    }

    async fn lookup_self(&self) -> io::Result<FileEntryImpl<T>> {
        let (entry, _decoder) = self.decode_one_entry(self.entry_range(), None).await?;
        Ok(FileEntryImpl {
            input: self.input.clone(),
            entry,
            entry_range: self.entry_range(),
            caches: Arc::clone(&self.caches),
        })
    }

    /// Look up a directory entry.
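    ///
    /// The path is resolved component by component; a leading `/` or `.` refers to this directory
    /// itself, `..` is rejected, and `Ok(None)` is returned if any component does not exist. For
    /// example (sketch, with `dir` being a `DirectoryImpl`):
    ///
    /// ```ignore
    /// if let Some(file) = dir.lookup(Path::new("etc/hostname")).await? {
    ///     assert!(!file.entry().is_dir());
    /// }
    /// ```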
    pub async fn lookup(&self, path: &Path) -> io::Result<Option<FileEntryImpl<T>>> {
        let mut cur: Option<FileEntryImpl<T>> = None;

        let mut first = true;
        for component in path.components() {
            use std::path::Component;

            let first = mem::replace(&mut first, false);

            let component = match component {
                Component::Normal(path) => path,
                Component::ParentDir => io_bail!("cannot enter parent directory in archive"),
                Component::RootDir | Component::CurDir if first => {
                    cur = Some(self.lookup_self().await?);
                    continue;
                }
                Component::CurDir => continue,
                _ => io_bail!("invalid component in path"),
            };

            let next = match cur {
                Some(entry) => {
                    entry
                        .enter_directory()
                        .await?
                        .lookup_component(component)
                        .await?
                }
                None => self.lookup_component(component).await?,
            };

            if next.is_none() {
                return Ok(None);
            }

            cur = next;
        }

        Ok(cur)
    }

    /// Look up a single directory entry component (does not handle multiple components in a path).
    pub async fn lookup_component(&self, path: &OsStr) -> io::Result<Option<FileEntryImpl<T>>> {
        let hash = format::hash_filename(path.as_bytes());
        let first_index = match self.lookup_hash_position(hash, 0, 0) {
            Some(index) => index,
            None => return Ok(None),
        };

        // Look up the file name. If the hash matches but the file name does not, keep checking
        // for further entries with the same (duplicate) hash. Once the right entry is found, its
        // GoodbyeItem offset and size together with the decoded Entry are used to build the
        // resulting FileEntryImpl.
        let mut dup = 0;
        loop {
            let index = match self.lookup_hash_position(hash, first_index, dup) {
                Some(index) => index,
                None => return Ok(None),
            };

            let cursor = self.get_cursor(index).await?;
            if cursor.file_name == path {
                return Ok(Some(cursor.decode_entry().await?));
            }

            dup += 1;
        }
    }

    async fn get_cursor<'a>(&'a self, index: usize) -> io::Result<DirEntryImpl<'a, T>> {
        let entry = &self.table[index];
        let file_goodbye_ofs = entry.offset;
        if self.goodbye_ofs < file_goodbye_ofs {
            io_bail!("invalid file offset");
        }

        let file_ofs = self.goodbye_ofs - file_goodbye_ofs;
        let (file_name, entry_ofs) = self.read_filename_entry(file_ofs).await?;

        let entry_range = Range {
            start: entry_ofs,
            end: file_ofs + entry.size,
        };
        if entry_range.end < entry_range.start {
            io_bail!(
                "bad file: invalid entry ranges for {:?}: \
                 start=0x{:x}, file_ofs=0x{:x}, size=0x{:x}",
                file_name,
                entry_ofs,
                file_ofs,
                entry.size,
            );
        }

        Ok(DirEntryImpl {
            dir: self,
            file_name,
            entry_range,
            caches: Arc::clone(&self.caches),
        })
    }

    async fn read_filename_entry(&self, file_ofs: u64) -> io::Result<(PathBuf, u64)> {
        let head: format::Header = (&self.input as &dyn ReadAt).read_entry_at(file_ofs).await?;
        if head.htype != format::PXAR_FILENAME {
            io_bail!("expected PXAR_FILENAME header, found: {:x}", head.htype);
        }

        let mut path = (&self.input as &dyn ReadAt)
            .read_exact_data_at(
                head.content_size() as usize,
                file_ofs + (size_of_val(&head) as u64),
            )
            .await?;

        if path.pop() != Some(0) {
            io_bail!("invalid file name (missing terminating zero)");
        }

        if path.is_empty() {
            io_bail!("invalid empty file name");
        }

        let file_name = PathBuf::from(OsString::from_vec(path));
        format::check_file_name(&file_name)?;

        Ok((file_name, file_ofs + head.full_size()))
    }

    pub fn read_dir(&self) -> ReadDirImpl<T> {
        ReadDirImpl::new(self, 0)
    }
}

/// A file entry retrieved from a `Directory`.
pub(crate) struct FileEntryImpl<T: Clone + ReadAt> {
    input: T,
    entry: Entry,
    entry_range: Range<u64>,
    caches: Arc<Caches>,
}

impl<T: Clone + ReadAt> FileEntryImpl<T> {
    pub async fn enter_directory(&self) -> io::Result<DirectoryImpl<T>> {
        if !self.entry.is_dir() {
            io_bail!("enter_directory() on a non-directory");
        }

        DirectoryImpl::open_at_end(
            self.input.clone(),
            self.entry_range.end,
            self.entry.path.clone(),
            Arc::clone(&self.caches),
        )
        .await
    }

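    /// Open the contents of a regular file for random-access reads.
    ///
    /// This fails for non-file entries and for file entries whose reader did not record a
    /// contents offset. A sketch of typical use (with `file` being a `FileEntryImpl`):
    ///
    /// ```ignore
    /// let contents = file.contents().await?;
    /// let mut buf = vec![0u8; contents.file_size() as usize];
    /// contents.read_at(&mut buf, 0).await?;
    /// ```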
    pub async fn contents(&self) -> io::Result<FileContentsImpl<T>> {
        match self.entry.kind {
            EntryKind::File { offset: None, .. } => {
                io_bail!("cannot open file, reader provided no offset")
            }
            EntryKind::File {
                size,
                offset: Some(offset),
            } => Ok(FileContentsImpl::new(
                self.input.clone(),
                offset..(offset + size),
            )),
            _ => io_bail!("not a file"),
        }
    }

    #[inline]
    pub fn into_entry(self) -> Entry {
        self.entry
    }

    #[inline]
    pub fn entry(&self) -> &Entry {
        &self.entry
    }

    /// Exposed for raw by-offset access methods (use with `open_dir_at_end`).
    #[inline]
    pub fn entry_range(&self) -> Range<u64> {
        self.entry_range.clone()
    }
}

/// An iterator over the contents of a directory.
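///
/// Entries are yielded in goodbye-table order. Since `next()` is an `async fn`, this is not a
/// `std::iter::Iterator`; a typical loop looks like this (sketch, with `dir` being a
/// `DirectoryImpl`):
///
/// ```ignore
/// let mut entries = dir.read_dir();
/// while let Some(entry) = entries.next().await? {
///     println!("{:?}", entry.file_name());
/// }
/// ```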
pub(crate) struct ReadDirImpl<'a, T> {
    dir: &'a DirectoryImpl<T>,
    at: usize,
}

impl<'a, T: Clone + ReadAt> ReadDirImpl<'a, T> {
    fn new(dir: &'a DirectoryImpl<T>, at: usize) -> Self {
        Self { dir, at }
    }

    /// Get the next entry.
    pub async fn next(&mut self) -> io::Result<Option<DirEntryImpl<'a, T>>> {
        if self.at == self.dir.table.len() {
            Ok(None)
        } else {
            let cursor = self.dir.get_cursor(self.at).await?;
            self.at += 1;
            Ok(Some(cursor))
        }
    }

    /// Efficient alternative to `Iterator::skip`.
    #[inline]
    pub fn skip(self, n: usize) -> Self {
        Self {
            at: (self.at + n).min(self.dir.table.len()),
            dir: self.dir,
        }
    }

    /// Efficient alternative to `Iterator::count`.
    #[inline]
    pub fn count(self) -> usize {
        self.dir.table.len()
    }
}

/// A cursor pointing to a file in a directory.
///
/// At this point only the file name has been read, and the position of the actual data has been
/// remembered. This can be upgraded into a `FileEntryImpl` by decoding the entry.
pub(crate) struct DirEntryImpl<'a, T: Clone + ReadAt> {
    dir: &'a DirectoryImpl<T>,
    file_name: PathBuf,
    entry_range: Range<u64>,
    caches: Arc<Caches>,
}

impl<'a, T: Clone + ReadAt> DirEntryImpl<'a, T> {
    pub fn file_name(&self) -> &Path {
        &self.file_name
    }

    async fn decode_entry(&self) -> io::Result<FileEntryImpl<T>> {
        let (entry, _decoder) = self
            .dir
            .decode_one_entry(self.entry_range.clone(), Some(&self.file_name))
            .await?;

        Ok(FileEntryImpl {
            input: self.dir.input.clone(),
            entry,
            entry_range: self.entry_range(),
            caches: Arc::clone(&self.caches),
        })
    }

    /// Exposed for raw by-offset access methods.
    #[inline]
    pub fn entry_range(&self) -> Range<u64> {
        self.entry_range.clone()
    }
}

/// A reader for file contents.
pub(crate) struct FileContentsImpl<T> {
    input: T,

    /// Absolute byte range of the file's contents inside the `input`.
    range: Range<u64>,
}

impl<T: Clone + ReadAt> FileContentsImpl<T> {
    pub fn new(input: T, range: Range<u64>) -> Self {
        Self { input, range }
    }

    #[inline]
    pub fn file_size(&self) -> u64 {
        self.range.end - self.range.start
    }

    async fn read_at(&self, mut buf: &mut [u8], offset: u64) -> io::Result<usize> {
        let size = self.file_size();
        if offset >= size {
            return Ok(0);
        }
        let remaining = size - offset;

        // Clamp the read so it does not run past the end of the file's range.
        if remaining < buf.len() as u64 {
            buf = &mut buf[..(remaining as usize)];
        }

        (&self.input as &dyn ReadAt)
            .read_at(buf, self.range.start + offset)
            .await
    }
}

#[doc(hidden)]
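/// Adapter turning a random-access `ReadAt` into the sequential `decoder::SeqRead` interface,
/// restricted to a given byte `range` of the input. Each read advances `range.start`, so the
/// decoder sees exactly the bytes of one entry; this is what backs the decoders handed out by
/// `DirectoryImpl::get_decoder`.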
pub struct SeqReadAtAdapter<T> {
    input: T,
    range: Range<u64>,
}

impl<T: ReadAt> SeqReadAtAdapter<T> {
    pub fn new(input: T, range: Range<u64>) -> Self {
        if range.end < range.start {
            panic!("invalid range passed to SeqReadAtAdapter::new()");
        }
        Self { input, range }
    }

    #[inline]
    fn remaining(&self) -> usize {
        (self.range.end - self.range.start) as usize
    }
}

impl<T: ReadAt> decoder::SeqRead for SeqReadAtAdapter<T> {
    fn poll_seq_read(
        self: Pin<&mut Self>,
        cx: &mut Context,
        buf: &mut [u8],
    ) -> Poll<io::Result<usize>> {
        let len = buf.len().min(self.remaining());
        let buf = &mut buf[..len];

        let this = unsafe { self.get_unchecked_mut() };

        let got = ready!(unsafe {
            Pin::new_unchecked(&this.input).poll_read_at(cx, buf, this.range.start)
        })?;
        this.range.start += got as u64;
        Poll::Ready(Ok(got))
    }

    fn poll_position(self: Pin<&mut Self>, _cx: &mut Context) -> Poll<Option<io::Result<u64>>> {
        Poll::Ready(Some(Ok(self.range.start)))
    }
}