]> git.proxmox.com Git - rustc.git/blame - library/std/src/sys/unix/kernel_copy.rs
New upstream version 1.68.2+dfsg1
[rustc.git] / library / std / src / sys / unix / kernel_copy.rs
CommitLineData
fc512014
XL
1//! This module contains specializations that can offload `io::copy()` operations on file descriptor
2//! containing types (`File`, `TcpStream`, etc.) to more efficient syscalls than `read(2)` and `write(2)`.
3//!
4//! Specialization is only applied to wholly std-owned types so that user code can't observe
5//! that the `Read` and `Write` traits are not used.
6//!
7//! Since a copy operation involves a reader and writer side where each can consist of different types
8//! and also involve generic wrappers (e.g. `Take`, `BufReader`) it is not practical to specialize
9//! a single method on all possible combinations.
10//!
11//! Instead readers and writers are handled separately by the `CopyRead` and `CopyWrite` specialization
12//! traits and then specialized on by the `Copier::copy` method.
13//!
14//! `Copier` uses the specialization traits to unpack the underlying file descriptors and
15//! additional prerequisites and constraints imposed by the wrapper types.
16//!
17//! Once it has obtained all necessary pieces and brought any wrapper types into a state where they
18//! can be safely bypassed it will attempt to use the `copy_file_range(2)`,
19//! `sendfile(2)` or `splice(2)` syscalls to move data directly between file descriptors.
20//! Since those syscalls have requirements that cannot be fully checked in advance and
21//! gathering additional information about file descriptors would require additional syscalls
22//! anyway it simply attempts to use them one after another (guided by inaccurate hints) to
2b03887a 23//! figure out which one works and falls back to the generic read-write copy loop if none of them
fc512014
XL
24//! does.
25//! Once a working syscall is found for a pair of file descriptors it will be called in a loop
26//! until the copy operation is completed.
27//!
28//! Advantages of using these syscalls:
29//!
30//! * fewer context switches since reads and writes are coalesced into a single syscall
31//! and more bytes are transferred per syscall. This translates to higher throughput
32//! and fewer CPU cycles, at least for sufficiently large transfers to amortize the initial probing.
33//! * `copy_file_range` creates reflink copies on CoW filesystems, thus moving less data and
34//! consuming less disk space
35//! * `sendfile` and `splice` can perform zero-copy IO under some circumstances while
36//! a naive copy loop would move every byte through the CPU.
37//!
38//! Drawbacks:
39//!
40//! * copy operations smaller than the default buffer size can under some circumstances, especially
41//! on older kernels, incur more syscalls than the naive approach would. As mentioned above
42//! the syscall selection is guided by hints to minimize this possibility but they are not perfect.
43//! * optimizations only apply to std types. If a user adds a custom wrapper type, e.g. to report
44//! progress, they can hit a performance cliff.
45//! * complexity
46
47use crate::cmp::min;
fc512014
XL
48use crate::fs::{File, Metadata};
49use crate::io::copy::generic_copy;
50use crate::io::{
51 BufRead, BufReader, BufWriter, Error, Read, Result, StderrLock, StdinLock, StdoutLock, Take,
52 Write,
53};
54use crate::mem::ManuallyDrop;
55use crate::net::TcpStream;
56use crate::os::unix::fs::FileTypeExt;
57use crate::os::unix::io::{AsRawFd, FromRawFd, RawFd};
58use crate::os::unix::net::UnixStream;
59use crate::process::{ChildStderr, ChildStdin, ChildStdout};
60use crate::ptr;
61use crate::sync::atomic::{AtomicBool, AtomicU8, Ordering};
62use crate::sys::cvt;
136023e0 63use crate::sys::weak::syscall;
9c376795
FG
64#[cfg(not(all(target_os = "linux", target_env = "gnu")))]
65use libc::sendfile as sendfile64;
66#[cfg(all(target_os = "linux", target_env = "gnu"))]
67use libc::sendfile64;
5869c6ff 68use libc::{EBADF, EINVAL, ENOSYS, EOPNOTSUPP, EOVERFLOW, EPERM, EXDEV};
fc512014
XL
69
70#[cfg(test)]
71mod tests;
72
73pub(crate) fn copy_spec<R: Read + ?Sized, W: Write + ?Sized>(
74 read: &mut R,
75 write: &mut W,
76) -> Result<u64> {
77 let copier = Copier { read, write };
78 SpecCopy::copy(copier)
79}
80
/// Describes what is known about the kind of file behind a `RawFd`: either a guess derived
/// from the Rust type the descriptor was taken from, or metadata that was actually queried.
///
/// Everything reported by the methods below is a hint only. Because of `AsRawFd` and
/// `FromRawFd` a descriptor can be wrapped in an arbitrary type, so the guess may be wrong.
enum FdMeta {
    /// The descriptor came from a type that can hold any `FileType`, and the metadata was
    /// queried because that is cheaper than probing all possible syscalls (reader side).
    Metadata(Metadata),
    /// The descriptor came from a socket type.
    Socket,
    /// The descriptor came from a pipe type.
    Pipe,
    /// No metadata is available, e.g. because the source type was `File` (which can represent
    /// any `FileType`) and querying it did not seem worthwhile (writer side).
    NoneObtained,
}

impl FdMeta {
    /// Whether the descriptor might refer to a FIFO.
    fn maybe_fifo(&self) -> bool {
        match self {
            FdMeta::Metadata(meta) => meta.file_type().is_fifo(),
            FdMeta::Socket => false,
            // without metadata a FIFO cannot be ruled out
            FdMeta::Pipe | FdMeta::NoneObtained => true,
        }
    }

    /// Whether the descriptor looks like a worthwhile `sendfile` source.
    fn potential_sendfile_source(&self) -> bool {
        if let FdMeta::Metadata(meta) = self {
            let kind = meta.file_type();
            // procfs erroneously reports a length of 0 for non-empty readable files.
            // A file that really is empty is detected by a plain `read`, which then also skips
            // the write syscall, so there is no benefit in attempting sendfile for length 0.
            (kind.is_file() && meta.len() > 0) || kind.is_block_device()
        } else {
            false
        }
    }

    /// Whether it is worth attempting `copy_file_range` with this descriptor.
    fn copy_file_range_candidate(&self) -> bool {
        match self {
            // copy_file_range fails on empty procfs files. `read` can tell that EOF has been
            // reached at no extra cost and skip the write, so nothing is gained by trying
            // copy_file_range on files that report a length of 0.
            FdMeta::Metadata(meta) => meta.is_file() && meta.len() > 0,
            FdMeta::NoneObtained => true,
            FdMeta::Socket | FdMeta::Pipe => false,
        }
    }
}
133
/// Value returned by the `properties()` methods: the file-type hint for a reader or writer
/// together with its raw file descriptor, if there is one.
struct CopyParams(FdMeta, Option<RawFd>);
135
/// Borrowed reader/writer pair on which the `SpecCopy` strategies are implemented.
struct Copier<'a, 'b, R: Read + ?Sized, W: Write + ?Sized> {
    /// Source side of the copy.
    read: &'a mut R,
    /// Destination side of the copy.
    write: &'b mut W,
}
140
/// Copy strategy that is specialized on the concrete reader and writer types.
trait SpecCopy {
    /// Consumes the pair, performs the copy and returns the number of bytes copied.
    fn copy(self) -> Result<u64>;
}
144
145impl<R: Read + ?Sized, W: Write + ?Sized> SpecCopy for Copier<'_, '_, R, W> {
146 default fn copy(self) -> Result<u64> {
147 generic_copy(self.read, self.write)
148 }
149}
150
151impl<R: CopyRead, W: CopyWrite> SpecCopy for Copier<'_, '_, R, W> {
152 fn copy(self) -> Result<u64> {
153 let (reader, writer) = (self.read, self.write);
154 let r_cfg = reader.properties();
155 let w_cfg = writer.properties();
156
157 // before direct operations on file descriptors ensure that all source and sink buffers are empty
158 let mut flush = || -> crate::io::Result<u64> {
159 let bytes = reader.drain_to(writer, u64::MAX)?;
160 // BufWriter buffered bytes have already been accounted for in earlier write() calls
161 writer.flush()?;
162 Ok(bytes)
163 };
164
165 let mut written = 0u64;
166
167 if let (CopyParams(input_meta, Some(readfd)), CopyParams(output_meta, Some(writefd))) =
168 (r_cfg, w_cfg)
169 {
170 written += flush()?;
171 let max_write = reader.min_limit();
172
173 if input_meta.copy_file_range_candidate() && output_meta.copy_file_range_candidate() {
174 let result = copy_regular_files(readfd, writefd, max_write);
175 result.update_take(reader);
176
177 match result {
178 CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written),
179 CopyResult::Error(e, _) => return Err(e),
180 CopyResult::Fallback(bytes) => written += bytes,
181 }
182 }
183
184 // on modern kernels sendfile can copy from any mmapable type (some but not all regular files and block devices)
185 // to any writable file descriptor. On older kernels the writer side can only be a socket.
186 // So we just try and fallback if needed.
187 // If current file offsets + write sizes overflow it may also fail, we do not try to fix that and instead
188 // fall back to the generic copy loop.
189 if input_meta.potential_sendfile_source() {
190 let result = sendfile_splice(SpliceMode::Sendfile, readfd, writefd, max_write);
191 result.update_take(reader);
192
193 match result {
194 CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written),
195 CopyResult::Error(e, _) => return Err(e),
196 CopyResult::Fallback(bytes) => written += bytes,
197 }
198 }
199
200 if input_meta.maybe_fifo() || output_meta.maybe_fifo() {
201 let result = sendfile_splice(SpliceMode::Splice, readfd, writefd, max_write);
202 result.update_take(reader);
203
204 match result {
205 CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written),
206 CopyResult::Error(e, _) => return Err(e),
207 CopyResult::Fallback(0) => { /* use the fallback below */ }
208 CopyResult::Fallback(_) => {
209 unreachable!("splice should not return > 0 bytes on the fallback path")
210 }
211 }
212 }
213 }
214
215 // fallback if none of the more specialized syscalls wants to work with these file descriptors
216 match generic_copy(reader, writer) {
217 Ok(bytes) => Ok(bytes + written),
218 err => err,
219 }
220 }
221}
222
/// Reader-side specialization trait: implemented for the std reader types (and wrappers around
/// them) whose file descriptor can be used directly by the copy syscalls.
#[rustc_specialization_trait]
trait CopyRead: Read {
    /// Implementations that contain buffers (i.e. `BufReader`) must transfer data from their internal
    /// buffers into `writer` until either the buffers are emptied or `limit` bytes have been
    /// transferred, whichever occurs sooner.
    /// If nested buffers are present the outer buffers must be drained first.
    ///
    /// This is necessary to directly bypass the wrapper types while preserving the data order
    /// when operating directly on the underlying file descriptors.
    ///
    /// Returns the number of bytes transferred; the default implementation has no buffer and
    /// reports 0.
    fn drain_to<W: Write>(&mut self, _writer: &mut W, _limit: u64) -> Result<u64> {
        Ok(0)
    }

    /// Updates `Take` wrappers to remove the number of bytes copied.
    /// The default implementation does nothing.
    fn taken(&mut self, _bytes: u64) {}

    /// The minimum of the limit of all `Take<_>` wrappers, `u64::MAX` otherwise.
    /// This method does not account for data held in `BufReader` buffers and would underreport
    /// the limit of a `Take<BufReader<Take<_>>>` type. Thus its result is only valid
    /// after draining the buffers via `drain_to`.
    fn min_limit(&self) -> u64 {
        u64::MAX
    }

    /// Extracts the file descriptor and hints/metadata, delegating through wrappers if necessary.
    fn properties(&self) -> CopyParams;
}
250
/// Writer-side specialization trait: implemented for the std writer types (and wrappers around
/// them) whose file descriptor can be used directly by the copy syscalls.
#[rustc_specialization_trait]
trait CopyWrite: Write {
    /// Extracts the file descriptor and hints/metadata, delegating through wrappers if necessary.
    fn properties(&self) -> CopyParams;
}
256
257impl<T> CopyRead for &mut T
258where
259 T: CopyRead,
260{
261 fn drain_to<W: Write>(&mut self, writer: &mut W, limit: u64) -> Result<u64> {
262 (**self).drain_to(writer, limit)
263 }
264
265 fn taken(&mut self, bytes: u64) {
266 (**self).taken(bytes);
267 }
268
269 fn min_limit(&self) -> u64 {
270 (**self).min_limit()
271 }
272
273 fn properties(&self) -> CopyParams {
274 (**self).properties()
275 }
276}
277
278impl<T> CopyWrite for &mut T
279where
280 T: CopyWrite,
281{
282 fn properties(&self) -> CopyParams {
283 (**self).properties()
284 }
285}
286
287impl CopyRead for File {
288 fn properties(&self) -> CopyParams {
289 CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
290 }
291}
292
293impl CopyRead for &File {
294 fn properties(&self) -> CopyParams {
295 CopyParams(fd_to_meta(*self), Some(self.as_raw_fd()))
296 }
297}
298
299impl CopyWrite for File {
300 fn properties(&self) -> CopyParams {
301 CopyParams(FdMeta::NoneObtained, Some(self.as_raw_fd()))
302 }
303}
304
305impl CopyWrite for &File {
306 fn properties(&self) -> CopyParams {
307 CopyParams(FdMeta::NoneObtained, Some(self.as_raw_fd()))
308 }
309}
310
311impl CopyRead for TcpStream {
312 fn properties(&self) -> CopyParams {
313 // avoid the stat syscall since we can be fairly sure it's a socket
314 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
315 }
316}
317
318impl CopyRead for &TcpStream {
319 fn properties(&self) -> CopyParams {
320 // avoid the stat syscall since we can be fairly sure it's a socket
321 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
322 }
323}
324
325impl CopyWrite for TcpStream {
326 fn properties(&self) -> CopyParams {
327 // avoid the stat syscall since we can be fairly sure it's a socket
328 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
329 }
330}
331
332impl CopyWrite for &TcpStream {
333 fn properties(&self) -> CopyParams {
334 // avoid the stat syscall since we can be fairly sure it's a socket
335 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
336 }
337}
338
339impl CopyRead for UnixStream {
340 fn properties(&self) -> CopyParams {
341 // avoid the stat syscall since we can be fairly sure it's a socket
342 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
343 }
344}
345
346impl CopyRead for &UnixStream {
347 fn properties(&self) -> CopyParams {
348 // avoid the stat syscall since we can be fairly sure it's a socket
349 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
350 }
351}
352
353impl CopyWrite for UnixStream {
354 fn properties(&self) -> CopyParams {
355 // avoid the stat syscall since we can be fairly sure it's a socket
356 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
357 }
358}
359
360impl CopyWrite for &UnixStream {
361 fn properties(&self) -> CopyParams {
362 // avoid the stat syscall since we can be fairly sure it's a socket
363 CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
364 }
365}
366
367impl CopyWrite for ChildStdin {
368 fn properties(&self) -> CopyParams {
369 CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
370 }
371}
372
373impl CopyRead for ChildStdout {
374 fn properties(&self) -> CopyParams {
375 CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
376 }
377}
378
379impl CopyRead for ChildStderr {
380 fn properties(&self) -> CopyParams {
381 CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
382 }
383}
384
385impl CopyRead for StdinLock<'_> {
386 fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> {
387 let buf_reader = self.as_mut_buf();
388 let buf = buf_reader.buffer();
389 let buf = &buf[0..min(buf.len(), outer_limit.try_into().unwrap_or(usize::MAX))];
390 let bytes_drained = buf.len();
391 writer.write_all(buf)?;
392 buf_reader.consume(bytes_drained);
393
394 Ok(bytes_drained as u64)
395 }
396
397 fn properties(&self) -> CopyParams {
398 CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
399 }
400}
401
402impl CopyWrite for StdoutLock<'_> {
403 fn properties(&self) -> CopyParams {
404 CopyParams(FdMeta::NoneObtained, Some(self.as_raw_fd()))
405 }
406}
407
408impl CopyWrite for StderrLock<'_> {
409 fn properties(&self) -> CopyParams {
410 CopyParams(FdMeta::NoneObtained, Some(self.as_raw_fd()))
411 }
412}
413
414impl<T: CopyRead> CopyRead for Take<T> {
415 fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> {
416 let local_limit = self.limit();
417 let combined_limit = min(outer_limit, local_limit);
418 let bytes_drained = self.get_mut().drain_to(writer, combined_limit)?;
419 // update limit since read() was bypassed
420 self.set_limit(local_limit - bytes_drained);
421
422 Ok(bytes_drained)
423 }
424
425 fn taken(&mut self, bytes: u64) {
426 self.set_limit(self.limit() - bytes);
427 self.get_mut().taken(bytes);
428 }
429
430 fn min_limit(&self) -> u64 {
431 min(Take::limit(self), self.get_ref().min_limit())
432 }
433
434 fn properties(&self) -> CopyParams {
435 self.get_ref().properties()
436 }
437}
438
439impl<T: CopyRead> CopyRead for BufReader<T> {
440 fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> {
441 let buf = self.buffer();
442 let buf = &buf[0..min(buf.len(), outer_limit.try_into().unwrap_or(usize::MAX))];
443 let bytes = buf.len();
444 writer.write_all(buf)?;
445 self.consume(bytes);
446
447 let remaining = outer_limit - bytes as u64;
448
449 // in case of nested bufreaders we also need to drain the ones closer to the source
450 let inner_bytes = self.get_mut().drain_to(writer, remaining)?;
451
452 Ok(bytes as u64 + inner_bytes)
453 }
454
455 fn taken(&mut self, bytes: u64) {
456 self.get_mut().taken(bytes);
457 }
458
459 fn min_limit(&self) -> u64 {
460 self.get_ref().min_limit()
461 }
462
463 fn properties(&self) -> CopyParams {
464 self.get_ref().properties()
465 }
466}
467
468impl<T: CopyWrite> CopyWrite for BufWriter<T> {
469 fn properties(&self) -> CopyParams {
470 self.get_ref().properties()
471 }
472}
473
474fn fd_to_meta<T: AsRawFd>(fd: &T) -> FdMeta {
475 let fd = fd.as_raw_fd();
476 let file: ManuallyDrop<File> = ManuallyDrop::new(unsafe { File::from_raw_fd(fd) });
477 match file.metadata() {
478 Ok(meta) => FdMeta::Metadata(meta),
479 Err(_) => FdMeta::NoneObtained,
480 }
481}
482
483pub(super) enum CopyResult {
484 Ended(u64),
485 Error(Error, u64),
486 Fallback(u64),
487}
488
489impl CopyResult {
490 fn update_take(&self, reader: &mut impl CopyRead) {
491 match *self {
492 CopyResult::Fallback(bytes)
493 | CopyResult::Ended(bytes)
494 | CopyResult::Error(_, bytes) => reader.taken(bytes),
495 }
496 }
497}
498
/// Invalid file descriptor.
///
/// Valid file descriptors are guaranteed to be non-negative numbers (see `open()` manpage)
/// while negative values are used to indicate errors.
/// Thus -1 will never overlap with a valid open file.
const INVALID_FD: RawFd = -1;
505
506/// Linux-specific implementation that will attempt to use copy_file_range for copy offloading.
507/// As the name says, it only works on regular files.
508///
509/// Callers must handle fallback to a generic copy loop.
510/// `Fallback` may indicate non-zero number of bytes already written
511/// if one of the files' cursor +`max_len` would exceed u64::MAX (`EOVERFLOW`).
512pub(super) fn copy_regular_files(reader: RawFd, writer: RawFd, max_len: u64) -> CopyResult {
513 use crate::cmp;
514
515 const NOT_PROBED: u8 = 0;
516 const UNAVAILABLE: u8 = 1;
517 const AVAILABLE: u8 = 2;
518
519 // Kernel prior to 4.5 don't have copy_file_range
520 // We store the availability in a global to avoid unnecessary syscalls
521 static HAS_COPY_FILE_RANGE: AtomicU8 = AtomicU8::new(NOT_PROBED);
522
523 syscall! {
524 fn copy_file_range(
525 fd_in: libc::c_int,
526 off_in: *mut libc::loff_t,
527 fd_out: libc::c_int,
528 off_out: *mut libc::loff_t,
529 len: libc::size_t,
530 flags: libc::c_uint
531 ) -> libc::ssize_t
532 }
533
534 match HAS_COPY_FILE_RANGE.load(Ordering::Relaxed) {
535 NOT_PROBED => {
536 // EPERM can indicate seccomp filters or an immutable file.
537 // To distinguish these cases we probe with invalid file descriptors which should result in EBADF if the syscall is supported
538 // and some other error (ENOSYS or EPERM) if it's not available
539 let result = unsafe {
540 cvt(copy_file_range(INVALID_FD, ptr::null_mut(), INVALID_FD, ptr::null_mut(), 1, 0))
541 };
542
5869c6ff 543 if matches!(result.map_err(|e| e.raw_os_error()), Err(Some(EBADF))) {
fc512014
XL
544 HAS_COPY_FILE_RANGE.store(AVAILABLE, Ordering::Relaxed);
545 } else {
546 HAS_COPY_FILE_RANGE.store(UNAVAILABLE, Ordering::Relaxed);
547 return CopyResult::Fallback(0);
548 }
549 }
550 UNAVAILABLE => return CopyResult::Fallback(0),
551 _ => {}
552 };
553
554 let mut written = 0u64;
555 while written < max_len {
556 let bytes_to_copy = cmp::min(max_len - written, usize::MAX as u64);
557 // cap to 1GB chunks in case u64::MAX is passed as max_len and the file has a non-zero seek position
558 // this allows us to copy large chunks without hitting EOVERFLOW,
559 // unless someone sets a file offset close to u64::MAX - 1GB, in which case a fallback would be required
560 let bytes_to_copy = cmp::min(bytes_to_copy as usize, 0x4000_0000usize);
561 let copy_result = unsafe {
562 // We actually don't have to adjust the offsets,
563 // because copy_file_range adjusts the file offset automatically
564 cvt(copy_file_range(reader, ptr::null_mut(), writer, ptr::null_mut(), bytes_to_copy, 0))
565 };
566
567 match copy_result {
568 Ok(0) if written == 0 => {
569 // fallback to work around several kernel bugs where copy_file_range will fail to
570 // copy any bytes and return 0 instead of an error if
571 // - reading virtual files from the proc filesystem which appear to have 0 size
572 // but are not empty. noted in coreutils to affect kernels at least up to 5.6.19.
573 // - copying from an overlay filesystem in docker. reported to occur on fedora 32.
574 return CopyResult::Fallback(0);
575 }
576 Ok(0) => return CopyResult::Ended(written), // reached EOF
577 Ok(ret) => written += ret as u64,
578 Err(err) => {
579 return match err.raw_os_error() {
580 // when file offset + max_length > u64::MAX
5869c6ff 581 Some(EOVERFLOW) => CopyResult::Fallback(written),
3c0e092e 582 Some(ENOSYS | EXDEV | EINVAL | EPERM | EOPNOTSUPP | EBADF) if written == 0 => {
fc512014
XL
583 // Try fallback io::copy if either:
584 // - Kernel version is < 4.5 (ENOSYS¹)
585 // - Files are mounted on different fs (EXDEV)
586 // - copy_file_range is broken in various ways on RHEL/CentOS 7 (EOPNOTSUPP)
587 // - copy_file_range file is immutable or syscall is blocked by seccomp¹ (EPERM)
588 // - copy_file_range cannot be used with pipes or device nodes (EINVAL)
5869c6ff 589 // - the writer fd was opened with O_APPEND (EBADF²)
9c376795 590 // and no bytes were written successfully yet. (All these errnos should
3c0e092e
XL
591 // not be returned if something was already written, but they happen in
592 // the wild, see #91152.)
fc512014
XL
593 //
594 // ¹ these cases should be detected by the initial probe but we handle them here
595 // anyway in case syscall interception changes during runtime
5869c6ff
XL
596 // ² actually invalid file descriptors would cause this too, but in that case
597 // the fallback code path is expected to encounter the same error again
fc512014
XL
598 CopyResult::Fallback(0)
599 }
600 _ => CopyResult::Error(err, written),
601 };
602 }
603 }
604 }
605 CopyResult::Ended(written)
606}
607
/// Selects which syscall `sendfile_splice` issues.
#[derive(PartialEq)]
enum SpliceMode {
    /// Copy with `sendfile`.
    Sendfile,
    /// Copy with `splice`.
    Splice,
}
613
614/// performs splice or sendfile between file descriptors
615/// Does _not_ fall back to a generic copy loop.
616fn sendfile_splice(mode: SpliceMode, reader: RawFd, writer: RawFd, len: u64) -> CopyResult {
617 static HAS_SENDFILE: AtomicBool = AtomicBool::new(true);
618 static HAS_SPLICE: AtomicBool = AtomicBool::new(true);
619
a2a8927a
XL
620 // Android builds use feature level 14, but the libc wrapper for splice is
621 // gated on feature level 21+, so we have to invoke the syscall directly.
622 #[cfg(target_os = "android")]
fc512014
XL
623 syscall! {
624 fn splice(
625 srcfd: libc::c_int,
626 src_offset: *const i64,
627 dstfd: libc::c_int,
628 dst_offset: *const i64,
629 len: libc::size_t,
630 flags: libc::c_int
631 ) -> libc::ssize_t
632 }
633
a2a8927a
XL
634 #[cfg(target_os = "linux")]
635 use libc::splice;
636
fc512014
XL
637 match mode {
638 SpliceMode::Sendfile if !HAS_SENDFILE.load(Ordering::Relaxed) => {
639 return CopyResult::Fallback(0);
640 }
641 SpliceMode::Splice if !HAS_SPLICE.load(Ordering::Relaxed) => {
642 return CopyResult::Fallback(0);
643 }
644 _ => (),
645 }
646
647 let mut written = 0u64;
648 while written < len {
649 // according to its manpage that's the maximum size sendfile() will copy per invocation
650 let chunk_size = crate::cmp::min(len - written, 0x7ffff000_u64) as usize;
651
652 let result = match mode {
653 SpliceMode::Sendfile => {
9c376795 654 cvt(unsafe { sendfile64(writer, reader, ptr::null_mut(), chunk_size) })
fc512014
XL
655 }
656 SpliceMode::Splice => cvt(unsafe {
657 splice(reader, ptr::null_mut(), writer, ptr::null_mut(), chunk_size, 0)
658 }),
659 };
660
661 match result {
662 Ok(0) => break, // EOF
663 Ok(ret) => written += ret as u64,
664 Err(err) => {
665 return match err.raw_os_error() {
5869c6ff 666 Some(ENOSYS | EPERM) => {
fc512014
XL
667 // syscall not supported (ENOSYS)
668 // syscall is disallowed, e.g. by seccomp (EPERM)
669 match mode {
670 SpliceMode::Sendfile => HAS_SENDFILE.store(false, Ordering::Relaxed),
671 SpliceMode::Splice => HAS_SPLICE.store(false, Ordering::Relaxed),
672 }
673 assert_eq!(written, 0);
674 CopyResult::Fallback(0)
675 }
5869c6ff 676 Some(EINVAL) => {
fc512014
XL
677 // splice/sendfile do not support this particular file descriptor (EINVAL)
678 assert_eq!(written, 0);
679 CopyResult::Fallback(0)
680 }
5869c6ff 681 Some(os_err) if mode == SpliceMode::Sendfile && os_err == EOVERFLOW => {
fc512014
XL
682 CopyResult::Fallback(written)
683 }
684 _ => CopyResult::Error(err, written),
685 };
686 }
687 }
688 }
689 CopyResult::Ended(written)
690}