]>
Commit | Line | Data |
---|---|---|
fc512014 XL |
1 | //! This module contains specializations that can offload `io::copy()` operations on file descriptor |
2 | //! containing types (`File`, `TcpStream`, etc.) to more efficient syscalls than `read(2)` and `write(2)`. | |
3 | //! | |
4 | //! Specialization is only applied to wholly std-owned types so that user code can't observe | |
5 | //! that the `Read` and `Write` traits are not used. | |
6 | //! | |
7 | //! Since a copy operation involves a reader and writer side where each can consist of different types | |
8 | //! and also involve generic wrappers (e.g. `Take`, `BufReader`) it is not practical to specialize | |
9 | //! a single method on all possible combinations. | |
10 | //! | |
11 | //! Instead readers and writers are handled separately by the `CopyRead` and `CopyWrite` specialization | |
12 | //! traits and then specialized on by the `Copier::copy` method. | |
13 | //! | |
14 | //! `Copier` uses the specialization traits to unpack the underlying file descriptors and | |
15 | //! additional prerequisites and constraints imposed by the wrapper types. | |
16 | //! | |
17 | //! Once it has obtained all necessary pieces and brought any wrapper types into a state where they | |
18 | //! can be safely bypassed it will attempt to use the `copy_file_range(2)`, | |
19 | //! `sendfile(2)` or `splice(2)` syscalls to move data directly between file descriptors. | |
20 | //! Since those syscalls have requirements that cannot be fully checked in advance and | |
21 | //! gathering additional information about file descriptors would require additional syscalls | |
22 | //! anyway it simply attempts to use them one after another (guided by inaccurate hints) to | |
2b03887a | 23 | //! figure out which one works and falls back to the generic read-write copy loop if none of them |
fc512014 XL |
24 | //! does. |
25 | //! Once a working syscall is found for a pair of file descriptors it will be called in a loop | |
26 | //! until the copy operation is completed. | |
27 | //! | |
28 | //! Advantages of using these syscalls: | |
29 | //! | |
30 | //! * fewer context switches since reads and writes are coalesced into a single syscall | |
31 | //! and more bytes are transferred per syscall. This translates to higher throughput | |
32 | //! and fewer CPU cycles, at least for sufficiently large transfers to amortize the initial probing. | |
33 | //! * `copy_file_range` creates reflink copies on CoW filesystems, thus moving less data and | |
34 | //! consuming less disk space | |
35 | //! * `sendfile` and `splice` can perform zero-copy IO under some circumstances while | |
36 | //! a naive copy loop would move every byte through the CPU. | |
37 | //! | |
38 | //! Drawbacks: | |
39 | //! | |
40 | //! * copy operations smaller than the default buffer size can under some circumstances, especially | |
41 | //! on older kernels, incur more syscalls than the naive approach would. As mentioned above | |
42 | //! the syscall selection is guided by hints to minimize this possibility but they are not perfect. | |
43 | //! * optimizations only apply to std types. If a user adds a custom wrapper type, e.g. to report | |
44 | //! progress, they can hit a performance cliff. | |
45 | //! * complexity | |
46 | ||
47 | use crate::cmp::min; | |
fc512014 XL |
48 | use crate::fs::{File, Metadata}; |
49 | use crate::io::copy::generic_copy; | |
50 | use crate::io::{ | |
51 | BufRead, BufReader, BufWriter, Error, Read, Result, StderrLock, StdinLock, StdoutLock, Take, | |
52 | Write, | |
53 | }; | |
54 | use crate::mem::ManuallyDrop; | |
55 | use crate::net::TcpStream; | |
56 | use crate::os::unix::fs::FileTypeExt; | |
57 | use crate::os::unix::io::{AsRawFd, FromRawFd, RawFd}; | |
58 | use crate::os::unix::net::UnixStream; | |
59 | use crate::process::{ChildStderr, ChildStdin, ChildStdout}; | |
60 | use crate::ptr; | |
61 | use crate::sync::atomic::{AtomicBool, AtomicU8, Ordering}; | |
62 | use crate::sys::cvt; | |
136023e0 | 63 | use crate::sys::weak::syscall; |
9c376795 FG |
64 | #[cfg(not(all(target_os = "linux", target_env = "gnu")))] |
65 | use libc::sendfile as sendfile64; | |
66 | #[cfg(all(target_os = "linux", target_env = "gnu"))] | |
67 | use libc::sendfile64; | |
5869c6ff | 68 | use libc::{EBADF, EINVAL, ENOSYS, EOPNOTSUPP, EOVERFLOW, EPERM, EXDEV}; |
fc512014 XL |
69 | |
70 | #[cfg(test)] | |
71 | mod tests; | |
72 | ||
73 | pub(crate) fn copy_spec<R: Read + ?Sized, W: Write + ?Sized>( | |
74 | read: &mut R, | |
75 | write: &mut W, | |
76 | ) -> Result<u64> { | |
77 | let copier = Copier { read, write }; | |
78 | SpecCopy::copy(copier) | |
79 | } | |
80 | ||
/// Describes what a `RawFd` probably refers to: either the `FileType` inferred from the
/// source type the descriptor was extracted from, or the actual metadata.
///
/// Everything derived from this type is a hint only; because of `AsRawFd` and `FromRawFd`
/// the inferred type may be wrong.
enum FdMeta {
    /// The FD came from a type that can hold any `FileType`, so the metadata was queried
    /// because that is cheaper than probing all possible syscalls (reader side).
    Metadata(Metadata),
    Socket,
    Pipe,
    /// No metadata is available, e.g. because the original type was `File`, which can
    /// represent any `FileType`, and querying did not seem beneficial (writer side).
    NoneObtained,
}

impl FdMeta {
    /// Whether the descriptor might be a FIFO. Unknown descriptors count as "maybe".
    fn maybe_fifo(&self) -> bool {
        match self {
            FdMeta::Socket => false,
            FdMeta::Pipe | FdMeta::NoneObtained => true,
            FdMeta::Metadata(meta) => meta.file_type().is_fifo(),
        }
    }

    /// Whether it is worth attempting `sendfile` with this descriptor as the source.
    fn potential_sendfile_source(&self) -> bool {
        if let FdMeta::Metadata(meta) = self {
            let kind = meta.file_type();
            // procfs erroneously shows 0 length on non-empty readable files, and if a file
            // is truly empty a `read` syscall will determine that and skip the write
            // syscall, so there would be no benefit from attempting sendfile on
            // zero-length regular files.
            (kind.is_file() && meta.len() > 0) || kind.is_block_device()
        } else {
            false
        }
    }

    /// Whether it is worth attempting `copy_file_range` with this descriptor.
    fn copy_file_range_candidate(&self) -> bool {
        match self {
            FdMeta::NoneObtained => true,
            // copy_file_range will fail on empty procfs files. `read` can determine whether
            // EOF has been reached without extra cost and skip the write, thus there is no
            // benefit in attempting copy_file_range on zero-length files.
            FdMeta::Metadata(meta) => meta.is_file() && meta.len() > 0,
            FdMeta::Socket | FdMeta::Pipe => false,
        }
    }
}
133 | ||
134 | struct CopyParams(FdMeta, Option<RawFd>); | |
135 | ||
136 | struct Copier<'a, 'b, R: Read + ?Sized, W: Write + ?Sized> { | |
137 | read: &'a mut R, | |
138 | write: &'b mut W, | |
139 | } | |
140 | ||
141 | trait SpecCopy { | |
142 | fn copy(self) -> Result<u64>; | |
143 | } | |
144 | ||
145 | impl<R: Read + ?Sized, W: Write + ?Sized> SpecCopy for Copier<'_, '_, R, W> { | |
146 | default fn copy(self) -> Result<u64> { | |
147 | generic_copy(self.read, self.write) | |
148 | } | |
149 | } | |
150 | ||
151 | impl<R: CopyRead, W: CopyWrite> SpecCopy for Copier<'_, '_, R, W> { | |
152 | fn copy(self) -> Result<u64> { | |
153 | let (reader, writer) = (self.read, self.write); | |
154 | let r_cfg = reader.properties(); | |
155 | let w_cfg = writer.properties(); | |
156 | ||
157 | // before direct operations on file descriptors ensure that all source and sink buffers are empty | |
158 | let mut flush = || -> crate::io::Result<u64> { | |
159 | let bytes = reader.drain_to(writer, u64::MAX)?; | |
160 | // BufWriter buffered bytes have already been accounted for in earlier write() calls | |
161 | writer.flush()?; | |
162 | Ok(bytes) | |
163 | }; | |
164 | ||
165 | let mut written = 0u64; | |
166 | ||
167 | if let (CopyParams(input_meta, Some(readfd)), CopyParams(output_meta, Some(writefd))) = | |
168 | (r_cfg, w_cfg) | |
169 | { | |
170 | written += flush()?; | |
171 | let max_write = reader.min_limit(); | |
172 | ||
173 | if input_meta.copy_file_range_candidate() && output_meta.copy_file_range_candidate() { | |
174 | let result = copy_regular_files(readfd, writefd, max_write); | |
175 | result.update_take(reader); | |
176 | ||
177 | match result { | |
178 | CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written), | |
179 | CopyResult::Error(e, _) => return Err(e), | |
180 | CopyResult::Fallback(bytes) => written += bytes, | |
181 | } | |
182 | } | |
183 | ||
184 | // on modern kernels sendfile can copy from any mmapable type (some but not all regular files and block devices) | |
185 | // to any writable file descriptor. On older kernels the writer side can only be a socket. | |
186 | // So we just try and fallback if needed. | |
187 | // If current file offsets + write sizes overflow it may also fail, we do not try to fix that and instead | |
188 | // fall back to the generic copy loop. | |
189 | if input_meta.potential_sendfile_source() { | |
190 | let result = sendfile_splice(SpliceMode::Sendfile, readfd, writefd, max_write); | |
191 | result.update_take(reader); | |
192 | ||
193 | match result { | |
194 | CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written), | |
195 | CopyResult::Error(e, _) => return Err(e), | |
196 | CopyResult::Fallback(bytes) => written += bytes, | |
197 | } | |
198 | } | |
199 | ||
200 | if input_meta.maybe_fifo() || output_meta.maybe_fifo() { | |
201 | let result = sendfile_splice(SpliceMode::Splice, readfd, writefd, max_write); | |
202 | result.update_take(reader); | |
203 | ||
204 | match result { | |
205 | CopyResult::Ended(bytes_copied) => return Ok(bytes_copied + written), | |
206 | CopyResult::Error(e, _) => return Err(e), | |
207 | CopyResult::Fallback(0) => { /* use the fallback below */ } | |
208 | CopyResult::Fallback(_) => { | |
209 | unreachable!("splice should not return > 0 bytes on the fallback path") | |
210 | } | |
211 | } | |
212 | } | |
213 | } | |
214 | ||
215 | // fallback if none of the more specialized syscalls wants to work with these file descriptors | |
216 | match generic_copy(reader, writer) { | |
217 | Ok(bytes) => Ok(bytes + written), | |
218 | err => err, | |
219 | } | |
220 | } | |
221 | } | |
222 | ||
223 | #[rustc_specialization_trait] | |
224 | trait CopyRead: Read { | |
225 | /// Implementations that contain buffers (i.e. `BufReader`) must transfer data from their internal | |
226 | /// buffers into `writer` until either the buffers are emptied or `limit` bytes have been | |
227 | /// transferred, whichever occurs sooner. | |
228 | /// If nested buffers are present the outer buffers must be drained first. | |
229 | /// | |
230 | /// This is necessary to directly bypass the wrapper types while preserving the data order | |
231 | /// when operating directly on the underlying file descriptors. | |
232 | fn drain_to<W: Write>(&mut self, _writer: &mut W, _limit: u64) -> Result<u64> { | |
233 | Ok(0) | |
234 | } | |
235 | ||
236 | /// Updates `Take` wrappers to remove the number of bytes copied. | |
237 | fn taken(&mut self, _bytes: u64) {} | |
238 | ||
239 | /// The minimum of the limit of all `Take<_>` wrappers, `u64::MAX` otherwise. | |
240 | /// This method does not account for data `BufReader` buffers and would underreport | |
241 | /// the limit of a `Take<BufReader<Take<_>>>` type. Thus its result is only valid | |
242 | /// after draining the buffers via `drain_to`. | |
243 | fn min_limit(&self) -> u64 { | |
244 | u64::MAX | |
245 | } | |
246 | ||
247 | /// Extracts the file descriptor and hints/metadata, delegating through wrappers if necessary. | |
248 | fn properties(&self) -> CopyParams; | |
249 | } | |
250 | ||
251 | #[rustc_specialization_trait] | |
252 | trait CopyWrite: Write { | |
253 | /// Extracts the file descriptor and hints/metadata, delegating through wrappers if necessary. | |
254 | fn properties(&self) -> CopyParams; | |
255 | } | |
256 | ||
257 | impl<T> CopyRead for &mut T | |
258 | where | |
259 | T: CopyRead, | |
260 | { | |
261 | fn drain_to<W: Write>(&mut self, writer: &mut W, limit: u64) -> Result<u64> { | |
262 | (**self).drain_to(writer, limit) | |
263 | } | |
264 | ||
265 | fn taken(&mut self, bytes: u64) { | |
266 | (**self).taken(bytes); | |
267 | } | |
268 | ||
269 | fn min_limit(&self) -> u64 { | |
270 | (**self).min_limit() | |
271 | } | |
272 | ||
273 | fn properties(&self) -> CopyParams { | |
274 | (**self).properties() | |
275 | } | |
276 | } | |
277 | ||
278 | impl<T> CopyWrite for &mut T | |
279 | where | |
280 | T: CopyWrite, | |
281 | { | |
282 | fn properties(&self) -> CopyParams { | |
283 | (**self).properties() | |
284 | } | |
285 | } | |
286 | ||
287 | impl CopyRead for File { | |
288 | fn properties(&self) -> CopyParams { | |
289 | CopyParams(fd_to_meta(self), Some(self.as_raw_fd())) | |
290 | } | |
291 | } | |
292 | ||
293 | impl CopyRead for &File { | |
294 | fn properties(&self) -> CopyParams { | |
295 | CopyParams(fd_to_meta(*self), Some(self.as_raw_fd())) | |
296 | } | |
297 | } | |
298 | ||
299 | impl CopyWrite for File { | |
300 | fn properties(&self) -> CopyParams { | |
301 | CopyParams(FdMeta::NoneObtained, Some(self.as_raw_fd())) | |
302 | } | |
303 | } | |
304 | ||
305 | impl CopyWrite for &File { | |
306 | fn properties(&self) -> CopyParams { | |
307 | CopyParams(FdMeta::NoneObtained, Some(self.as_raw_fd())) | |
308 | } | |
309 | } | |
310 | ||
311 | impl CopyRead for TcpStream { | |
312 | fn properties(&self) -> CopyParams { | |
313 | // avoid the stat syscall since we can be fairly sure it's a socket | |
314 | CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) | |
315 | } | |
316 | } | |
317 | ||
318 | impl CopyRead for &TcpStream { | |
319 | fn properties(&self) -> CopyParams { | |
320 | // avoid the stat syscall since we can be fairly sure it's a socket | |
321 | CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) | |
322 | } | |
323 | } | |
324 | ||
325 | impl CopyWrite for TcpStream { | |
326 | fn properties(&self) -> CopyParams { | |
327 | // avoid the stat syscall since we can be fairly sure it's a socket | |
328 | CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) | |
329 | } | |
330 | } | |
331 | ||
332 | impl CopyWrite for &TcpStream { | |
333 | fn properties(&self) -> CopyParams { | |
334 | // avoid the stat syscall since we can be fairly sure it's a socket | |
335 | CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) | |
336 | } | |
337 | } | |
338 | ||
339 | impl CopyRead for UnixStream { | |
340 | fn properties(&self) -> CopyParams { | |
341 | // avoid the stat syscall since we can be fairly sure it's a socket | |
342 | CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) | |
343 | } | |
344 | } | |
345 | ||
346 | impl CopyRead for &UnixStream { | |
347 | fn properties(&self) -> CopyParams { | |
348 | // avoid the stat syscall since we can be fairly sure it's a socket | |
349 | CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) | |
350 | } | |
351 | } | |
352 | ||
353 | impl CopyWrite for UnixStream { | |
354 | fn properties(&self) -> CopyParams { | |
355 | // avoid the stat syscall since we can be fairly sure it's a socket | |
356 | CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) | |
357 | } | |
358 | } | |
359 | ||
360 | impl CopyWrite for &UnixStream { | |
361 | fn properties(&self) -> CopyParams { | |
362 | // avoid the stat syscall since we can be fairly sure it's a socket | |
363 | CopyParams(FdMeta::Socket, Some(self.as_raw_fd())) | |
364 | } | |
365 | } | |
366 | ||
367 | impl CopyWrite for ChildStdin { | |
368 | fn properties(&self) -> CopyParams { | |
369 | CopyParams(FdMeta::Pipe, Some(self.as_raw_fd())) | |
370 | } | |
371 | } | |
372 | ||
373 | impl CopyRead for ChildStdout { | |
374 | fn properties(&self) -> CopyParams { | |
375 | CopyParams(FdMeta::Pipe, Some(self.as_raw_fd())) | |
376 | } | |
377 | } | |
378 | ||
379 | impl CopyRead for ChildStderr { | |
380 | fn properties(&self) -> CopyParams { | |
381 | CopyParams(FdMeta::Pipe, Some(self.as_raw_fd())) | |
382 | } | |
383 | } | |
384 | ||
385 | impl CopyRead for StdinLock<'_> { | |
386 | fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> { | |
387 | let buf_reader = self.as_mut_buf(); | |
388 | let buf = buf_reader.buffer(); | |
389 | let buf = &buf[0..min(buf.len(), outer_limit.try_into().unwrap_or(usize::MAX))]; | |
390 | let bytes_drained = buf.len(); | |
391 | writer.write_all(buf)?; | |
392 | buf_reader.consume(bytes_drained); | |
393 | ||
394 | Ok(bytes_drained as u64) | |
395 | } | |
396 | ||
397 | fn properties(&self) -> CopyParams { | |
398 | CopyParams(fd_to_meta(self), Some(self.as_raw_fd())) | |
399 | } | |
400 | } | |
401 | ||
402 | impl CopyWrite for StdoutLock<'_> { | |
403 | fn properties(&self) -> CopyParams { | |
404 | CopyParams(FdMeta::NoneObtained, Some(self.as_raw_fd())) | |
405 | } | |
406 | } | |
407 | ||
408 | impl CopyWrite for StderrLock<'_> { | |
409 | fn properties(&self) -> CopyParams { | |
410 | CopyParams(FdMeta::NoneObtained, Some(self.as_raw_fd())) | |
411 | } | |
412 | } | |
413 | ||
414 | impl<T: CopyRead> CopyRead for Take<T> { | |
415 | fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> { | |
416 | let local_limit = self.limit(); | |
417 | let combined_limit = min(outer_limit, local_limit); | |
418 | let bytes_drained = self.get_mut().drain_to(writer, combined_limit)?; | |
419 | // update limit since read() was bypassed | |
420 | self.set_limit(local_limit - bytes_drained); | |
421 | ||
422 | Ok(bytes_drained) | |
423 | } | |
424 | ||
425 | fn taken(&mut self, bytes: u64) { | |
426 | self.set_limit(self.limit() - bytes); | |
427 | self.get_mut().taken(bytes); | |
428 | } | |
429 | ||
430 | fn min_limit(&self) -> u64 { | |
431 | min(Take::limit(self), self.get_ref().min_limit()) | |
432 | } | |
433 | ||
434 | fn properties(&self) -> CopyParams { | |
435 | self.get_ref().properties() | |
436 | } | |
437 | } | |
438 | ||
439 | impl<T: CopyRead> CopyRead for BufReader<T> { | |
440 | fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> { | |
441 | let buf = self.buffer(); | |
442 | let buf = &buf[0..min(buf.len(), outer_limit.try_into().unwrap_or(usize::MAX))]; | |
443 | let bytes = buf.len(); | |
444 | writer.write_all(buf)?; | |
445 | self.consume(bytes); | |
446 | ||
447 | let remaining = outer_limit - bytes as u64; | |
448 | ||
449 | // in case of nested bufreaders we also need to drain the ones closer to the source | |
450 | let inner_bytes = self.get_mut().drain_to(writer, remaining)?; | |
451 | ||
452 | Ok(bytes as u64 + inner_bytes) | |
453 | } | |
454 | ||
455 | fn taken(&mut self, bytes: u64) { | |
456 | self.get_mut().taken(bytes); | |
457 | } | |
458 | ||
459 | fn min_limit(&self) -> u64 { | |
460 | self.get_ref().min_limit() | |
461 | } | |
462 | ||
463 | fn properties(&self) -> CopyParams { | |
464 | self.get_ref().properties() | |
465 | } | |
466 | } | |
467 | ||
468 | impl<T: CopyWrite> CopyWrite for BufWriter<T> { | |
469 | fn properties(&self) -> CopyParams { | |
470 | self.get_ref().properties() | |
471 | } | |
472 | } | |
473 | ||
474 | fn fd_to_meta<T: AsRawFd>(fd: &T) -> FdMeta { | |
475 | let fd = fd.as_raw_fd(); | |
476 | let file: ManuallyDrop<File> = ManuallyDrop::new(unsafe { File::from_raw_fd(fd) }); | |
477 | match file.metadata() { | |
478 | Ok(meta) => FdMeta::Metadata(meta), | |
479 | Err(_) => FdMeta::NoneObtained, | |
480 | } | |
481 | } | |
482 | ||
483 | pub(super) enum CopyResult { | |
484 | Ended(u64), | |
485 | Error(Error, u64), | |
486 | Fallback(u64), | |
487 | } | |
488 | ||
489 | impl CopyResult { | |
490 | fn update_take(&self, reader: &mut impl CopyRead) { | |
491 | match *self { | |
492 | CopyResult::Fallback(bytes) | |
493 | | CopyResult::Ended(bytes) | |
494 | | CopyResult::Error(_, bytes) => reader.taken(bytes), | |
495 | } | |
496 | } | |
497 | } | |
498 | ||
499 | /// Invalid file descriptor. | |
500 | /// | |
501 | /// Valid file descriptors are guaranteed to be non-negative numbers (see `open()` manpage) | |
502 | /// while negative values are used to indicate errors. | |
503 | /// Thus -1 will never overlap with a valid open file. | |
504 | const INVALID_FD: RawFd = -1; | |
505 | ||
506 | /// Linux-specific implementation that will attempt to use copy_file_range for copy offloading. | |
507 | /// As the name says, it only works on regular files. | |
508 | /// | |
509 | /// Callers must handle fallback to a generic copy loop. | |
510 | /// `Fallback` may indicate non-zero number of bytes already written | |
511 | /// if one of the files' cursor +`max_len` would exceed u64::MAX (`EOVERFLOW`). | |
512 | pub(super) fn copy_regular_files(reader: RawFd, writer: RawFd, max_len: u64) -> CopyResult { | |
513 | use crate::cmp; | |
514 | ||
515 | const NOT_PROBED: u8 = 0; | |
516 | const UNAVAILABLE: u8 = 1; | |
517 | const AVAILABLE: u8 = 2; | |
518 | ||
519 | // Kernel prior to 4.5 don't have copy_file_range | |
520 | // We store the availability in a global to avoid unnecessary syscalls | |
521 | static HAS_COPY_FILE_RANGE: AtomicU8 = AtomicU8::new(NOT_PROBED); | |
522 | ||
523 | syscall! { | |
524 | fn copy_file_range( | |
525 | fd_in: libc::c_int, | |
526 | off_in: *mut libc::loff_t, | |
527 | fd_out: libc::c_int, | |
528 | off_out: *mut libc::loff_t, | |
529 | len: libc::size_t, | |
530 | flags: libc::c_uint | |
531 | ) -> libc::ssize_t | |
532 | } | |
533 | ||
534 | match HAS_COPY_FILE_RANGE.load(Ordering::Relaxed) { | |
535 | NOT_PROBED => { | |
536 | // EPERM can indicate seccomp filters or an immutable file. | |
537 | // To distinguish these cases we probe with invalid file descriptors which should result in EBADF if the syscall is supported | |
538 | // and some other error (ENOSYS or EPERM) if it's not available | |
539 | let result = unsafe { | |
540 | cvt(copy_file_range(INVALID_FD, ptr::null_mut(), INVALID_FD, ptr::null_mut(), 1, 0)) | |
541 | }; | |
542 | ||
5869c6ff | 543 | if matches!(result.map_err(|e| e.raw_os_error()), Err(Some(EBADF))) { |
fc512014 XL |
544 | HAS_COPY_FILE_RANGE.store(AVAILABLE, Ordering::Relaxed); |
545 | } else { | |
546 | HAS_COPY_FILE_RANGE.store(UNAVAILABLE, Ordering::Relaxed); | |
547 | return CopyResult::Fallback(0); | |
548 | } | |
549 | } | |
550 | UNAVAILABLE => return CopyResult::Fallback(0), | |
551 | _ => {} | |
552 | }; | |
553 | ||
554 | let mut written = 0u64; | |
555 | while written < max_len { | |
556 | let bytes_to_copy = cmp::min(max_len - written, usize::MAX as u64); | |
557 | // cap to 1GB chunks in case u64::MAX is passed as max_len and the file has a non-zero seek position | |
558 | // this allows us to copy large chunks without hitting EOVERFLOW, | |
559 | // unless someone sets a file offset close to u64::MAX - 1GB, in which case a fallback would be required | |
560 | let bytes_to_copy = cmp::min(bytes_to_copy as usize, 0x4000_0000usize); | |
561 | let copy_result = unsafe { | |
562 | // We actually don't have to adjust the offsets, | |
563 | // because copy_file_range adjusts the file offset automatically | |
564 | cvt(copy_file_range(reader, ptr::null_mut(), writer, ptr::null_mut(), bytes_to_copy, 0)) | |
565 | }; | |
566 | ||
567 | match copy_result { | |
568 | Ok(0) if written == 0 => { | |
569 | // fallback to work around several kernel bugs where copy_file_range will fail to | |
570 | // copy any bytes and return 0 instead of an error if | |
571 | // - reading virtual files from the proc filesystem which appear to have 0 size | |
572 | // but are not empty. noted in coreutils to affect kernels at least up to 5.6.19. | |
573 | // - copying from an overlay filesystem in docker. reported to occur on fedora 32. | |
574 | return CopyResult::Fallback(0); | |
575 | } | |
576 | Ok(0) => return CopyResult::Ended(written), // reached EOF | |
577 | Ok(ret) => written += ret as u64, | |
578 | Err(err) => { | |
579 | return match err.raw_os_error() { | |
580 | // when file offset + max_length > u64::MAX | |
5869c6ff | 581 | Some(EOVERFLOW) => CopyResult::Fallback(written), |
3c0e092e | 582 | Some(ENOSYS | EXDEV | EINVAL | EPERM | EOPNOTSUPP | EBADF) if written == 0 => { |
fc512014 XL |
583 | // Try fallback io::copy if either: |
584 | // - Kernel version is < 4.5 (ENOSYS¹) | |
585 | // - Files are mounted on different fs (EXDEV) | |
586 | // - copy_file_range is broken in various ways on RHEL/CentOS 7 (EOPNOTSUPP) | |
587 | // - copy_file_range file is immutable or syscall is blocked by seccomp¹ (EPERM) | |
588 | // - copy_file_range cannot be used with pipes or device nodes (EINVAL) | |
5869c6ff | 589 | // - the writer fd was opened with O_APPEND (EBADF²) |
9c376795 | 590 | // and no bytes were written successfully yet. (All these errnos should |
3c0e092e XL |
591 | // not be returned if something was already written, but they happen in |
592 | // the wild, see #91152.) | |
fc512014 XL |
593 | // |
594 | // ¹ these cases should be detected by the initial probe but we handle them here | |
595 | // anyway in case syscall interception changes during runtime | |
5869c6ff XL |
596 | // ² actually invalid file descriptors would cause this too, but in that case |
597 | // the fallback code path is expected to encounter the same error again | |
fc512014 XL |
598 | CopyResult::Fallback(0) |
599 | } | |
600 | _ => CopyResult::Error(err, written), | |
601 | }; | |
602 | } | |
603 | } | |
604 | } | |
605 | CopyResult::Ended(written) | |
606 | } | |
607 | ||
608 | #[derive(PartialEq)] | |
609 | enum SpliceMode { | |
610 | Sendfile, | |
611 | Splice, | |
612 | } | |
613 | ||
614 | /// performs splice or sendfile between file descriptors | |
615 | /// Does _not_ fall back to a generic copy loop. | |
616 | fn sendfile_splice(mode: SpliceMode, reader: RawFd, writer: RawFd, len: u64) -> CopyResult { | |
617 | static HAS_SENDFILE: AtomicBool = AtomicBool::new(true); | |
618 | static HAS_SPLICE: AtomicBool = AtomicBool::new(true); | |
619 | ||
a2a8927a XL |
620 | // Android builds use feature level 14, but the libc wrapper for splice is |
621 | // gated on feature level 21+, so we have to invoke the syscall directly. | |
622 | #[cfg(target_os = "android")] | |
fc512014 XL |
623 | syscall! { |
624 | fn splice( | |
625 | srcfd: libc::c_int, | |
626 | src_offset: *const i64, | |
627 | dstfd: libc::c_int, | |
628 | dst_offset: *const i64, | |
629 | len: libc::size_t, | |
630 | flags: libc::c_int | |
631 | ) -> libc::ssize_t | |
632 | } | |
633 | ||
a2a8927a XL |
634 | #[cfg(target_os = "linux")] |
635 | use libc::splice; | |
636 | ||
fc512014 XL |
637 | match mode { |
638 | SpliceMode::Sendfile if !HAS_SENDFILE.load(Ordering::Relaxed) => { | |
639 | return CopyResult::Fallback(0); | |
640 | } | |
641 | SpliceMode::Splice if !HAS_SPLICE.load(Ordering::Relaxed) => { | |
642 | return CopyResult::Fallback(0); | |
643 | } | |
644 | _ => (), | |
645 | } | |
646 | ||
647 | let mut written = 0u64; | |
648 | while written < len { | |
649 | // according to its manpage that's the maximum size sendfile() will copy per invocation | |
650 | let chunk_size = crate::cmp::min(len - written, 0x7ffff000_u64) as usize; | |
651 | ||
652 | let result = match mode { | |
653 | SpliceMode::Sendfile => { | |
9c376795 | 654 | cvt(unsafe { sendfile64(writer, reader, ptr::null_mut(), chunk_size) }) |
fc512014 XL |
655 | } |
656 | SpliceMode::Splice => cvt(unsafe { | |
657 | splice(reader, ptr::null_mut(), writer, ptr::null_mut(), chunk_size, 0) | |
658 | }), | |
659 | }; | |
660 | ||
661 | match result { | |
662 | Ok(0) => break, // EOF | |
663 | Ok(ret) => written += ret as u64, | |
664 | Err(err) => { | |
665 | return match err.raw_os_error() { | |
5869c6ff | 666 | Some(ENOSYS | EPERM) => { |
fc512014 XL |
667 | // syscall not supported (ENOSYS) |
668 | // syscall is disallowed, e.g. by seccomp (EPERM) | |
669 | match mode { | |
670 | SpliceMode::Sendfile => HAS_SENDFILE.store(false, Ordering::Relaxed), | |
671 | SpliceMode::Splice => HAS_SPLICE.store(false, Ordering::Relaxed), | |
672 | } | |
673 | assert_eq!(written, 0); | |
674 | CopyResult::Fallback(0) | |
675 | } | |
5869c6ff | 676 | Some(EINVAL) => { |
fc512014 XL |
677 | // splice/sendfile do not support this particular file descriptor (EINVAL) |
678 | assert_eq!(written, 0); | |
679 | CopyResult::Fallback(0) | |
680 | } | |
5869c6ff | 681 | Some(os_err) if mode == SpliceMode::Sendfile && os_err == EOVERFLOW => { |
fc512014 XL |
682 | CopyResult::Fallback(written) |
683 | } | |
684 | _ => CopyResult::Error(err, written), | |
685 | }; | |
686 | } | |
687 | } | |
688 | } | |
689 | CopyResult::Ended(written) | |
690 | } |