]>
Commit | Line | Data |
---|---|---|
fe692bf9 FG |
1 | //! Access to a Git index based registry. See [`RemoteRegistry`] for details. |
2 | ||
0a29b90c FG |
3 | use crate::core::{GitReference, PackageId, SourceId}; |
4 | use crate::sources::git; | |
49aad941 | 5 | use crate::sources::git::fetch::RemoteKind; |
0a29b90c FG |
6 | use crate::sources::registry::download; |
7 | use crate::sources::registry::MaybeLock; | |
8 | use crate::sources::registry::{LoadResponse, RegistryConfig, RegistryData}; | |
ed00b5ec | 9 | use crate::util::cache_lock::CacheLockMode; |
0a29b90c FG |
10 | use crate::util::errors::CargoResult; |
11 | use crate::util::interning::InternedString; | |
12 | use crate::util::{Config, Filesystem}; | |
13 | use anyhow::Context as _; | |
14 | use cargo_util::paths; | |
15 | use lazycell::LazyCell; | |
0a29b90c FG |
16 | use std::cell::{Cell, Ref, RefCell}; |
17 | use std::fs::File; | |
18 | use std::mem; | |
19 | use std::path::Path; | |
20 | use std::str; | |
21 | use std::task::{ready, Poll}; | |
add651ee | 22 | use tracing::{debug, trace}; |
0a29b90c FG |
23 | |
24 | /// A remote registry is a registry that lives at a remote URL (such as | |
25 | /// crates.io). The git index is cloned locally, and `.crate` files are | |
26 | /// downloaded as needed and cached locally. | |
fe692bf9 FG |
27 | /// |
28 | /// This type is primarily accessed through the [`RegistryData`] trait. | |
29 | /// | |
30 | /// See the [module-level documentation](super) for the index format and layout. | |
31 | /// | |
32 | /// ## History of Git-based index registry | |
33 | /// | |
34 | /// Using Git to host this index used to be quite efficient. The full index can | |
35 | /// be stored efficiently locally on disk, and once it is downloaded, all | |
36 | /// queries of a registry can happen locally and needn't touch the network. | |
37 | /// Git-based index was a reasonable design choice at the time when HTTP/2 | |
38 | /// was just introduced. | |
39 | /// | |
40 | /// However, the full index keeps growing as crates.io grows. It becomes | |
41 | /// relatively big and slows down the first use of Cargo. Git (specifically | |
42 | /// libgit2) is not efficient at handling huge amounts of small files either. | |
43 | /// On the other hand, newer protocols like HTTP/2 are prevalent and capable to | |
44 | /// serve a bunch of tiny files. Today, it is encouraged to use [`HttpRegistry`], | |
45 | /// which is the default from 1.70.0. That being said, Cargo will continue | |
46 | /// supporting Git-based index for a pretty long while. | |
47 | /// | |
48 | /// [`HttpRegistry`]: super::http_remote::HttpRegistry | |
0a29b90c | 49 | pub struct RemoteRegistry<'cfg> { |
fe692bf9 | 50 | /// Path to the registry index (`$CARGO_HOME/registry/index/$REG-HASH`). |
0a29b90c | 51 | index_path: Filesystem, |
fe692bf9 | 52 | /// Path to the cache of `.crate` files (`$CARGO_HOME/registry/cache/$REG-HASH`). |
0a29b90c | 53 | cache_path: Filesystem, |
fe692bf9 | 54 | /// The unique identifier of this registry source. |
0a29b90c | 55 | source_id: SourceId, |
fe692bf9 FG |
56 | /// This reference is stored so that when a registry needs update, it knows |
57 | /// where to fetch from. | |
0a29b90c FG |
58 | index_git_ref: GitReference, |
59 | config: &'cfg Config, | |
fe692bf9 FG |
60 | /// A Git [tree object] to help this registry find crate metadata from the |
61 | /// underlying Git repository. | |
62 | /// | |
781aab86 | 63 | /// This is stored here to prevent Git from repeatedly creating a tree object |
fe692bf9 FG |
64 | /// during each call into `load()`. |
65 | /// | |
66 | /// [tree object]: https://git-scm.com/book/en/v2/Git-Internals-Git-Objects#_tree_objects | |
0a29b90c | 67 | tree: RefCell<Option<git2::Tree<'static>>>, |
fe692bf9 | 68 | /// A Git repository that contains the actual index we want. |
0a29b90c | 69 | repo: LazyCell<git2::Repository>, |
fe692bf9 | 70 | /// The current HEAD commit of the underlying Git repository. |
0a29b90c | 71 | head: Cell<Option<git2::Oid>>, |
fe692bf9 | 72 | /// This stores sha value of the current HEAD commit for convenience. |
0a29b90c | 73 | current_sha: Cell<Option<InternedString>>, |
781aab86 | 74 | /// Whether this registry needs to update package information. |
fe692bf9 FG |
75 | /// |
76 | /// See [`RemoteRegistry::mark_updated`] on how to make sure a registry | |
77 | /// index is updated only once per session. | |
78 | needs_update: bool, | |
79 | /// Disables status messages. | |
0a29b90c FG |
80 | quiet: bool, |
81 | } | |
82 | ||
83 | impl<'cfg> RemoteRegistry<'cfg> { | |
fe692bf9 FG |
84 | /// Creates a Git-rebased remote registry for `source_id`. |
85 | /// | |
86 | /// * `name` --- Name of a path segment where `.crate` tarballs and the | |
87 | /// registry index are stored. Expect to be unique. | |
0a29b90c FG |
88 | pub fn new(source_id: SourceId, config: &'cfg Config, name: &str) -> RemoteRegistry<'cfg> { |
89 | RemoteRegistry { | |
90 | index_path: config.registry_index_path().join(name), | |
91 | cache_path: config.registry_cache_path().join(name), | |
92 | source_id, | |
93 | config, | |
0a29b90c FG |
94 | index_git_ref: GitReference::DefaultBranch, |
95 | tree: RefCell::new(None), | |
96 | repo: LazyCell::new(), | |
97 | head: Cell::new(None), | |
98 | current_sha: Cell::new(None), | |
99 | needs_update: false, | |
100 | quiet: false, | |
101 | } | |
102 | } | |
103 | ||
fe692bf9 | 104 | /// Creates intermediate dirs and initialize the repository. |
0a29b90c FG |
105 | fn repo(&self) -> CargoResult<&git2::Repository> { |
106 | self.repo.try_borrow_with(|| { | |
fe692bf9 | 107 | trace!("acquiring registry index lock"); |
ed00b5ec FG |
108 | let path = self |
109 | .config | |
110 | .assert_package_cache_locked(CacheLockMode::DownloadExclusive, &self.index_path); | |
0a29b90c | 111 | |
0a29b90c FG |
112 | match git2::Repository::open(&path) { |
113 | Ok(repo) => Ok(repo), | |
114 | Err(_) => { | |
115 | drop(paths::remove_dir_all(&path)); | |
116 | paths::create_dir_all(&path)?; | |
117 | ||
118 | // Note that we'd actually prefer to use a bare repository | |
119 | // here as we're not actually going to check anything out. | |
120 | // All versions of Cargo, though, share the same CARGO_HOME, | |
121 | // so for compatibility with older Cargo which *does* do | |
122 | // checkouts we make sure to initialize a new full | |
123 | // repository (not a bare one). | |
124 | // | |
125 | // We should change this to `init_bare` whenever we feel | |
126 | // like enough time has passed or if we change the directory | |
127 | // that the folder is located in, such as by changing the | |
128 | // hash at the end of the directory. | |
129 | // | |
130 | // Note that in the meantime we also skip `init.templatedir` | |
131 | // as it can be misconfigured sometimes or otherwise add | |
132 | // things that we don't want. | |
133 | let mut opts = git2::RepositoryInitOptions::new(); | |
134 | opts.external_template(false); | |
135 | Ok(git2::Repository::init_opts(&path, &opts).with_context(|| { | |
136 | format!("failed to initialize index git repository (in {:?})", path) | |
137 | })?) | |
138 | } | |
139 | } | |
140 | }) | |
141 | } | |
142 | ||
fe692bf9 | 143 | /// Get the object ID of the HEAD commit from the underlying Git repository. |
0a29b90c FG |
144 | fn head(&self) -> CargoResult<git2::Oid> { |
145 | if self.head.get().is_none() { | |
146 | let repo = self.repo()?; | |
147 | let oid = self.index_git_ref.resolve(repo)?; | |
148 | self.head.set(Some(oid)); | |
149 | } | |
150 | Ok(self.head.get().unwrap()) | |
151 | } | |
152 | ||
fe692bf9 FG |
153 | /// Returns a [`git2::Tree`] object of the current HEAD commit of the |
154 | /// underlying Git repository. | |
0a29b90c FG |
155 | fn tree(&self) -> CargoResult<Ref<'_, git2::Tree<'_>>> { |
156 | { | |
157 | let tree = self.tree.borrow(); | |
158 | if tree.is_some() { | |
159 | return Ok(Ref::map(tree, |s| s.as_ref().unwrap())); | |
160 | } | |
161 | } | |
162 | let repo = self.repo()?; | |
163 | let commit = repo.find_commit(self.head()?)?; | |
164 | let tree = commit.tree()?; | |
165 | ||
fe692bf9 | 166 | // SAFETY: |
0a29b90c FG |
167 | // Unfortunately in libgit2 the tree objects look like they've got a |
168 | // reference to the repository object which means that a tree cannot | |
169 | // outlive the repository that it came from. Here we want to cache this | |
170 | // tree, though, so to accomplish this we transmute it to a static | |
171 | // lifetime. | |
172 | // | |
173 | // Note that we don't actually hand out the static lifetime, instead we | |
174 | // only return a scoped one from this function. Additionally the repo | |
175 | // we loaded from (above) lives as long as this object | |
176 | // (`RemoteRegistry`) so we then just need to ensure that the tree is | |
177 | // destroyed first in the destructor, hence the destructor on | |
178 | // `RemoteRegistry` below. | |
179 | let tree = unsafe { mem::transmute::<git2::Tree<'_>, git2::Tree<'static>>(tree) }; | |
180 | *self.tree.borrow_mut() = Some(tree); | |
181 | Ok(Ref::map(self.tree.borrow(), |s| s.as_ref().unwrap())) | |
182 | } | |
183 | ||
fe692bf9 FG |
184 | /// Gets the current version of the registry index. |
185 | /// | |
186 | /// It is usually sha of the HEAD commit from the underlying Git repository. | |
0a29b90c FG |
187 | fn current_version(&self) -> Option<InternedString> { |
188 | if let Some(sha) = self.current_sha.get() { | |
189 | return Some(sha); | |
190 | } | |
191 | let sha = InternedString::new(&self.head().ok()?.to_string()); | |
192 | self.current_sha.set(Some(sha)); | |
193 | Some(sha) | |
194 | } | |
195 | ||
fe692bf9 | 196 | /// Whether the registry is up-to-date. See [`Self::mark_updated`] for more. |
0a29b90c FG |
197 | fn is_updated(&self) -> bool { |
198 | self.config.updated_sources().contains(&self.source_id) | |
199 | } | |
200 | ||
fe692bf9 FG |
201 | /// Marks this registry as up-to-date. |
202 | /// | |
203 | /// This makes sure the index is only updated once per session since it is | |
204 | /// an expensive operation. This generally only happens when the resolver | |
205 | /// is run multiple times, such as during `cargo publish`. | |
0a29b90c FG |
206 | fn mark_updated(&self) { |
207 | self.config.updated_sources().insert(self.source_id); | |
208 | } | |
209 | } | |
210 | ||
0a29b90c FG |
211 | impl<'cfg> RegistryData for RemoteRegistry<'cfg> { |
212 | fn prepare(&self) -> CargoResult<()> { | |
fe692bf9 | 213 | self.repo()?; |
0a29b90c FG |
214 | Ok(()) |
215 | } | |
216 | ||
217 | fn index_path(&self) -> &Filesystem { | |
218 | &self.index_path | |
219 | } | |
220 | ||
221 | fn assert_index_locked<'a>(&self, path: &'a Filesystem) -> &'a Path { | |
ed00b5ec FG |
222 | self.config |
223 | .assert_package_cache_locked(CacheLockMode::DownloadExclusive, path) | |
0a29b90c FG |
224 | } |
225 | ||
fe692bf9 FG |
226 | /// Read the general concept for `load()` on [`RegistryData::load`]. |
227 | /// | |
228 | /// `index_version` is a string representing the version of the file used | |
229 | /// to construct the cached copy. | |
230 | /// | |
231 | /// Older versions of Cargo used the single value of the hash of the HEAD | |
232 | /// commit as a `index_version`. This is technically correct but a little | |
233 | /// too conservative. If a new commit is fetched all cached files need to | |
234 | /// be regenerated even if a particular file was not changed. | |
235 | /// | |
236 | /// However if an old cargo has written such a file we still know how to | |
237 | /// read it, as long as we check for that hash value. | |
238 | /// | |
239 | /// Cargo now uses a hash of the file's contents as provided by git. | |
0a29b90c FG |
240 | fn load( |
241 | &mut self, | |
242 | _root: &Path, | |
243 | path: &Path, | |
244 | index_version: Option<&str>, | |
245 | ) -> Poll<CargoResult<LoadResponse>> { | |
246 | if self.needs_update { | |
247 | return Poll::Pending; | |
248 | } | |
249 | // Check if the cache is valid. | |
250 | let git_commit_hash = self.current_version(); | |
251 | if index_version.is_some() && index_version == git_commit_hash.as_deref() { | |
fe692bf9 FG |
252 | // This file was written by an old version of cargo, but it is |
253 | // still up-to-date. | |
0a29b90c FG |
254 | return Poll::Ready(Ok(LoadResponse::CacheValid)); |
255 | } | |
256 | // Note that the index calls this method and the filesystem is locked | |
257 | // in the index, so we don't need to worry about an `update_index` | |
258 | // happening in a different process. | |
259 | fn load_helper( | |
260 | registry: &RemoteRegistry<'_>, | |
261 | path: &Path, | |
262 | index_version: Option<&str>, | |
263 | ) -> CargoResult<LoadResponse> { | |
264 | let repo = registry.repo()?; | |
265 | let tree = registry.tree()?; | |
266 | let entry = tree.get_path(path); | |
267 | let entry = entry?; | |
268 | let git_file_hash = Some(entry.id().to_string()); | |
269 | ||
270 | // Check if the cache is valid. | |
271 | if index_version.is_some() && index_version == git_file_hash.as_deref() { | |
272 | return Ok(LoadResponse::CacheValid); | |
273 | } | |
274 | ||
275 | let object = entry.to_object(repo)?; | |
781aab86 FG |
276 | let Some(blob) = object.as_blob() else { |
277 | anyhow::bail!("path `{}` is not a blob in the git repo", path.display()) | |
0a29b90c FG |
278 | }; |
279 | ||
280 | Ok(LoadResponse::Data { | |
281 | raw_data: blob.content().to_vec(), | |
282 | index_version: git_file_hash, | |
283 | }) | |
284 | } | |
285 | ||
286 | match load_helper(&self, path, index_version) { | |
287 | Ok(result) => Poll::Ready(Ok(result)), | |
288 | Err(_) if !self.is_updated() => { | |
fe692bf9 FG |
289 | // If git returns an error and we haven't updated the repo, |
290 | // return pending to allow an update to try again. | |
0a29b90c FG |
291 | self.needs_update = true; |
292 | Poll::Pending | |
293 | } | |
294 | Err(e) | |
295 | if e.downcast_ref::<git2::Error>() | |
296 | .map(|e| e.code() == git2::ErrorCode::NotFound) | |
297 | .unwrap_or_default() => | |
298 | { | |
299 | // The repo has been updated and the file does not exist. | |
300 | Poll::Ready(Ok(LoadResponse::NotFound)) | |
301 | } | |
302 | Err(e) => Poll::Ready(Err(e)), | |
303 | } | |
304 | } | |
305 | ||
306 | fn config(&mut self) -> Poll<CargoResult<Option<RegistryConfig>>> { | |
307 | debug!("loading config"); | |
308 | self.prepare()?; | |
ed00b5ec FG |
309 | self.config |
310 | .assert_package_cache_locked(CacheLockMode::DownloadExclusive, &self.index_path); | |
fe692bf9 | 311 | match ready!(self.load(Path::new(""), Path::new(RegistryConfig::NAME), None)?) { |
0a29b90c FG |
312 | LoadResponse::Data { raw_data, .. } => { |
313 | trace!("config loaded"); | |
781aab86 | 314 | let cfg: RegistryConfig = serde_json::from_slice(&raw_data)?; |
0a29b90c FG |
315 | Poll::Ready(Ok(Some(cfg))) |
316 | } | |
317 | _ => Poll::Ready(Ok(None)), | |
318 | } | |
319 | } | |
320 | ||
321 | fn block_until_ready(&mut self) -> CargoResult<()> { | |
322 | if !self.needs_update { | |
323 | return Ok(()); | |
324 | } | |
325 | ||
326 | self.needs_update = false; | |
327 | ||
0a29b90c FG |
328 | if self.is_updated() { |
329 | return Ok(()); | |
330 | } | |
331 | self.mark_updated(); | |
332 | ||
333 | if self.config.offline() { | |
334 | return Ok(()); | |
335 | } | |
336 | if self.config.cli_unstable().no_index_update { | |
337 | return Ok(()); | |
338 | } | |
339 | ||
340 | debug!("updating the index"); | |
341 | ||
342 | // Ensure that we'll actually be able to acquire an HTTP handle later on | |
343 | // once we start trying to download crates. This will weed out any | |
344 | // problems with `.cargo/config` configuration related to HTTP. | |
345 | // | |
346 | // This way if there's a problem the error gets printed before we even | |
347 | // hit the index, which may not actually read this configuration. | |
348 | self.config.http()?; | |
349 | ||
350 | self.prepare()?; | |
351 | self.head.set(None); | |
352 | *self.tree.borrow_mut() = None; | |
353 | self.current_sha.set(None); | |
ed00b5ec FG |
354 | let _path = self |
355 | .config | |
356 | .assert_package_cache_locked(CacheLockMode::DownloadExclusive, &self.index_path); | |
0a29b90c FG |
357 | if !self.quiet { |
358 | self.config | |
359 | .shell() | |
360 | .status("Updating", self.source_id.display_index())?; | |
361 | } | |
362 | ||
363 | // Fetch the latest version of our `index_git_ref` into the index | |
364 | // checkout. | |
365 | let url = self.source_id.url(); | |
366 | let repo = self.repo.borrow_mut().unwrap(); | |
49aad941 FG |
367 | git::fetch( |
368 | repo, | |
369 | url.as_str(), | |
370 | &self.index_git_ref, | |
371 | self.config, | |
372 | RemoteKind::Registry, | |
373 | ) | |
374 | .with_context(|| format!("failed to fetch `{}`", url))?; | |
0a29b90c | 375 | |
0a29b90c FG |
376 | Ok(()) |
377 | } | |
378 | ||
fe692bf9 FG |
379 | /// Read the general concept for `invalidate_cache()` on |
380 | /// [`RegistryData::invalidate_cache`]. | |
381 | /// | |
382 | /// To fully invalidate, undo [`RemoteRegistry::mark_updated`]'s work. | |
0a29b90c | 383 | fn invalidate_cache(&mut self) { |
0a29b90c FG |
384 | self.needs_update = true; |
385 | } | |
386 | ||
387 | fn set_quiet(&mut self, quiet: bool) { | |
388 | self.quiet = quiet; | |
389 | } | |
390 | ||
391 | fn is_updated(&self) -> bool { | |
392 | self.is_updated() | |
393 | } | |
394 | ||
395 | fn download(&mut self, pkg: PackageId, checksum: &str) -> CargoResult<MaybeLock> { | |
396 | let registry_config = loop { | |
397 | match self.config()? { | |
398 | Poll::Pending => self.block_until_ready()?, | |
399 | Poll::Ready(cfg) => break cfg.unwrap(), | |
400 | } | |
401 | }; | |
402 | ||
403 | download::download( | |
404 | &self.cache_path, | |
405 | &self.config, | |
406 | pkg, | |
407 | checksum, | |
408 | registry_config, | |
409 | ) | |
410 | } | |
411 | ||
412 | fn finish_download( | |
413 | &mut self, | |
414 | pkg: PackageId, | |
415 | checksum: &str, | |
416 | data: &[u8], | |
417 | ) -> CargoResult<File> { | |
418 | download::finish_download(&self.cache_path, &self.config, pkg, checksum, data) | |
419 | } | |
420 | ||
421 | fn is_crate_downloaded(&self, pkg: PackageId) -> bool { | |
422 | download::is_crate_downloaded(&self.cache_path, &self.config, pkg) | |
423 | } | |
424 | } | |
425 | ||
fe692bf9 FG |
426 | /// Implemented to just be sure to drop `tree` field before our other fields. |
427 | /// See SAFETY inside [`RemoteRegistry::tree()`] for more. | |
0a29b90c FG |
428 | impl<'cfg> Drop for RemoteRegistry<'cfg> { |
429 | fn drop(&mut self) { | |
0a29b90c FG |
430 | self.tree.borrow_mut().take(); |
431 | } | |
432 | } |