]> git.proxmox.com Git - cargo.git/commitdiff
Don't check out the crates.io index locally
authorAlex Crichton <alex@alexcrichton.com>
Wed, 10 May 2017 22:00:33 +0000 (15:00 -0700)
committerAlex Crichton <alex@alexcrichton.com>
Thu, 11 May 2017 22:05:33 +0000 (15:05 -0700)
This commit moves working with the crates.io index to operating on the git
object layers rather than actually literally checking out the index. This is
aimed at two different goals:

* Improving the on-disk file size of the registry
* Improving cloning times for the registry as the index doesn't need to be
  checked out

The on disk size of my `registry` folder of a fresh check out of the index went
form 124M to 48M, saving a good chunk of space! The entire operation took about
0.6s less on a Unix machine (out of 4.7s total for current Cargo). On Windows,
however, the clone operation went from 11s to 6.7s, a much larger improvement!

Closes #4015

src/cargo/sources/registry/index.rs
src/cargo/sources/registry/local.rs
src/cargo/sources/registry/mod.rs
src/cargo/sources/registry/remote.rs

index bbd7e226ca7ef0ca80a1680ce0e24a7434a3d612..026f20204a9166005117f299d9c2e8938dfbb1b8 100644 (file)
@@ -1,14 +1,15 @@
 use std::collections::HashMap;
-use std::io::prelude::*;
-use std::fs::File;
 use std::path::Path;
+use std::str;
 
 use serde_json;
 
 use core::dependency::{Dependency, DependencyInner, Kind};
 use core::{SourceId, Summary, PackageId, Registry};
 use sources::registry::{RegistryPackage, RegistryDependency, INDEX_LOCK};
+use sources::registry::RegistryData;
 use util::{CargoResult, ChainError, internal, Filesystem, Config};
+use util::human;
 
 pub struct RegistryIndex<'cfg> {
     source_id: SourceId,
@@ -23,7 +24,8 @@ impl<'cfg> RegistryIndex<'cfg> {
     pub fn new(id: &SourceId,
                path: &Filesystem,
                config: &'cfg Config,
-               locked: bool) -> RegistryIndex<'cfg> {
+               locked: bool)
+               -> RegistryIndex<'cfg> {
         RegistryIndex {
             source_id: id.clone(),
             path: path.clone(),
@@ -35,13 +37,16 @@ impl<'cfg> RegistryIndex<'cfg> {
     }
 
     /// Return the hash listed for a specified PackageId.
-    pub fn hash(&mut self, pkg: &PackageId) -> CargoResult<String> {
+    pub fn hash(&mut self,
+                pkg: &PackageId,
+                load: &mut RegistryData)
+                -> CargoResult<String> {
         let key = (pkg.name().to_string(), pkg.version().to_string());
         if let Some(s) = self.hashes.get(&key) {
             return Ok(s.clone())
         }
         // Ok, we're missing the key, so parse the index file to load it.
-        self.summaries(pkg.name())?;
+        self.summaries(pkg.name(), load)?;
         self.hashes.get(&key).chain_error(|| {
             internal(format!("no hash listed for {}", pkg))
         }).map(|s| s.clone())
@@ -51,11 +56,14 @@ impl<'cfg> RegistryIndex<'cfg> {
     ///
     /// Returns a list of pairs of (summary, yanked) for the package name
     /// specified.
-    pub fn summaries(&mut self, name: &str) -> CargoResult<&Vec<(Summary, bool)>> {
+    pub fn summaries(&mut self,
+                     name: &str,
+                     load: &mut RegistryData)
+                     -> CargoResult<&Vec<(Summary, bool)>> {
         if self.cache.contains_key(name) {
             return Ok(&self.cache[name]);
         }
-        let summaries = self.load_summaries(name)?;
+        let summaries = self.load_summaries(name, load)?;
         let summaries = summaries.into_iter().filter(|summary| {
             summary.0.package_id().name() == name
         }).collect();
@@ -63,8 +71,11 @@ impl<'cfg> RegistryIndex<'cfg> {
         Ok(&self.cache[name])
     }
 
-    fn load_summaries(&mut self, name: &str) -> CargoResult<Vec<(Summary, bool)>> {
-        let (path, _lock) = if self.locked {
+    fn load_summaries(&mut self,
+                      name: &str,
+                      load: &mut RegistryData)
+                      -> CargoResult<Vec<(Summary, bool)>> {
+        let (root, _lock) = if self.locked {
             let lock = self.path.open_ro(Path::new(INDEX_LOCK),
                                          self.config,
                                          "the registry index");
@@ -84,25 +95,32 @@ impl<'cfg> RegistryIndex<'cfg> {
 
         // see module comment for why this is structured the way it is
         let path = match fs_name.len() {
-            1 => path.join("1").join(&fs_name),
-            2 => path.join("2").join(&fs_name),
-            3 => path.join("3").join(&fs_name[..1]).join(&fs_name),
-            _ => path.join(&fs_name[0..2])
-                     .join(&fs_name[2..4])
-                     .join(&fs_name),
+            1 => format!("1/{}", fs_name),
+            2 => format!("2/{}", fs_name),
+            3 => format!("3/{}/{}", &fs_name[..1], fs_name),
+            _ => format!("{}/{}/{}", &fs_name[0..2], &fs_name[2..4], fs_name),
+            // 1 => Path::new("1").join(fs_name),
+            // 2 => Path::new("2").join(fs_name),
+            // 3 => Path::new("3").join(&fs_name[..1]).join(fs_name),
+            // _ => Path::new(&fs_name[0..2]).join(&fs_name[2..4]).join(fs_name),
         };
-        match File::open(&path) {
-            Ok(mut f) => {
-                let mut contents = String::new();
-                f.read_to_string(&mut contents)?;
-                let ret: CargoResult<Vec<(Summary, bool)>>;
-                ret = contents.lines().filter(|l| !l.trim().is_empty())
-                              .map(|l| self.parse_registry_package(l))
-                              .collect();
-                ret.chain_error(|| {
-                    internal(format!("failed to parse registry's information \
-                                      for: {}", name))
-                })
+        match load.load(&root, Path::new(&path)) {
+            Ok(contents) => {
+                let contents = str::from_utf8(&contents).map_err(|_| {
+                    human("registry index file was not valid utf-8")
+                })?;
+                let lines = contents.lines()
+                                    .map(|s| s.trim())
+                                    .filter(|l| !l.is_empty());
+
+                // Attempt forwards-compatibility on the index by ignoring
+                // everything that we ourselves don't understand, that should
+                // allow future cargo implementations to break the
+                // interpretation of each line here and older cargo will simply
+                // ignore the new lines.
+                Ok(lines.filter_map(|line| {
+                    self.parse_registry_package(line).ok()
+                }).collect())
             }
             Err(..) => Ok(Vec::new()),
         }
@@ -161,12 +179,13 @@ impl<'cfg> RegistryIndex<'cfg> {
               .set_kind(kind)
               .into_dependency())
     }
-}
 
-impl<'cfg> Registry for RegistryIndex<'cfg> {
-    fn query(&mut self, dep: &Dependency) -> CargoResult<Vec<Summary>> {
+    pub fn query(&mut self,
+                 dep: &Dependency,
+                 load: &mut RegistryData)
+                 -> CargoResult<Vec<Summary>> {
         let mut summaries = {
-            let summaries = self.summaries(dep.name())?;
+            let summaries = self.summaries(dep.name(), load)?;
             summaries.iter().filter(|&&(_, yanked)| {
                 dep.source_id().precise().is_some() || !yanked
             }).map(|s| s.0.clone()).collect::<Vec<_>>()
@@ -188,8 +207,4 @@ impl<'cfg> Registry for RegistryIndex<'cfg> {
         });
         summaries.query(dep)
     }
-
-    fn supports_checksums(&self) -> bool {
-        true
-    }
 }
index 67e88a9c72df829339c2202e5668adf2e5a8fac6..fa4cbcd82d1f3a87ef33fc3fcc720cc9bf3b3ca5 100644 (file)
@@ -6,8 +6,9 @@ use rustc_serialize::hex::ToHex;
 
 use core::PackageId;
 use sources::registry::{RegistryData, RegistryConfig};
-use util::{Config, CargoResult, ChainError, human, Sha256, Filesystem};
 use util::FileLock;
+use util::paths;
+use util::{Config, CargoResult, ChainError, human, Sha256, Filesystem};
 
 pub struct LocalRegistry<'cfg> {
     index_path: Filesystem,
@@ -34,7 +35,11 @@ impl<'cfg> RegistryData for LocalRegistry<'cfg> {
         &self.index_path
     }
 
-    fn config(&self) -> CargoResult<Option<RegistryConfig>> {
+    fn load(&self, root: &Path, path: &Path) -> CargoResult<Vec<u8>> {
+        paths::read_bytes(&root.join(path))
+    }
+
+    fn config(&mut self) -> CargoResult<Option<RegistryConfig>> {
         // Local registries don't have configuration for remote APIs or anything
         // like that
         Ok(None)
index 2044c30ebfed50a37ced5f04c9666e30778e8e55..2af4ad9719f6f468e96c1e65cdb8a0d0108c95a9 100644 (file)
@@ -219,7 +219,8 @@ struct RegistryDependency {
 
 pub trait RegistryData {
     fn index_path(&self) -> &Filesystem;
-    fn config(&self) -> CargoResult<Option<RegistryConfig>>;
+    fn load(&self, root: &Path, path: &Path) -> CargoResult<Vec<u8>>;
+    fn config(&mut self) -> CargoResult<Option<RegistryConfig>>;
     fn update_index(&mut self) -> CargoResult<()>;
     fn download(&mut self,
                 pkg: &PackageId,
@@ -274,7 +275,7 @@ impl<'cfg> RegistrySource<'cfg> {
     /// Decode the configuration stored within the registry.
     ///
     /// This requires that the index has been at least checked out.
-    pub fn config(&self) -> CargoResult<Option<RegistryConfig>> {
+    pub fn config(&mut self) -> CargoResult<Option<RegistryConfig>> {
         self.ops.config()
     }
 
@@ -323,12 +324,12 @@ impl<'cfg> Registry for RegistrySource<'cfg> {
         // come back with no summaries, then our registry may need to be
         // updated, so we fall back to performing a lazy update.
         if dep.source_id().precise().is_some() && !self.updated {
-            if self.index.query(dep)?.is_empty() {
+            if self.index.query(dep, &mut *self.ops)?.is_empty() {
                 self.do_update()?;
             }
         }
 
-        self.index.query(dep)
+        self.index.query(dep, &mut *self.ops)
     }
 
     fn supports_checksums(&self) -> bool {
@@ -356,7 +357,7 @@ impl<'cfg> Source for RegistrySource<'cfg> {
     }
 
     fn download(&mut self, package: &PackageId) -> CargoResult<Package> {
-        let hash = self.index.hash(package)?;
+        let hash = self.index.hash(package, &mut *self.ops)?;
         let path = self.ops.download(package, &hash)?;
         let path = self.unpack_package(package, &path).chain_error(|| {
             internal(format!("failed to unpack package `{}`", package))
@@ -369,7 +370,7 @@ impl<'cfg> Source for RegistrySource<'cfg> {
         // differ due to historical Cargo bugs. To paper over these we trash the
         // *summary* loaded from the Cargo.toml we just downloaded with the one
         // we loaded from the index.
-        let summaries = self.index.summaries(package.name())?;
+        let summaries = self.index.summaries(package.name(), &mut *self.ops)?;
         let summary = summaries.iter().map(|s| &s.0).find(|s| {
             s.package_id() == package
         }).expect("summary not found");
index 7408eb450606eca9f5b8d95e10e62d38a9a04d87..78c9cb5b5fa75ad16e0acb1d232d311c15bc0df6 100644 (file)
@@ -1,5 +1,7 @@
+use std::cell::{RefCell, Ref, Cell};
 use std::io::SeekFrom;
 use std::io::prelude::*;
+use std::mem;
 use std::path::Path;
 
 use curl::easy::{Easy, List};
@@ -13,8 +15,7 @@ use ops;
 use sources::git;
 use sources::registry::{RegistryData, RegistryConfig, INDEX_LOCK};
 use util::network;
-use util::paths;
-use util::{FileLock, Filesystem};
+use util::{FileLock, Filesystem, LazyCell};
 use util::{Config, CargoResult, ChainError, human, Sha256, ToUrl};
 use util::errors::HttpError;
 
@@ -23,7 +24,10 @@ pub struct RemoteRegistry<'cfg> {
     cache_path: Filesystem,
     source_id: SourceId,
     config: &'cfg Config,
-    handle: Option<Easy>,
+    handle: LazyCell<RefCell<Easy>>,
+    tree: RefCell<Option<git2::Tree<'static>>>,
+    repo: LazyCell<git2::Repository>,
+    head: Cell<Option<git2::Oid>>,
 }
 
 impl<'cfg> RemoteRegistry<'cfg> {
@@ -34,9 +38,74 @@ impl<'cfg> RemoteRegistry<'cfg> {
             cache_path: config.registry_cache_path().join(name),
             source_id: source_id.clone(),
             config: config,
-            handle: None,
+            tree: RefCell::new(None),
+            handle: LazyCell::new(),
+            repo: LazyCell::new(),
+            head: Cell::new(None),
         }
     }
+
+    fn easy(&self) -> CargoResult<&RefCell<Easy>> {
+        self.handle.get_or_try_init(|| {
+            ops::http_handle(self.config).map(RefCell::new)
+        })
+    }
+
+    fn repo(&self) -> CargoResult<&git2::Repository> {
+        self.repo.get_or_try_init(|| {
+            let path = self.index_path.clone().into_path_unlocked();
+
+            match git2::Repository::open(&path) {
+                Ok(repo) => Ok(repo),
+                Err(_) => {
+                    self.index_path.create_dir()?;
+                    let lock = self.index_path.open_rw(Path::new(INDEX_LOCK),
+                                                       self.config,
+                                                       "the registry index")?;
+                    let _ = lock.remove_siblings();
+                    Ok(git2::Repository::init_bare(&path)?)
+                }
+            }
+        })
+    }
+
+    fn head(&self) -> CargoResult<git2::Oid> {
+        if self.head.get().is_none() {
+            let oid = self.repo()?.refname_to_id("refs/remotes/origin/master")?;
+            self.head.set(Some(oid));
+        }
+        Ok(self.head.get().unwrap())
+    }
+
+    fn tree(&self) -> CargoResult<Ref<git2::Tree>> {
+        {
+            let tree = self.tree.borrow();
+            if tree.is_some() {
+                return Ok(Ref::map(tree, |s| s.as_ref().unwrap()))
+            }
+        }
+        let repo = self.repo()?;
+        let commit = repo.find_commit(self.head()?)?;
+        let tree = commit.tree()?;
+
+        // Unfortunately in libgit2 the tree objects look like they've got a
+        // reference to the repository object which means that a tree cannot
+        // outlive the repository that it came from. Here we want to cache this
+        // tree, though, so to accomplish this we transmute it to a static
+        // lifetime.
+        //
+        // Note that we don't actually hand out the static lifetime, instead we
+        // only return a scoped one from this function. Additionally the repo
+        // we loaded from (above) lives as long as this object
+        // (`RemoteRegistry`) so we then just need to ensure that the tree is
+        // destroyed first in the destructor, hence the destructor on
+        // `RemoteRegistry` below.
+        let tree = unsafe {
+            mem::transmute::<git2::Tree, git2::Tree<'static>>(tree)
+        };
+        *self.tree.borrow_mut() = Some(tree);
+        Ok(Ref::map(self.tree.borrow(), |s| s.as_ref().unwrap()))
+    }
 }
 
 impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
@@ -44,13 +113,28 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
         &self.index_path
     }
 
-    fn config(&self) -> CargoResult<Option<RegistryConfig>> {
-        let lock = self.index_path.open_ro(Path::new(INDEX_LOCK),
-                                           self.config,
-                                           "the registry index")?;
-        let path = lock.path().parent().unwrap();
-        let contents = paths::read(&path.join("config.json"))?;
-        let config = serde_json::from_str(&contents)?;
+    fn load(&self, _root: &Path, path: &Path) -> CargoResult<Vec<u8>> {
+        // Note that the index calls this method and the filesystem is locked
+        // in the index, so we don't need to worry about an `update_index`
+        // happening in a different process.
+        let repo = self.repo()?;
+        let tree = self.tree()?;
+        let entry = tree.get_path(path)?;
+        let object = entry.to_object(&repo)?;
+        let blob = match object.as_blob() {
+            Some(blob) => blob,
+            None => bail!("path `{}` is not a blob in the git repo", path.display()),
+        };
+        Ok(blob.content().to_vec())
+    }
+
+    fn config(&mut self) -> CargoResult<Option<RegistryConfig>> {
+        self.repo()?; // create intermediate dirs and initialize the repo
+        let _lock = self.index_path.open_ro(Path::new(INDEX_LOCK),
+                                            self.config,
+                                            "the registry index")?;
+        let json = self.load(Path::new(""), Path::new("config.json"))?;
+        let config = serde_json::from_slice(&json)?;
         Ok(Some(config))
     }
 
@@ -63,53 +147,33 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
         // hit the index, which may not actually read this configuration.
         ops::http_handle(self.config)?;
 
-        // Then we actually update the index
-        self.index_path.create_dir()?;
-        let lock = self.index_path.open_rw(Path::new(INDEX_LOCK),
-                                           self.config,
-                                           "the registry index")?;
-        let path = lock.path().parent().unwrap();
-
+        let repo = self.repo()?;
+        let _lock = self.index_path.open_rw(Path::new(INDEX_LOCK),
+                                            self.config,
+                                            "the registry index")?;
         self.config.shell().status("Updating",
              format!("registry `{}`", self.source_id.url()))?;
 
-        let repo = git2::Repository::open(path).or_else(|_| {
-            let _ = lock.remove_siblings();
-            git2::Repository::init(path)
-        })?;
-
         if self.source_id.url().host_str() == Some("github.com") {
-            if let Ok(oid) = repo.refname_to_id("refs/heads/master") {
-                let handle = match self.handle {
-                    Some(ref mut handle) => handle,
-                    None => {
-                        self.handle = Some(ops::http_handle(self.config)?);
-                        self.handle.as_mut().unwrap()
-                    }
-                };
+            if let Ok(oid) = self.head() {
+                let mut handle = self.easy()?.borrow_mut();
                 debug!("attempting github fast path for {}",
                        self.source_id.url());
-                if github_up_to_date(handle, self.source_id.url(), &oid) {
+                if github_up_to_date(&mut handle, self.source_id.url(), &oid) {
                     return Ok(())
                 }
                 debug!("fast path failed, falling back to a git fetch");
             }
         }
 
-        // git fetch origin
+        // git fetch origin master
         let url = self.source_id.url().to_string();
         let refspec = "refs/heads/master:refs/remotes/origin/master";
-
         git::fetch(&repo, &url, refspec, self.config).chain_error(|| {
             human(format!("failed to fetch `{}`", url))
         })?;
-
-        // git reset --hard origin/master
-        let reference = "refs/remotes/origin/master";
-        let oid = repo.refname_to_id(reference)?;
-        trace!("[{}] updating to rev {}", self.source_id, oid);
-        let object = repo.find_object(oid, None)?;
-        repo.reset(&object, git2::ResetType::Hard, None)?;
+        self.head.set(None);
+        *self.tree.borrow_mut() = None;
         Ok(())
     }
 
@@ -144,17 +208,11 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
             .push(&pkg.version().to_string())
             .push("download");
 
-        let handle = match self.handle {
-            Some(ref mut handle) => handle,
-            None => {
-                self.handle = Some(ops::http_handle(self.config)?);
-                self.handle.as_mut().unwrap()
-            }
-        };
         // TODO: don't download into memory, but ensure that if we ctrl-c a
         //       download we should resume either from the start or the middle
         //       on the next time
         let url = url.to_string();
+        let mut handle = self.easy()?.borrow_mut();
         handle.get(true)?;
         handle.url(&url)?;
         handle.follow_location(true)?;
@@ -192,6 +250,13 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
     }
 }
 
+impl<'cfg> Drop for RemoteRegistry<'cfg> {
+    fn drop(&mut self) {
+        // Just be sure to drop this before our other fields
+        self.tree.borrow_mut().take();
+    }
+}
+
 /// Updating the index is done pretty regularly so we want it to be as fast as
 /// possible. For registries hosted on github (like the crates.io index) there's
 /// a fast path available to use [1] to tell us that there's no updates to be