// src/tools/linkchecker/main.rs (rustc 1.48.0-beta.8)
//! Script to check the validity of `href` links in our HTML documentation.
//!
//! In the past we've been quite error prone to writing in broken links as most
//! of them are manually rather than automatically added. As files move over
//! time or apis change old links become stale or broken. The purpose of this
//! script is to check all relative links in our documentation to make sure they
//! actually point to a valid place.
//!
//! Currently this doesn't actually do any HTML parsing or anything fancy like
//! that, it just has a simple "regex" to search for `href` and `id` tags.
//! These values are then translated to file URLs if possible and then the
//! destination is asserted to exist.
//!
//! A few exceptions are allowed, as there are known bugs in rustdoc, but this
//! should catch the majority of "broken link" cases.

use std::collections::hash_map::Entry;
use std::collections::{HashMap, HashSet};
use std::env;
use std::fs;
use std::path::{Component, Path, PathBuf};
use std::rc::Rc;

use crate::Redirect::*;
25
// Add linkcheck exceptions here
// If at all possible you should use intra-doc links to avoid linkcheck issues. These
// are cases where that does not work
// [(generated_documentation_page, &[broken_links])]
// Pages are matched by path suffix via `Path::ends_with` in `is_exception`.
const LINKCHECK_EXCEPTIONS: &[(&str, &[&str])] = &[
    // These are methods on slice, and `Self` does not work on primitive impls
    // in intra-doc links (primitive impls are weird)
    // https://github.com/rust-lang/rust/issues/62834 is necessary to be
    // able to link to slices
    (
        "std/io/struct.IoSlice.html",
        &[
            "#method.as_mut_ptr",
            "#method.sort_by_key",
            "#method.make_ascii_uppercase",
            "#method.make_ascii_lowercase",
            "#method.get_unchecked_mut",
        ],
    ),
    // These try to link to std::collections, but are defined in alloc
    // https://github.com/rust-lang/rust/issues/74481
    ("std/collections/btree_map/struct.BTreeMap.html", &["#insert-and-complex-keys"]),
    ("std/collections/btree_set/struct.BTreeSet.html", &["#insert-and-complex-keys"]),
    ("alloc/collections/btree_map/struct.BTreeMap.html", &["#insert-and-complex-keys"]),
    ("alloc/collections/btree_set/struct.BTreeSet.html", &["#insert-and-complex-keys"]),
];
52
/// Unwraps a `Result`, panicking with the stringified expression and the
/// error value on failure. Used for filesystem operations where any error
/// is fatal to the link check.
macro_rules! t {
    ($e:expr) => {
        match $e {
            Ok(e) => e,
            Err(e) => panic!("{} failed with {:?}", stringify!($e), e),
        }
    };
}
61
62 fn main() {
63 let docs = env::args_os().nth(1).unwrap();
64 let docs = env::current_dir().unwrap().join(docs);
65 let mut errors = false;
66 walk(&mut HashMap::new(), &docs, &docs, &mut errors);
67 if errors {
68 panic!("found some broken links");
69 }
70 }
71
/// Failure modes when reading a documentation file in `load_file`.
#[derive(Debug)]
pub enum LoadError {
    /// Underlying filesystem error while reading the file itself.
    IOError(std::io::Error),
    /// A redirect page pointed at the contained path, which could not be read.
    BrokenRedirect(PathBuf, std::io::Error),
    /// The file is a redirect page and the caller asked to skip those.
    IsRedirect,
}
78
/// Controls how `load_file` treats rustdoc-generated redirect pages.
enum Redirect {
    /// Fail with `LoadError::IsRedirect` instead of following a redirect.
    SkipRedirect,
    /// Follow redirects; the flag records whether the current file was
    /// itself reached via a redirect (used to report `BrokenRedirect`).
    FromRedirect(bool),
}
83
/// Cached per-file state: the raw HTML source and the set of `id`
/// attributes found in it (filled lazily by `FileEntry::parse_ids`).
struct FileEntry {
    // Rc so the source can be shared while cached, then dropped (replaced
    // by an empty string) once the file has been fully checked.
    source: Rc<String>,
    ids: HashSet<String>,
}
88
/// Map from pretty (root-relative) path to its cached file entry.
type Cache = HashMap<PathBuf, FileEntry>;
90
/// Percent-encodes the characters rustdoc is known to emit in generated
/// fragment ids (`<`, `>`, space, quotes, brackets, ...), so both the raw
/// and the encoded form of an id can be looked up.
///
/// `%` itself is intentionally not escaped, and none of the replacement
/// sequences contain characters from the escape set, so this single pass
/// produces exactly the same output as chaining twelve `str::replace`
/// calls — while allocating only once instead of twelve times per call.
fn small_url_encode(s: &str) -> String {
    let mut encoded = String::with_capacity(s.len());
    for c in s.chars() {
        match c {
            '<' => encoded.push_str("%3C"),
            '>' => encoded.push_str("%3E"),
            ' ' => encoded.push_str("%20"),
            '?' => encoded.push_str("%3F"),
            '\'' => encoded.push_str("%27"),
            '&' => encoded.push_str("%26"),
            ',' => encoded.push_str("%2C"),
            ':' => encoded.push_str("%3A"),
            ';' => encoded.push_str("%3B"),
            '[' => encoded.push_str("%5B"),
            ']' => encoded.push_str("%5D"),
            '"' => encoded.push_str("%22"),
            _ => encoded.push(c),
        }
    }
    encoded
}
105
106 impl FileEntry {
107 fn parse_ids(&mut self, file: &Path, contents: &str, errors: &mut bool) {
108 if self.ids.is_empty() {
109 with_attrs_in_source(contents, " id", |fragment, i, _| {
110 let frag = fragment.trim_start_matches("#").to_owned();
111 let encoded = small_url_encode(&frag);
112 if !self.ids.insert(frag) {
113 *errors = true;
114 println!("{}:{}: id is not unique: `{}`", file.display(), i, fragment);
115 }
116 // Just in case, we also add the encoded id.
117 self.ids.insert(encoded);
118 });
119 }
120 }
121 }
122
123 fn walk(cache: &mut Cache, root: &Path, dir: &Path, errors: &mut bool) {
124 for entry in t!(dir.read_dir()).map(|e| t!(e)) {
125 let path = entry.path();
126 let kind = t!(entry.file_type());
127 if kind.is_dir() {
128 walk(cache, root, &path, errors);
129 } else {
130 let pretty_path = check(cache, root, &path, errors);
131 if let Some(pretty_path) = pretty_path {
132 let entry = cache.get_mut(&pretty_path).unwrap();
133 // we don't need the source anymore,
134 // so drop to reduce memory-usage
135 entry.source = Rc::new(String::new());
136 }
137 }
138 }
139 }
140
141 fn is_exception(file: &Path, link: &str) -> bool {
142 if let Some(entry) = LINKCHECK_EXCEPTIONS.iter().find(|&(f, _)| file.ends_with(f)) {
143 entry.1.contains(&link)
144 } else {
145 // FIXME(#63351): Concat trait in alloc/slice reexported in primitive page
146 //
147 // NOTE: This cannot be added to `LINKCHECK_EXCEPTIONS` because the resolved path
148 // calculated in `check` function is outside `build/<triple>/doc` dir.
149 // So the `strip_prefix` method just returns the old absolute broken path.
150 if file.ends_with("std/primitive.slice.html") {
151 if link.ends_with("primitive.slice.html") {
152 return true;
153 }
154 }
155 false
156 }
157 }
158
/// Checks every relative `href` link in `file`, recording any problems in
/// `errors`. Returns the pretty (root-relative) path of the file when it
/// was an HTML file that got checked, so the caller can release its cached
/// source text; returns `None` for non-HTML files and redirect pages.
fn check(cache: &mut Cache, root: &Path, file: &Path, errors: &mut bool) -> Option<PathBuf> {
    // Ignore non-HTML files.
    if file.extension().and_then(|s| s.to_str()) != Some("html") {
        return None;
    }

    let res = load_file(cache, root, file, SkipRedirect);
    let (pretty_file, contents) = match res {
        Ok(res) => res,
        // Redirect pages (and unreadable files) are not checked directly;
        // redirect targets get validated when something links to them.
        Err(_) => return None,
    };
    {
        // Collect this file's own `id`s so fragment links into it resolve.
        cache.get_mut(&pretty_file).unwrap().parse_ids(&pretty_file, &contents, errors);
    }

    // Search for anything that's the regex 'href[ ]*=[ ]*".*?"'
    with_attrs_in_source(&contents, " href", |url, i, base| {
        // Ignore external URLs
        if url.starts_with("http:")
            || url.starts_with("https:")
            || url.starts_with("javascript:")
            || url.starts_with("ftp:")
            || url.starts_with("irc:")
            || url.starts_with("data:")
        {
            return;
        }
        // Split off `#fragment`, then `?query` — only the path component is
        // resolved against the filesystem.
        let mut parts = url.splitn(2, '#');
        let url = parts.next().unwrap();
        let fragment = parts.next();
        let mut parts = url.splitn(2, '?');
        let url = parts.next().unwrap();

        // Once we've plucked out the URL, parse it using our base url and
        // then try to extract a file path.
        let mut path = file.to_path_buf();
        if !base.is_empty() || !url.is_empty() {
            path.pop();
            for part in Path::new(base).join(url).components() {
                match part {
                    Component::Prefix(_) | Component::RootDir => {
                        // Avoid absolute paths as they make the docs not
                        // relocatable by making assumptions on where the docs
                        // are hosted relative to the site root.
                        *errors = true;
                        println!(
                            "{}:{}: absolute path - {}",
                            pretty_file.display(),
                            i + 1,
                            Path::new(base).join(url).display()
                        );
                        return;
                    }
                    Component::CurDir => {}
                    Component::ParentDir => {
                        path.pop();
                    }
                    Component::Normal(s) => {
                        path.push(s);
                    }
                }
            }
        }

        // Alright, if we've found a file name then this file had better
        // exist! If it doesn't then we register and print an error.
        if path.exists() {
            if path.is_dir() {
                // Links to directories show as directory listings when viewing
                // the docs offline so it's best to avoid them.
                *errors = true;
                let pretty_path = path.strip_prefix(root).unwrap_or(&path);
                println!(
                    "{}:{}: directory link - {}",
                    pretty_file.display(),
                    i + 1,
                    pretty_path.display()
                );
                return;
            }
            if let Some(extension) = path.extension() {
                // Ignore non-HTML files.
                if extension != "html" {
                    return;
                }
            }
            // Load the link target (following redirects) so any fragment can
            // be verified against the target's collected ids.
            let res = load_file(cache, root, &path, FromRedirect(false));
            let (pretty_path, contents) = match res {
                Ok(res) => res,
                Err(LoadError::IOError(err)) => {
                    panic!("error loading {}: {}", path.display(), err);
                }
                Err(LoadError::BrokenRedirect(target, _)) => {
                    *errors = true;
                    println!(
                        "{}:{}: broken redirect to {}",
                        pretty_file.display(),
                        i + 1,
                        target.display()
                    );
                    return;
                }
                // `FromRedirect` never produces `IsRedirect`.
                Err(LoadError::IsRedirect) => unreachable!(),
            };

            if let Some(ref fragment) = fragment {
                // Fragments like `#1-6` are most likely line numbers to be
                // interpreted by javascript, so we're ignoring these
                if fragment.splitn(2, '-').all(|f| f.chars().all(|c| c.is_numeric())) {
                    return;
                }

                // These appear to be broken in mdbook right now?
                if fragment.starts_with('-') {
                    return;
                }

                let entry = &mut cache.get_mut(&pretty_path).unwrap();
                entry.parse_ids(&pretty_path, &contents, errors);

                if !entry.ids.contains(*fragment) && !is_exception(file, &format!("#{}", fragment))
                {
                    *errors = true;
                    print!("{}:{}: broken link fragment ", pretty_file.display(), i + 1);
                    println!("`#{}` pointing to `{}`", fragment, pretty_path.display());
                };
            }
        } else {
            let pretty_path = path.strip_prefix(root).unwrap_or(&path);
            if !is_exception(file, pretty_path.to_str().unwrap()) {
                *errors = true;
                print!("{}:{}: broken link - ", pretty_file.display(), i + 1);
                println!("{}", pretty_path.display());
            }
        }
    });
    Some(pretty_file)
}
297
/// Loads `file` through the cache and returns its pretty (root-relative)
/// path together with its contents. Redirect pages are handled according
/// to `redirect`: rejected (`SkipRedirect`) or followed recursively
/// (`FromRedirect`), in which case the returned path/contents are those of
/// the final target.
///
/// # Errors
/// - `IsRedirect` when the file is a redirect page and `redirect` is
///   `SkipRedirect`.
/// - `BrokenRedirect` when the file was reached via a redirect and cannot
///   be read.
/// - `IOError` for any other read failure.
fn load_file(
    cache: &mut Cache,
    root: &Path,
    file: &Path,
    redirect: Redirect,
) -> Result<(PathBuf, Rc<String>), LoadError> {
    // Report paths relative to `root` when possible.
    let pretty_file = PathBuf::from(file.strip_prefix(root).unwrap_or(&file));

    let (maybe_redirect, contents) = match cache.entry(pretty_file.clone()) {
        Entry::Occupied(entry) => (None, entry.get().source.clone()),
        Entry::Vacant(entry) => {
            let contents = match fs::read_to_string(file) {
                Ok(s) => Rc::new(s),
                Err(err) => {
                    // Distinguish "redirect target missing" from a plain
                    // read failure so the caller can report it usefully.
                    return Err(if let FromRedirect(true) = redirect {
                        LoadError::BrokenRedirect(file.to_path_buf(), err)
                    } else {
                        LoadError::IOError(err)
                    });
                }
            };

            let maybe = maybe_redirect(&contents);
            if maybe.is_some() {
                if let SkipRedirect = redirect {
                    return Err(LoadError::IsRedirect);
                }
            } else {
                // Only real pages are cached; redirect stubs are not.
                entry.insert(FileEntry { source: contents.clone(), ids: HashSet::new() });
            }
            (maybe, contents)
        }
    };
    // Resolve a redirect target relative to the current file's directory.
    match maybe_redirect.map(|url| file.parent().unwrap().join(url)) {
        Some(redirect_file) => load_file(cache, root, &redirect_file, FromRedirect(true)),
        None => Ok((pretty_file, contents)),
    }
}
336
/// If `source` is a rustdoc-generated redirect page, returns the redirect
/// target URL, otherwise `None`.
///
/// Rustdoc emits the `<p>Redirecting to <a href="...">` marker on the 7th
/// line of redirect stubs, so only that line is inspected.
///
/// Unlike the previous version, a marker line that is truncated or missing
/// its closing quote yields `None` instead of panicking (`unwrap` on
/// `find('"')` and an unchecked slice past the marker could both panic on
/// malformed input).
fn maybe_redirect(source: &str) -> Option<String> {
    const REDIRECT: &str = "<p>Redirecting to <a href=";

    // Line index 6 == the 7th line of the generated page.
    let redirect_line = source.lines().nth(6)?;
    let i = redirect_line.find(REDIRECT)?;
    // `+ 1` skips the opening quote after `href=`; `get` returns None if
    // the line ends before that (or splits a char boundary).
    let rest = redirect_line.get(i + REDIRECT.len() + 1..)?;
    let pos_quote = rest.find('"')?;
    Some(rest[..pos_quote].to_owned())
}
349
/// Scans `contents` line by line for occurrences of `attr` (e.g. `" href"`
/// or `" id"`) followed by `=` and a quoted value, invoking `f` with the
/// quoted value, the zero-based line number, and the current `<base>` URL
/// (empty string when no base tag was seen yet).
///
/// This is a deliberately cheap textual scan, not real HTML parsing;
/// rustdoc's output is regular enough for it to work.
fn with_attrs_in_source<F: FnMut(&str, usize, &str)>(contents: &str, attr: &str, mut f: F) {
    // URL taken from a `<base href=...>` tag, applied to all later links.
    let mut base = "";
    for (lineno, mut remaining) in contents.lines().enumerate() {
        while let Some(pos) = remaining.find(attr) {
            let after_attr = &remaining[pos + attr.len()..];
            // The base tag should always be the first link in the document,
            // so a single forward pass is enough to pick it up.
            let is_base = remaining[..pos].ends_with("<base");
            remaining = after_attr;
            let eq = match after_attr.find('=') {
                Some(i) => i,
                None => continue,
            };
            // Only spaces may separate the attribute name from `=`.
            if after_attr[..eq].trim_start_matches(' ') != "" {
                continue;
            }
            let after_eq = &after_attr[eq + 1..];
            let quote = match after_eq.find(&['"', '\''][..]) {
                Some(i) => i,
                None => continue,
            };
            let delim = after_eq.as_bytes()[quote] as char;
            // Only spaces may separate `=` from the opening quote.
            if after_eq[..quote].trim_start_matches(' ') != "" {
                continue;
            }
            let value = &after_eq[quote + 1..];
            // The value runs until the matching quote character.
            let url = match value.find(delim) {
                Some(i) => &value[..i],
                None => continue,
            };
            if is_base {
                base = url;
                continue;
            }
            f(url, lineno, base)
        }
    }
}