]> git.proxmox.com Git - rustc.git/blame - src/tools/linkchecker/main.rs
New upstream version 1.48.0+dfsg1
[rustc.git] / src / tools / linkchecker / main.rs
CommitLineData
54a0048b
SL
1//! Script to check the validity of `href` links in our HTML documentation.
2//!
3//! In the past we've been quite prone to writing broken links, as most of
4//! them are added manually rather than automatically. As files move over
5//! time or APIs change, old links become stale or broken. The purpose of this
6//! script is to check all relative links in our documentation to make sure they
7//! actually point to a valid place.
8//!
9//! Currently this doesn't actually do any HTML parsing or anything fancy like
10//! that, it just has a simple "regex" to search for `href` and `id` attributes.
11//! These values are then translated to file URLs if possible and then the
12//! destination is asserted to exist.
13//!
f035d41b
XL
14//! A few exceptions are allowed as there are known bugs in rustdoc, but this
15//! should catch the majority of "broken link" cases.
e1599b0c 16
a1dfa0c6
XL
17use std::collections::hash_map::Entry;
18use std::collections::{HashMap, HashSet};
54a0048b 19use std::env;
a1dfa0c6 20use std::fs;
dfeec247 21use std::path::{Component, Path, PathBuf};
a1dfa0c6 22use std::rc::Rc;
54a0048b 23
9fa01778 24use crate::Redirect::*;
54a0048b 25
3dfed10e
XL
26// Add linkcheck exceptions here
27// If at all possible you should use intra-doc links to avoid linkcheck issues. These
28// are cases where that does not work
29// [(generated_documentation_page, &[broken_links])]
30const LINKCHECK_EXCEPTIONS: &[(&str, &[&str])] = &[
31 // These are methods on slice, and `Self` does not work on primitive impls
32 // in intra-doc links (primitive impls are weird)
33 // https://github.com/rust-lang/rust/issues/62834 is necessary to be
34 // able to link to slices
35 (
36 "std/io/struct.IoSlice.html",
37 &[
38 "#method.as_mut_ptr",
39 "#method.sort_by_key",
40 "#method.make_ascii_uppercase",
41 "#method.make_ascii_lowercase",
1b1a35ee 42 "#method.get_unchecked_mut",
3dfed10e
XL
43 ],
44 ),
45 // These try to link to std::collections, but are defined in alloc
46 // https://github.com/rust-lang/rust/issues/74481
47 ("std/collections/btree_map/struct.BTreeMap.html", &["#insert-and-complex-keys"]),
48 ("std/collections/btree_set/struct.BTreeSet.html", &["#insert-and-complex-keys"]),
49 ("alloc/collections/btree_map/struct.BTreeMap.html", &["#insert-and-complex-keys"]),
50 ("alloc/collections/btree_set/struct.BTreeSet.html", &["#insert-and-complex-keys"]),
51];
52
54a0048b 53macro_rules! t {
dfeec247
XL
54 ($e:expr) => {
55 match $e {
56 Ok(e) => e,
57 Err(e) => panic!("{} failed with {:?}", stringify!($e), e),
58 }
59 };
54a0048b
SL
60}
61
62fn main() {
7cac9316 63 let docs = env::args_os().nth(1).unwrap();
54a0048b 64 let docs = env::current_dir().unwrap().join(docs);
54a0048b 65 let mut errors = false;
476ff2be 66 walk(&mut HashMap::new(), &docs, &docs, &mut errors);
54a0048b
SL
67 if errors {
68 panic!("found some broken links");
69 }
70}
71
72#[derive(Debug)]
73pub enum LoadError {
74 IOError(std::io::Error),
75 BrokenRedirect(PathBuf, std::io::Error),
76 IsRedirect,
77}
78
79enum Redirect {
80 SkipRedirect,
81 FromRedirect(bool),
82}
83
84struct FileEntry {
a1dfa0c6 85 source: Rc<String>,
54a0048b
SL
86 ids: HashSet<String>,
87}
88
89type Cache = HashMap<PathBuf, FileEntry>;
90
abe05a73
XL
91fn small_url_encode(s: &str) -> String {
92 s.replace("<", "%3C")
dfeec247
XL
93 .replace(">", "%3E")
94 .replace(" ", "%20")
95 .replace("?", "%3F")
96 .replace("'", "%27")
97 .replace("&", "%26")
98 .replace(",", "%2C")
99 .replace(":", "%3A")
100 .replace(";", "%3B")
101 .replace("[", "%5B")
102 .replace("]", "%5D")
103 .replace("\"", "%22")
abe05a73
XL
104}
105
54a0048b 106impl FileEntry {
3157f602 107 fn parse_ids(&mut self, file: &Path, contents: &str, errors: &mut bool) {
54a0048b 108 if self.ids.is_empty() {
7cac9316 109 with_attrs_in_source(contents, " id", |fragment, i, _| {
9fa01778 110 let frag = fragment.trim_start_matches("#").to_owned();
abe05a73 111 let encoded = small_url_encode(&frag);
54a0048b
SL
112 if !self.ids.insert(frag) {
113 *errors = true;
3157f602 114 println!("{}:{}: id is not unique: `{}`", file.display(), i, fragment);
54a0048b 115 }
abe05a73
XL
116 // Just in case, we also add the encoded id.
117 self.ids.insert(encoded);
54a0048b
SL
118 });
119 }
120 }
121}
122
476ff2be 123fn walk(cache: &mut Cache, root: &Path, dir: &Path, errors: &mut bool) {
54a0048b
SL
124 for entry in t!(dir.read_dir()).map(|e| t!(e)) {
125 let path = entry.path();
126 let kind = t!(entry.file_type());
54a0048b 127 if kind.is_dir() {
476ff2be 128 walk(cache, root, &path, errors);
54a0048b 129 } else {
476ff2be 130 let pretty_path = check(cache, root, &path, errors);
54a0048b
SL
131 if let Some(pretty_path) = pretty_path {
132 let entry = cache.get_mut(&pretty_path).unwrap();
133 // we don't need the source anymore,
a7813a04 134 // so drop to reduce memory-usage
a1dfa0c6 135 entry.source = Rc::new(String::new());
54a0048b
SL
136 }
137 }
54a0048b
SL
138 }
139}
140
3dfed10e
XL
141fn is_exception(file: &Path, link: &str) -> bool {
142 if let Some(entry) = LINKCHECK_EXCEPTIONS.iter().find(|&(f, _)| file.ends_with(f)) {
143 entry.1.contains(&link)
144 } else {
6c58768f
XL
145 // FIXME(#63351): Concat trait in alloc/slice reexported in primitive page
146 //
147 // NOTE: This cannot be added to `LINKCHECK_EXCEPTIONS` because the resolved path
148 // calculated in `check` function is outside `build/<triple>/doc` dir.
149 // So the `strip_prefix` method just returns the old absolute broken path.
150 if file.ends_with("std/primitive.slice.html") {
151 if link.ends_with("primitive.slice.html") {
152 return true;
153 }
154 }
3dfed10e
XL
155 false
156 }
157}
158
dfeec247 159fn check(cache: &mut Cache, root: &Path, file: &Path, errors: &mut bool) -> Option<PathBuf> {
f9f354fc 160 // Ignore non-HTML files.
7cac9316 161 if file.extension().and_then(|s| s.to_str()) != Some("html") {
cc61c64b
XL
162 return None;
163 }
164
7cac9316 165 let res = load_file(cache, root, file, SkipRedirect);
54a0048b
SL
166 let (pretty_file, contents) = match res {
167 Ok(res) => res,
168 Err(_) => return None,
169 };
170 {
dfeec247 171 cache.get_mut(&pretty_file).unwrap().parse_ids(&pretty_file, &contents, errors);
54a0048b
SL
172 }
173
174 // Search for anything that's the regex 'href[ ]*=[ ]*".*?"'
7cac9316 175 with_attrs_in_source(&contents, " href", |url, i, base| {
3157f602 176 // Ignore external URLs
dfeec247
XL
177 if url.starts_with("http:")
178 || url.starts_with("https:")
179 || url.starts_with("javascript:")
180 || url.starts_with("ftp:")
181 || url.starts_with("irc:")
182 || url.starts_with("data:")
183 {
3157f602
XL
184 return;
185 }
3dfed10e 186 let mut parts = url.splitn(2, '#');
476ff2be 187 let url = parts.next().unwrap();
476ff2be 188 let fragment = parts.next();
3dfed10e 189 let mut parts = url.splitn(2, '?');
476ff2be
SL
190 let url = parts.next().unwrap();
191
54a0048b 192 // Once we've plucked out the URL, parse it using our base url and
3157f602 193 // then try to extract a file path.
476ff2be 194 let mut path = file.to_path_buf();
7cac9316 195 if !base.is_empty() || !url.is_empty() {
32a655c1 196 path.pop();
7cac9316 197 for part in Path::new(base).join(url).components() {
32a655c1 198 match part {
dfeec247 199 Component::Prefix(_) | Component::RootDir => {
2c00a5a8
XL
200 // Avoid absolute paths as they make the docs not
201 // relocatable by making assumptions on where the docs
202 // are hosted relative to the site root.
203 *errors = true;
dfeec247
XL
204 println!(
205 "{}:{}: absolute path - {}",
206 pretty_file.display(),
207 i + 1,
208 Path::new(base).join(url).display()
209 );
2c00a5a8
XL
210 return;
211 }
32a655c1 212 Component::CurDir => {}
dfeec247
XL
213 Component::ParentDir => {
214 path.pop();
215 }
216 Component::Normal(s) => {
217 path.push(s);
218 }
32a655c1 219 }
3157f602 220 }
476ff2be 221 }
54a0048b
SL
222
223 // Alright, if we've found a file name then this file had better
224 // exist! If it doesn't then we register and print an error.
225 if path.exists() {
226 if path.is_dir() {
3157f602
XL
227 // Links to directories show as directory listings when viewing
228 // the docs offline so it's best to avoid them.
229 *errors = true;
230 let pretty_path = path.strip_prefix(root).unwrap_or(&path);
dfeec247
XL
231 println!(
232 "{}:{}: directory link - {}",
233 pretty_file.display(),
234 i + 1,
235 pretty_path.display()
236 );
54a0048b
SL
237 return;
238 }
7cac9316
XL
239 if let Some(extension) = path.extension() {
240 // Ignore none HTML files.
241 if extension != "html" {
242 return;
243 }
244 }
245 let res = load_file(cache, root, &path, FromRedirect(false));
54a0048b
SL
246 let (pretty_path, contents) = match res {
247 Ok(res) => res,
8bb4bdeb 248 Err(LoadError::IOError(err)) => {
7cac9316 249 panic!("error loading {}: {}", path.display(), err);
8bb4bdeb 250 }
54a0048b 251 Err(LoadError::BrokenRedirect(target, _)) => {
3157f602 252 *errors = true;
dfeec247
XL
253 println!(
254 "{}:{}: broken redirect to {}",
255 pretty_file.display(),
256 i + 1,
257 target.display()
258 );
54a0048b
SL
259 return;
260 }
261 Err(LoadError::IsRedirect) => unreachable!(),
262 };
263
476ff2be 264 if let Some(ref fragment) = fragment {
54a0048b
SL
265 // Fragments like `#1-6` are most likely line numbers to be
266 // interpreted by javascript, so we're ignoring these
dfeec247 267 if fragment.splitn(2, '-').all(|f| f.chars().all(|c| c.is_numeric())) {
54a0048b
SL
268 return;
269 }
270
83c7162d 271 // These appear to be broken in mdbook right now?
3dfed10e 272 if fragment.starts_with('-') {
83c7162d
XL
273 return;
274 }
275
54a0048b
SL
276 let entry = &mut cache.get_mut(&pretty_path).unwrap();
277 entry.parse_ids(&pretty_path, &contents, errors);
278
3dfed10e
XL
279 if !entry.ids.contains(*fragment) && !is_exception(file, &format!("#{}", fragment))
280 {
54a0048b 281 *errors = true;
dfeec247 282 print!("{}:{}: broken link fragment ", pretty_file.display(), i + 1);
3157f602 283 println!("`#{}` pointing to `{}`", fragment, pretty_path.display());
54a0048b
SL
284 };
285 }
286 } else {
54a0048b 287 let pretty_path = path.strip_prefix(root).unwrap_or(&path);
3dfed10e
XL
288 if !is_exception(file, pretty_path.to_str().unwrap()) {
289 *errors = true;
290 print!("{}:{}: broken link - ", pretty_file.display(), i + 1);
291 println!("{}", pretty_path.display());
292 }
54a0048b
SL
293 }
294 });
295 Some(pretty_file)
296}
297
dfeec247
XL
298fn load_file(
299 cache: &mut Cache,
300 root: &Path,
301 file: &Path,
302 redirect: Redirect,
303) -> Result<(PathBuf, Rc<String>), LoadError> {
54a0048b
SL
304 let pretty_file = PathBuf::from(file.strip_prefix(root).unwrap_or(&file));
305
a1dfa0c6 306 let (maybe_redirect, contents) = match cache.entry(pretty_file.clone()) {
dfeec247 307 Entry::Occupied(entry) => (None, entry.get().source.clone()),
54a0048b 308 Entry::Vacant(entry) => {
a1dfa0c6
XL
309 let contents = match fs::read_to_string(file) {
310 Ok(s) => Rc::new(s),
311 Err(err) => {
312 return Err(if let FromRedirect(true) = redirect {
313 LoadError::BrokenRedirect(file.to_path_buf(), err)
314 } else {
315 LoadError::IOError(err)
dfeec247 316 });
54a0048b 317 }
a1dfa0c6 318 };
54a0048b
SL
319
320 let maybe = maybe_redirect(&contents);
321 if maybe.is_some() {
322 if let SkipRedirect = redirect {
323 return Err(LoadError::IsRedirect);
324 }
325 } else {
dfeec247 326 entry.insert(FileEntry { source: contents.clone(), ids: HashSet::new() });
54a0048b 327 }
a1dfa0c6 328 (maybe, contents)
3157f602 329 }
54a0048b 330 };
7cac9316 331 match maybe_redirect.map(|url| file.parent().unwrap().join(url)) {
dfeec247 332 Some(redirect_file) => load_file(cache, root, &redirect_file, FromRedirect(true)),
3157f602 333 None => Ok((pretty_file, contents)),
54a0048b
SL
334 }
335}
336
337fn maybe_redirect(source: &str) -> Option<String> {
3dfed10e 338 const REDIRECT: &str = "<p>Redirecting to <a href=";
54a0048b
SL
339
340 let mut lines = source.lines();
0731742a 341 let redirect_line = lines.nth(6)?;
54a0048b
SL
342
343 redirect_line.find(REDIRECT).map(|i| {
344 let rest = &redirect_line[(i + REDIRECT.len() + 1)..];
345 let pos_quote = rest.find('"').unwrap();
346 rest[..pos_quote].to_owned()
347 })
348}
349
7cac9316
XL
350fn with_attrs_in_source<F: FnMut(&str, usize, &str)>(contents: &str, attr: &str, mut f: F) {
351 let mut base = "";
54a0048b
SL
352 for (i, mut line) in contents.lines().enumerate() {
353 while let Some(j) = line.find(attr) {
3157f602 354 let rest = &line[j + attr.len()..];
7cac9316
XL
355 // The base tag should always be the first link in the document so
356 // we can get away with using one pass.
357 let is_base = line[..j].ends_with("<base");
54a0048b 358 line = rest;
3dfed10e 359 let pos_equals = match rest.find('=') {
54a0048b
SL
360 Some(i) => i,
361 None => continue,
362 };
3dfed10e 363 if rest[..pos_equals].trim_start_matches(' ') != "" {
3157f602 364 continue;
54a0048b
SL
365 }
366
367 let rest = &rest[pos_equals + 1..];
368
369 let pos_quote = match rest.find(&['"', '\''][..]) {
370 Some(i) => i,
371 None => continue,
372 };
373 let quote_delim = rest.as_bytes()[pos_quote] as char;
374
3dfed10e 375 if rest[..pos_quote].trim_start_matches(' ') != "" {
3157f602 376 continue;
54a0048b
SL
377 }
378 let rest = &rest[pos_quote + 1..];
379 let url = match rest.find(quote_delim) {
380 Some(i) => &rest[..i],
381 None => continue,
382 };
7cac9316
XL
383 if is_base {
384 base = url;
385 continue;
386 }
387 f(url, i, base)
54a0048b
SL
388 }
389 }
390}