]>
Commit | Line | Data |
---|---|---|
54a0048b SL |
1 | //! Script to check the validity of `href` links in our HTML documentation. |
2 | //! | |
3 | //! In the past we've been quite prone to writing broken links, as most | |
4 | //! of them are manually rather than automatically added. As files move over | |
5 | //! time or apis change old links become stale or broken. The purpose of this | |
6 | //! script is to check all relative links in our documentation to make sure they | |
7 | //! actually point to a valid place. | |
8 | //! | |
9 | //! Currently this doesn't actually do any HTML parsing or anything fancy like | |
10 | //! that, it just has a simple "regex" to search for `href` and `id` attributes. | |
11 | //! These values are then translated to file URLs if possible and then the | |
12 | //! destination is asserted to exist. | |
13 | //! | |
f035d41b XL |
14 | //! A few exceptions are allowed as there's known bugs in rustdoc, but this |
15 | //! should catch the majority of "broken link" cases. | |
e1599b0c | 16 | |
a1dfa0c6 XL |
17 | use std::collections::hash_map::Entry; |
18 | use std::collections::{HashMap, HashSet}; | |
54a0048b | 19 | use std::env; |
a1dfa0c6 | 20 | use std::fs; |
dfeec247 | 21 | use std::path::{Component, Path, PathBuf}; |
a1dfa0c6 | 22 | use std::rc::Rc; |
54a0048b | 23 | |
9fa01778 | 24 | use crate::Redirect::*; |
54a0048b | 25 | |
3dfed10e XL |
26 | // Add linkcheck exceptions here |
27 | // If at all possible you should use intra-doc links to avoid linkcheck issues. These | |
28 | // are cases where that does not work | |
29 | // [(generated_documentation_page, &[broken_links])] | |
// `is_exception` selects the first entry whose page is a suffix of the checked file
// (`Path::ends_with`) and then requires the link to equal one of the listed strings exactly.
30 | const LINKCHECK_EXCEPTIONS: &[(&str, &[&str])] = &[ | |
31 | // These are methods on slice, and `Self` does not work on primitive impls | |
32 | // in intra-doc links (primitive impls are weird) | |
33 | // https://github.com/rust-lang/rust/issues/62834 is necessary to be | |
34 | // able to link to slices | |
35 | ( | |
36 | "std/io/struct.IoSlice.html", | |
37 | &[ | |
38 | "#method.as_mut_ptr", | |
39 | "#method.sort_by_key", | |
40 | "#method.make_ascii_uppercase", | |
41 | "#method.make_ascii_lowercase", | |
1b1a35ee | 42 | "#method.get_unchecked_mut", |
3dfed10e XL |
43 | ], |
44 | ), | |
45 | // These try to link to std::collections, but are defined in alloc | |
46 | // https://github.com/rust-lang/rust/issues/74481 | |
47 | ("std/collections/btree_map/struct.BTreeMap.html", &["#insert-and-complex-keys"]), | |
48 | ("std/collections/btree_set/struct.BTreeSet.html", &["#insert-and-complex-keys"]), | |
49 | ("alloc/collections/btree_map/struct.BTreeMap.html", &["#insert-and-complex-keys"]), | |
50 | ("alloc/collections/btree_set/struct.BTreeSet.html", &["#insert-and-complex-keys"]), | |
51 | ]; | |
52 | ||
/// Unwraps a `Result`, yielding the `Ok` value.
///
/// On `Err` it panics with a message containing the source text of the
/// expression that failed and the `Debug` rendering of the error.
macro_rules! t {
    ($e:expr) => {
        match $e {
            Err(error) => panic!("{} failed with {:?}", stringify!($e), error),
            Ok(value) => value,
        }
    };
}
61 | ||
62 | fn main() { | |
7cac9316 | 63 | let docs = env::args_os().nth(1).unwrap(); |
54a0048b | 64 | let docs = env::current_dir().unwrap().join(docs); |
54a0048b | 65 | let mut errors = false; |
476ff2be | 66 | walk(&mut HashMap::new(), &docs, &docs, &mut errors); |
54a0048b SL |
67 | if errors { |
68 | panic!("found some broken links"); | |
69 | } | |
70 | } | |
71 | ||
/// Reasons `load_file` can fail to produce a page's contents.
72 | #[derive(Debug)] | |
73 | pub enum LoadError { | |
/// Reading the file failed and the read was not for a redirect's target.
74 | IOError(std::io::Error), | |
/// Reading a redirect's target failed; carries the target path and the I/O error.
75 | BrokenRedirect(PathBuf, std::io::Error), | |
/// The file is a redirect page but was loaded with `SkipRedirect`.
76 | IsRedirect, | |
77 | } | |
78 | ||
/// Tells `load_file` how to treat redirect pages.
79 | enum Redirect { | |
/// Do not follow redirects: loading a redirect page yields `LoadError::IsRedirect`.
80 | SkipRedirect, | |
/// Follow redirects. `load_file` passes `true` when it recurses into a redirect's
/// target, which turns a read failure into `LoadError::BrokenRedirect`; `check`
/// passes `false` for ordinary link targets.
81 | FromRedirect(bool), | |
82 | } | |
83 | ||
/// Cached state for one loaded HTML file (`load_file` never caches redirect pages).
84 | struct FileEntry { | |
/// File contents; `walk` replaces this with an empty string once the file has been checked.
a1dfa0c6 | 85 | source: Rc<String>, |
54a0048b SL |
/// Values of the ` id` attributes collected by `parse_ids`, plus their
/// `small_url_encode`d forms.
86 | ids: HashSet<String>, |
87 | } | |
88 | ||
/// Loaded files, keyed by their path with the docs root stripped (or by the
/// unmodified path when it does not start with the root), as computed in `load_file`.
89 | type Cache = HashMap<PathBuf, FileEntry>; | |
90 | ||
abe05a73 XL |
/// Percent-encodes the small set of characters this tool cares about:
/// `<`, `>`, space, `?`, `'`, `&`, `,`, `:`, `;`, `[`, `]` and `"`.
///
/// Every other character, including `%`, is copied through unchanged, so this
/// is not a general-purpose URL encoder.
///
/// The string is built in a single pass. None of the escape sequences contain
/// a character that is itself escaped, so the result is the same as applying
/// the replacements one after another.
fn small_url_encode(s: &str) -> String {
    let mut encoded = String::with_capacity(s.len());
    for c in s.chars() {
        match c {
            '<' => encoded.push_str("%3C"),
            '>' => encoded.push_str("%3E"),
            ' ' => encoded.push_str("%20"),
            '?' => encoded.push_str("%3F"),
            '\'' => encoded.push_str("%27"),
            '&' => encoded.push_str("%26"),
            ',' => encoded.push_str("%2C"),
            ':' => encoded.push_str("%3A"),
            ';' => encoded.push_str("%3B"),
            '[' => encoded.push_str("%5B"),
            ']' => encoded.push_str("%5D"),
            '"' => encoded.push_str("%22"),
            other => encoded.push(other),
        }
    }
    encoded
}
105 | ||
54a0048b | 106 | impl FileEntry { |
3157f602 | 107 | fn parse_ids(&mut self, file: &Path, contents: &str, errors: &mut bool) { |
54a0048b | 108 | if self.ids.is_empty() { |
7cac9316 | 109 | with_attrs_in_source(contents, " id", |fragment, i, _| { |
9fa01778 | 110 | let frag = fragment.trim_start_matches("#").to_owned(); |
abe05a73 | 111 | let encoded = small_url_encode(&frag); |
54a0048b SL |
112 | if !self.ids.insert(frag) { |
113 | *errors = true; | |
3157f602 | 114 | println!("{}:{}: id is not unique: `{}`", file.display(), i, fragment); |
54a0048b | 115 | } |
abe05a73 XL |
116 | // Just in case, we also add the encoded id. |
117 | self.ids.insert(encoded); | |
54a0048b SL |
118 | }); |
119 | } | |
120 | } | |
121 | } | |
122 | ||
476ff2be | 123 | fn walk(cache: &mut Cache, root: &Path, dir: &Path, errors: &mut bool) { |
54a0048b SL |
124 | for entry in t!(dir.read_dir()).map(|e| t!(e)) { |
125 | let path = entry.path(); | |
126 | let kind = t!(entry.file_type()); | |
54a0048b | 127 | if kind.is_dir() { |
476ff2be | 128 | walk(cache, root, &path, errors); |
54a0048b | 129 | } else { |
476ff2be | 130 | let pretty_path = check(cache, root, &path, errors); |
54a0048b SL |
131 | if let Some(pretty_path) = pretty_path { |
132 | let entry = cache.get_mut(&pretty_path).unwrap(); | |
133 | // we don't need the source anymore, | |
a7813a04 | 134 | // so drop to reduce memory-usage |
a1dfa0c6 | 135 | entry.source = Rc::new(String::new()); |
54a0048b SL |
136 | } |
137 | } | |
54a0048b SL |
138 | } |
139 | } | |
140 | ||
3dfed10e XL |
141 | fn is_exception(file: &Path, link: &str) -> bool { |
142 | if let Some(entry) = LINKCHECK_EXCEPTIONS.iter().find(|&(f, _)| file.ends_with(f)) { | |
143 | entry.1.contains(&link) | |
144 | } else { | |
6c58768f XL |
145 | // FIXME(#63351): Concat trait in alloc/slice reexported in primitive page |
146 | // | |
147 | // NOTE: This cannot be added to `LINKCHECK_EXCEPTIONS` because the resolved path | |
148 | // calculated in `check` function is outside `build/<triple>/doc` dir. | |
149 | // So the `strip_prefix` method just returns the old absolute broken path. | |
150 | if file.ends_with("std/primitive.slice.html") { | |
151 | if link.ends_with("primitive.slice.html") { | |
152 | return true; | |
153 | } | |
154 | } | |
3dfed10e XL |
155 | false |
156 | } | |
157 | } | |
158 | ||
dfeec247 | 159 | fn check(cache: &mut Cache, root: &Path, file: &Path, errors: &mut bool) -> Option<PathBuf> { |
f9f354fc | 160 | // Ignore non-HTML files. |
7cac9316 | 161 | if file.extension().and_then(|s| s.to_str()) != Some("html") { |
cc61c64b XL |
162 | return None; |
163 | } | |
164 | ||
7cac9316 | 165 | let res = load_file(cache, root, file, SkipRedirect); |
54a0048b SL |
166 | let (pretty_file, contents) = match res { |
167 | Ok(res) => res, | |
168 | Err(_) => return None, | |
169 | }; | |
170 | { | |
dfeec247 | 171 | cache.get_mut(&pretty_file).unwrap().parse_ids(&pretty_file, &contents, errors); |
54a0048b SL |
172 | } |
173 | ||
174 | // Search for anything that's the regex 'href[ ]*=[ ]*".*?"' | |
7cac9316 | 175 | with_attrs_in_source(&contents, " href", |url, i, base| { |
3157f602 | 176 | // Ignore external URLs |
dfeec247 XL |
177 | if url.starts_with("http:") |
178 | || url.starts_with("https:") | |
179 | || url.starts_with("javascript:") | |
180 | || url.starts_with("ftp:") | |
181 | || url.starts_with("irc:") | |
182 | || url.starts_with("data:") | |
183 | { | |
3157f602 XL |
184 | return; |
185 | } | |
3dfed10e | 186 | let mut parts = url.splitn(2, '#'); |
476ff2be | 187 | let url = parts.next().unwrap(); |
476ff2be | 188 | let fragment = parts.next(); |
3dfed10e | 189 | let mut parts = url.splitn(2, '?'); |
476ff2be SL |
190 | let url = parts.next().unwrap(); |
191 | ||
54a0048b | 192 | // Once we've plucked out the URL, parse it using our base url and |
3157f602 | 193 | // then try to extract a file path. |
476ff2be | 194 | let mut path = file.to_path_buf(); |
7cac9316 | 195 | if !base.is_empty() || !url.is_empty() { |
32a655c1 | 196 | path.pop(); |
7cac9316 | 197 | for part in Path::new(base).join(url).components() { |
32a655c1 | 198 | match part { |
dfeec247 | 199 | Component::Prefix(_) | Component::RootDir => { |
2c00a5a8 XL |
200 | // Avoid absolute paths as they make the docs not |
201 | // relocatable by making assumptions on where the docs | |
202 | // are hosted relative to the site root. | |
203 | *errors = true; | |
dfeec247 XL |
204 | println!( |
205 | "{}:{}: absolute path - {}", | |
206 | pretty_file.display(), | |
207 | i + 1, | |
208 | Path::new(base).join(url).display() | |
209 | ); | |
2c00a5a8 XL |
210 | return; |
211 | } | |
32a655c1 | 212 | Component::CurDir => {} |
dfeec247 XL |
213 | Component::ParentDir => { |
214 | path.pop(); | |
215 | } | |
216 | Component::Normal(s) => { | |
217 | path.push(s); | |
218 | } | |
32a655c1 | 219 | } |
3157f602 | 220 | } |
476ff2be | 221 | } |
54a0048b SL |
222 | |
223 | // Alright, if we've found a file name then this file had better | |
224 | // exist! If it doesn't then we register and print an error. | |
225 | if path.exists() { | |
226 | if path.is_dir() { | |
3157f602 XL |
227 | // Links to directories show as directory listings when viewing |
228 | // the docs offline so it's best to avoid them. | |
229 | *errors = true; | |
230 | let pretty_path = path.strip_prefix(root).unwrap_or(&path); | |
dfeec247 XL |
231 | println!( |
232 | "{}:{}: directory link - {}", | |
233 | pretty_file.display(), | |
234 | i + 1, | |
235 | pretty_path.display() | |
236 | ); | |
54a0048b SL |
237 | return; |
238 | } | |
7cac9316 XL |
239 | if let Some(extension) = path.extension() { |
240 | // Ignore none HTML files. | |
241 | if extension != "html" { | |
242 | return; | |
243 | } | |
244 | } | |
245 | let res = load_file(cache, root, &path, FromRedirect(false)); | |
54a0048b SL |
246 | let (pretty_path, contents) = match res { |
247 | Ok(res) => res, | |
8bb4bdeb | 248 | Err(LoadError::IOError(err)) => { |
7cac9316 | 249 | panic!("error loading {}: {}", path.display(), err); |
8bb4bdeb | 250 | } |
54a0048b | 251 | Err(LoadError::BrokenRedirect(target, _)) => { |
3157f602 | 252 | *errors = true; |
dfeec247 XL |
253 | println!( |
254 | "{}:{}: broken redirect to {}", | |
255 | pretty_file.display(), | |
256 | i + 1, | |
257 | target.display() | |
258 | ); | |
54a0048b SL |
259 | return; |
260 | } | |
261 | Err(LoadError::IsRedirect) => unreachable!(), | |
262 | }; | |
263 | ||
476ff2be | 264 | if let Some(ref fragment) = fragment { |
54a0048b SL |
265 | // Fragments like `#1-6` are most likely line numbers to be |
266 | // interpreted by javascript, so we're ignoring these | |
dfeec247 | 267 | if fragment.splitn(2, '-').all(|f| f.chars().all(|c| c.is_numeric())) { |
54a0048b SL |
268 | return; |
269 | } | |
270 | ||
83c7162d | 271 | // These appear to be broken in mdbook right now? |
3dfed10e | 272 | if fragment.starts_with('-') { |
83c7162d XL |
273 | return; |
274 | } | |
275 | ||
54a0048b SL |
276 | let entry = &mut cache.get_mut(&pretty_path).unwrap(); |
277 | entry.parse_ids(&pretty_path, &contents, errors); | |
278 | ||
3dfed10e XL |
279 | if !entry.ids.contains(*fragment) && !is_exception(file, &format!("#{}", fragment)) |
280 | { | |
54a0048b | 281 | *errors = true; |
dfeec247 | 282 | print!("{}:{}: broken link fragment ", pretty_file.display(), i + 1); |
3157f602 | 283 | println!("`#{}` pointing to `{}`", fragment, pretty_path.display()); |
54a0048b SL |
284 | }; |
285 | } | |
286 | } else { | |
54a0048b | 287 | let pretty_path = path.strip_prefix(root).unwrap_or(&path); |
3dfed10e XL |
288 | if !is_exception(file, pretty_path.to_str().unwrap()) { |
289 | *errors = true; | |
290 | print!("{}:{}: broken link - ", pretty_file.display(), i + 1); | |
291 | println!("{}", pretty_path.display()); | |
292 | } | |
54a0048b SL |
293 | } |
294 | }); | |
295 | Some(pretty_file) | |
296 | } | |
297 | ||
dfeec247 XL |
298 | fn load_file( |
299 | cache: &mut Cache, | |
300 | root: &Path, | |
301 | file: &Path, | |
302 | redirect: Redirect, | |
303 | ) -> Result<(PathBuf, Rc<String>), LoadError> { | |
54a0048b SL |
304 | let pretty_file = PathBuf::from(file.strip_prefix(root).unwrap_or(&file)); |
305 | ||
a1dfa0c6 | 306 | let (maybe_redirect, contents) = match cache.entry(pretty_file.clone()) { |
dfeec247 | 307 | Entry::Occupied(entry) => (None, entry.get().source.clone()), |
54a0048b | 308 | Entry::Vacant(entry) => { |
a1dfa0c6 XL |
309 | let contents = match fs::read_to_string(file) { |
310 | Ok(s) => Rc::new(s), | |
311 | Err(err) => { | |
312 | return Err(if let FromRedirect(true) = redirect { | |
313 | LoadError::BrokenRedirect(file.to_path_buf(), err) | |
314 | } else { | |
315 | LoadError::IOError(err) | |
dfeec247 | 316 | }); |
54a0048b | 317 | } |
a1dfa0c6 | 318 | }; |
54a0048b SL |
319 | |
320 | let maybe = maybe_redirect(&contents); | |
321 | if maybe.is_some() { | |
322 | if let SkipRedirect = redirect { | |
323 | return Err(LoadError::IsRedirect); | |
324 | } | |
325 | } else { | |
dfeec247 | 326 | entry.insert(FileEntry { source: contents.clone(), ids: HashSet::new() }); |
54a0048b | 327 | } |
a1dfa0c6 | 328 | (maybe, contents) |
3157f602 | 329 | } |
54a0048b | 330 | }; |
7cac9316 | 331 | match maybe_redirect.map(|url| file.parent().unwrap().join(url)) { |
dfeec247 | 332 | Some(redirect_file) => load_file(cache, root, &redirect_file, FromRedirect(true)), |
3157f602 | 333 | None => Ok((pretty_file, contents)), |
54a0048b SL |
334 | } |
335 | } | |
336 | ||
/// Extracts the target of a redirect page, if `source` is one.
///
/// Only the seventh line of `source` is inspected. It must contain
/// `<p>Redirecting to <a href=` followed by one character (the opening quote)
/// and then the target, terminated by `"`.
///
/// Returns `None` when `source` has fewer than seven lines, when the marker is
/// absent, or when the line is cut off before the closing `"`. Malformed input
/// is reported as "not a redirect" rather than panicking.
fn maybe_redirect(source: &str) -> Option<String> {
    const REDIRECT: &str = "<p>Redirecting to <a href=";

    let redirect_line = source.lines().nth(6)?;
    let after_marker = &redirect_line[redirect_line.find(REDIRECT)? + REDIRECT.len()..];

    // Skip the single character that opens the attribute value.
    let mut chars = after_marker.chars();
    chars.next()?;
    let target = chars.as_str();

    let end = target.find('"')?;
    Some(target[..end].to_owned())
}
349 | ||
7cac9316 XL |
/// Calls `f(value, line_index, base)` for every occurrence of `attr` in `contents`
/// that is followed by `=` and a quoted value.
///
/// * `attr` is matched textually (callers pass it with a leading space, e.g. `" href"`).
/// * Only spaces may appear between the attribute name, the `=`, and the opening
///   quote; either `"` or `'` may delimit the value, which must close on the same line.
/// * `line_index` is 0-based.
/// * `base` is the value from the most recent occurrence directly preceded by
///   `<base`, or `""` if none has been seen. Such occurrences update `base` and
///   are not passed to `f`.
fn with_attrs_in_source<F: FnMut(&str, usize, &str)>(contents: &str, attr: &str, mut f: F) {
    /// True when `s` is empty or made up of spaces only.
    fn only_spaces(s: &str) -> bool {
        s.chars().all(|c| c == ' ')
    }

    /// Parses `[ ]*=[ ]*<quote>value<quote>` at the start of `tail`.
    fn quoted_value(tail: &str) -> Option<&str> {
        let equals_at = tail.find('=')?;
        if !only_spaces(&tail[..equals_at]) {
            return None;
        }
        let tail = &tail[equals_at + 1..];

        let open_at = tail.find(&['"', '\''][..])?;
        if !only_spaces(&tail[..open_at]) {
            return None;
        }
        let delimiter = tail.as_bytes()[open_at] as char;

        let tail = &tail[open_at + 1..];
        tail.find(delimiter).map(|close_at| &tail[..close_at])
    }

    let mut base = "";
    for (line_index, line) in contents.lines().enumerate() {
        let mut remaining = line;
        while let Some(at) = remaining.find(attr) {
            // The base tag should always be the first link in the document so
            // we can get away with using one pass.
            let is_base = remaining[..at].ends_with("<base");
            // Always resume scanning just past this occurrence of `attr`,
            // whether or not a value could be parsed.
            remaining = &remaining[at + attr.len()..];
            match quoted_value(remaining) {
                None => {}
                Some(value) if is_base => base = value,
                Some(value) => f(value, line_index, base),
            }
        }
    }
}