// src/tools/linkchecker/main.rs (rustc 1.48.0-beta.8)
//! Script to check the validity of `href` links in our HTML documentation.
//!
//! In the past we've been quite error prone to writing in broken links as most
//! of them are manually rather than automatically added. As files move over
//! time or apis change old links become stale or broken. The purpose of this
//! script is to check all relative links in our documentation to make sure they
//! actually point to a valid place.
//!
//! Currently this doesn't actually do any HTML parsing or anything fancy like
//! that, it just has a simple "regex" to search for `href` and `id` tags.
//! These values are then translated to file URLs if possible and then the
//! destination is asserted to exist.
//!
//! A few exceptions are allowed, as there are known bugs in rustdoc, but this
//! should catch the majority of "broken link" cases.

use std::collections::hash_map::Entry;
use std::collections::{HashMap, HashSet};
use std::env;
use std::fs;
use std::path::{Component, Path, PathBuf};
use std::rc::Rc;

use crate::Redirect::*;
25
// Add linkcheck exceptions here
// If at all possible you should use intra-doc links to avoid linkcheck issues. These
// are cases where that does not work
// [(generated_documentation_page, &[broken_links])]
// Pages are matched by path suffix via `Path::ends_with` in `is_exception`.
const LINKCHECK_EXCEPTIONS: &[(&str, &[&str])] = &[
    // These are methods on slice, and `Self` does not work on primitive impls
    // in intra-doc links (primitive impls are weird)
    // https://github.com/rust-lang/rust/issues/62834 is necessary to be
    // able to link to slices
    (
        "std/io/struct.IoSlice.html",
        &[
            "#method.as_mut_ptr",
            "#method.sort_by_key",
            "#method.make_ascii_uppercase",
            "#method.make_ascii_lowercase",
            "#method.get_unchecked_mut",
        ],
    ),
    // These try to link to std::collections, but are defined in alloc
    // https://github.com/rust-lang/rust/issues/74481
    ("std/collections/btree_map/struct.BTreeMap.html", &["#insert-and-complex-keys"]),
    ("std/collections/btree_set/struct.BTreeSet.html", &["#insert-and-complex-keys"]),
    ("alloc/collections/btree_map/struct.BTreeMap.html", &["#insert-and-complex-keys"]),
    ("alloc/collections/btree_set/struct.BTreeSet.html", &["#insert-and-complex-keys"]),
];
52
/// Unwraps a `Result`, panicking with the stringified expression and the
/// error value on failure. Used for filesystem operations where any error
/// is fatal to the link check.
macro_rules! t {
    ($e:expr) => {
        match $e {
            Ok(e) => e,
            Err(e) => panic!("{} failed with {:?}", stringify!($e), e),
        }
    };
}
61
62 fn main() {
63 let docs = env::args_os().nth(1).unwrap();
64 let docs = env::current_dir().unwrap().join(docs);
65 let mut errors = false;
66 walk(&mut HashMap::new(), &docs, &docs, &mut errors);
67 if errors {
68 panic!("found some broken links");
69 }
70 }
71
/// Failure modes when reading a documentation file in `load_file`.
#[derive(Debug)]
pub enum LoadError {
    /// Underlying filesystem error while reading the file itself.
    IOError(std::io::Error),
    /// A redirect page pointed at the contained path, which could not be read.
    BrokenRedirect(PathBuf, std::io::Error),
    /// The file is a redirect page and the caller asked to skip those.
    IsRedirect,
}
78
/// Controls how `load_file` treats rustdoc-generated redirect pages.
enum Redirect {
    /// Fail with `LoadError::IsRedirect` instead of following a redirect.
    SkipRedirect,
    /// Follow redirects; the flag records whether the current file was
    /// itself reached via a redirect (used to report `BrokenRedirect`).
    FromRedirect(bool),
}
83
/// Cached per-file state: the raw HTML source and the set of `id`
/// attributes found in it (filled lazily by `FileEntry::parse_ids`).
struct FileEntry {
    // Rc so the source can be shared while cached, then dropped (replaced
    // by an empty string) once the file has been fully checked.
    source: Rc<String>,
    ids: HashSet<String>,
}
88
/// Map from pretty (root-relative) path to its cached file entry.
type Cache = HashMap<PathBuf, FileEntry>;
90
/// Percent-encodes the characters rustdoc is known to emit in generated
/// fragment ids (`<`, `>`, space, quotes, brackets, ...), so both the raw
/// and the encoded form of an id can be looked up.
///
/// `%` itself is intentionally not escaped, and none of the replacement
/// sequences contain characters from the escape set, so this single pass
/// produces exactly the same output as chaining twelve `str::replace`
/// calls — while allocating only once instead of twelve times per call.
fn small_url_encode(s: &str) -> String {
    let mut encoded = String::with_capacity(s.len());
    for c in s.chars() {
        match c {
            '<' => encoded.push_str("%3C"),
            '>' => encoded.push_str("%3E"),
            ' ' => encoded.push_str("%20"),
            '?' => encoded.push_str("%3F"),
            '\'' => encoded.push_str("%27"),
            '&' => encoded.push_str("%26"),
            ',' => encoded.push_str("%2C"),
            ':' => encoded.push_str("%3A"),
            ';' => encoded.push_str("%3B"),
            '[' => encoded.push_str("%5B"),
            ']' => encoded.push_str("%5D"),
            '"' => encoded.push_str("%22"),
            _ => encoded.push(c),
        }
    }
    encoded
}
105
106 impl FileEntry {
107 fn parse_ids(&mut self, file: &Path, contents: &str, errors: &mut bool) {
108 if self.ids.is_empty() {
109 with_attrs_in_source(contents, " id", |fragment, i, _| {
110 let frag = fragment.trim_start_matches("#").to_owned();
111 let encoded = small_url_encode(&frag);
112 if !self.ids.insert(frag) {
113 *errors = true;
114 println!("{}:{}: id is not unique: `{}`", file.display(), i, fragment);
115 }
116 // Just in case, we also add the encoded id.
117 self.ids.insert(encoded);
118 });
119 }
120 }
121 }
122
123 fn walk(cache: &mut Cache, root: &Path, dir: &Path, errors: &mut bool) {
124 for entry in t!(dir.read_dir()).map(|e| t!(e)) {
125 let path = entry.path();
126 let kind = t!(entry.file_type());
127 if kind.is_dir() {
128 walk(cache, root, &path, errors);
129 } else {
130 let pretty_path = check(cache, root, &path, errors);
131 if let Some(pretty_path) = pretty_path {
132 let entry = cache.get_mut(&pretty_path).unwrap();
133 // we don't need the source anymore,
134 // so drop to reduce memory-usage
135 entry.source = Rc::new(String::new());
136 }
137 }
138 }
139 }
140
141 fn is_exception(file: &Path, link: &str) -> bool {
142 if let Some(entry) = LINKCHECK_EXCEPTIONS.iter().find(|&(f, _)| file.ends_with(f)) {
143 entry.1.contains(&link)
144 } else {
145 // FIXME(#63351): Concat trait in alloc/slice reexported in primitive page
146 //
147 // NOTE: This cannot be added to `LINKCHECK_EXCEPTIONS` because the resolved path
148 // calculated in `check` function is outside `build/<triple>/doc` dir.
149 // So the `strip_prefix` method just returns the old absolute broken path.
150 if file.ends_with("std/primitive.slice.html") {
151 if link.ends_with("primitive.slice.html") {
152 return true;
153 }
154 }
155 false
156 }
157 }
158
/// Checks every relative `href` link in `file`, recording any problems in
/// `errors`. Returns the pretty (root-relative) path of the file when it
/// was an HTML file that got checked, so the caller can release its cached
/// source text; returns `None` for non-HTML files and redirect pages.
fn check(cache: &mut Cache, root: &Path, file: &Path, errors: &mut bool) -> Option<PathBuf> {
    // Ignore non-HTML files.
    if file.extension().and_then(|s| s.to_str()) != Some("html") {
        return None;
    }

    let res = load_file(cache, root, file, SkipRedirect);
    let (pretty_file, contents) = match res {
        Ok(res) => res,
        // Redirect pages (and unreadable files) are not checked directly;
        // redirect targets get validated when something links to them.
        Err(_) => return None,
    };
    {
        // Collect this file's own `id`s so fragment links into it resolve.
        cache.get_mut(&pretty_file).unwrap().parse_ids(&pretty_file, &contents, errors);
    }

    // Search for anything that's the regex 'href[ ]*=[ ]*".*?"'
    with_attrs_in_source(&contents, " href", |url, i, base| {
        // Ignore external URLs
        if url.starts_with("http:")
            || url.starts_with("https:")
            || url.starts_with("javascript:")
            || url.starts_with("ftp:")
            || url.starts_with("irc:")
            || url.starts_with("data:")
        {
            return;
        }
        // Split off `#fragment`, then `?query` — only the path component is
        // resolved against the filesystem.
        let mut parts = url.splitn(2, '#');
        let url = parts.next().unwrap();
        let fragment = parts.next();
        let mut parts = url.splitn(2, '?');
        let url = parts.next().unwrap();

        // Once we've plucked out the URL, parse it using our base url and
        // then try to extract a file path.
        let mut path = file.to_path_buf();
        if !base.is_empty() || !url.is_empty() {
            path.pop();
            for part in Path::new(base).join(url).components() {
                match part {
                    Component::Prefix(_) | Component::RootDir => {
                        // Avoid absolute paths as they make the docs not
                        // relocatable by making assumptions on where the docs
                        // are hosted relative to the site root.
                        *errors = true;
                        println!(
                            "{}:{}: absolute path - {}",
                            pretty_file.display(),
                            i + 1,
                            Path::new(base).join(url).display()
                        );
                        return;
                    }
                    Component::CurDir => {}
                    Component::ParentDir => {
                        path.pop();
                    }
                    Component::Normal(s) => {
                        path.push(s);
                    }
                }
            }
        }

        // Alright, if we've found a file name then this file had better
        // exist! If it doesn't then we register and print an error.
        if path.exists() {
            if path.is_dir() {
                // Links to directories show as directory listings when viewing
                // the docs offline so it's best to avoid them.
                *errors = true;
                let pretty_path = path.strip_prefix(root).unwrap_or(&path);
                println!(
                    "{}:{}: directory link - {}",
                    pretty_file.display(),
                    i + 1,
                    pretty_path.display()
                );
                return;
            }
            if let Some(extension) = path.extension() {
                // Ignore non-HTML files.
                if extension != "html" {
                    return;
                }
            }
            // Load the link target (following redirects) so any fragment can
            // be verified against the target's collected ids.
            let res = load_file(cache, root, &path, FromRedirect(false));
            let (pretty_path, contents) = match res {
                Ok(res) => res,
                Err(LoadError::IOError(err)) => {
                    panic!("error loading {}: {}", path.display(), err);
                }
                Err(LoadError::BrokenRedirect(target, _)) => {
                    *errors = true;
                    println!(
                        "{}:{}: broken redirect to {}",
                        pretty_file.display(),
                        i + 1,
                        target.display()
                    );
                    return;
                }
                // `FromRedirect` never produces `IsRedirect`.
                Err(LoadError::IsRedirect) => unreachable!(),
            };

            if let Some(ref fragment) = fragment {
                // Fragments like `#1-6` are most likely line numbers to be
                // interpreted by javascript, so we're ignoring these
                if fragment.splitn(2, '-').all(|f| f.chars().all(|c| c.is_numeric())) {
                    return;
                }

                // These appear to be broken in mdbook right now?
                if fragment.starts_with('-') {
                    return;
                }

                let entry = &mut cache.get_mut(&pretty_path).unwrap();
                entry.parse_ids(&pretty_path, &contents, errors);

                if !entry.ids.contains(*fragment) && !is_exception(file, &format!("#{}", fragment))
                {
                    *errors = true;
                    print!("{}:{}: broken link fragment ", pretty_file.display(), i + 1);
                    println!("`#{}` pointing to `{}`", fragment, pretty_path.display());
                };
            }
        } else {
            let pretty_path = path.strip_prefix(root).unwrap_or(&path);
            if !is_exception(file, pretty_path.to_str().unwrap()) {
                *errors = true;
                print!("{}:{}: broken link - ", pretty_file.display(), i + 1);
                println!("{}", pretty_path.display());
            }
        }
    });
    Some(pretty_file)
}
297
/// Loads `file` through the cache and returns its pretty (root-relative)
/// path together with its contents. Redirect pages are handled according
/// to `redirect`: rejected (`SkipRedirect`) or followed recursively
/// (`FromRedirect`), in which case the returned path/contents are those of
/// the final target.
///
/// # Errors
/// - `IsRedirect` when the file is a redirect page and `redirect` is
///   `SkipRedirect`.
/// - `BrokenRedirect` when the file was reached via a redirect and cannot
///   be read.
/// - `IOError` for any other read failure.
fn load_file(
    cache: &mut Cache,
    root: &Path,
    file: &Path,
    redirect: Redirect,
) -> Result<(PathBuf, Rc<String>), LoadError> {
    // Report paths relative to `root` when possible.
    let pretty_file = PathBuf::from(file.strip_prefix(root).unwrap_or(&file));

    let (maybe_redirect, contents) = match cache.entry(pretty_file.clone()) {
        Entry::Occupied(entry) => (None, entry.get().source.clone()),
        Entry::Vacant(entry) => {
            let contents = match fs::read_to_string(file) {
                Ok(s) => Rc::new(s),
                Err(err) => {
                    // Distinguish "redirect target missing" from a plain
                    // read failure so the caller can report it usefully.
                    return Err(if let FromRedirect(true) = redirect {
                        LoadError::BrokenRedirect(file.to_path_buf(), err)
                    } else {
                        LoadError::IOError(err)
                    });
                }
            };

            let maybe = maybe_redirect(&contents);
            if maybe.is_some() {
                if let SkipRedirect = redirect {
                    return Err(LoadError::IsRedirect);
                }
            } else {
                // Only real pages are cached; redirect stubs are not.
                entry.insert(FileEntry { source: contents.clone(), ids: HashSet::new() });
            }
            (maybe, contents)
        }
    };
    // Resolve a redirect target relative to the current file's directory.
    match maybe_redirect.map(|url| file.parent().unwrap().join(url)) {
        Some(redirect_file) => load_file(cache, root, &redirect_file, FromRedirect(true)),
        None => Ok((pretty_file, contents)),
    }
}
336
/// If `source` is a rustdoc-generated redirect page, returns the redirect
/// target URL, otherwise `None`.
///
/// Rustdoc emits the `<p>Redirecting to <a href="...">` marker on the 7th
/// line of redirect stubs, so only that line is inspected.
///
/// Unlike the previous version, a marker line that is truncated or missing
/// its closing quote yields `None` instead of panicking (`unwrap` on
/// `find('"')` and an unchecked slice past the marker could both panic on
/// malformed input).
fn maybe_redirect(source: &str) -> Option<String> {
    const REDIRECT: &str = "<p>Redirecting to <a href=";

    // Line index 6 == the 7th line of the generated page.
    let redirect_line = source.lines().nth(6)?;
    let i = redirect_line.find(REDIRECT)?;
    // `+ 1` skips the opening quote after `href=`; `get` returns None if
    // the line ends before that (or splits a char boundary).
    let rest = redirect_line.get(i + REDIRECT.len() + 1..)?;
    let pos_quote = rest.find('"')?;
    Some(rest[..pos_quote].to_owned())
}
349
/// Scans `contents` line by line for occurrences of `attr` (e.g. `" href"`
/// or `" id"`) followed by `=` and a quoted value, invoking `f` with the
/// quoted value, the zero-based line number, and the current `<base>` URL
/// (empty string when no base tag was seen yet).
///
/// This is a deliberately cheap textual scan, not real HTML parsing;
/// rustdoc's output is regular enough for it to work.
fn with_attrs_in_source<F: FnMut(&str, usize, &str)>(contents: &str, attr: &str, mut f: F) {
    // URL taken from a `<base href=...>` tag, applied to all later links.
    let mut base = "";
    for (lineno, mut remaining) in contents.lines().enumerate() {
        while let Some(pos) = remaining.find(attr) {
            let after_attr = &remaining[pos + attr.len()..];
            // The base tag should always be the first link in the document,
            // so a single forward pass is enough to pick it up.
            let is_base = remaining[..pos].ends_with("<base");
            remaining = after_attr;
            let eq = match after_attr.find('=') {
                Some(i) => i,
                None => continue,
            };
            // Only spaces may separate the attribute name from `=`.
            if after_attr[..eq].trim_start_matches(' ') != "" {
                continue;
            }
            let after_eq = &after_attr[eq + 1..];
            let quote = match after_eq.find(&['"', '\''][..]) {
                Some(i) => i,
                None => continue,
            };
            let delim = after_eq.as_bytes()[quote] as char;
            // Only spaces may separate `=` from the opening quote.
            if after_eq[..quote].trim_start_matches(' ') != "" {
                continue;
            }
            let value = &after_eq[quote + 1..];
            // The value runs until the matching quote character.
            let url = match value.find(delim) {
                Some(i) => &value[..i],
                None => continue,
            };
            if is_base {
                base = url;
                continue;
            }
            f(url, lineno, base)
        }
    }
}