]> git.proxmox.com Git - rustc.git/blob - src/tools/linkchecker/main.rs
Imported Upstream version 1.9.0+dfsg1
[rustc.git] / src / tools / linkchecker / main.rs
1 // Copyright 2016 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 //! Script to check the validity of `href` links in our HTML documentation.
12 //!
//! In the past we've been quite prone to writing broken links, as most of
//! them are added manually rather than automatically. As files move over
//! time or APIs change, old links become stale or broken. The purpose of this
//! script is to check all relative links in our documentation to make sure
//! they actually point to a valid place.
18 //!
19 //! Currently this doesn't actually do any HTML parsing or anything fancy like
20 //! that, it just has a simple "regex" to search for `href` and `id` tags.
21 //! These values are then translated to file URLs if possible and then the
22 //! destination is asserted to exist.
23 //!
24 //! A few whitelisted exceptions are allowed as there's known bugs in rustdoc,
25 //! but this should catch the majority of "broken link" cases.
26
27 extern crate url;
28
29 use std::env;
30 use std::fs::File;
31 use std::io::prelude::*;
32 use std::path::{Path, PathBuf};
33 use std::collections::{HashMap, HashSet};
34 use std::collections::hash_map::Entry;
35
36 use url::{Url, UrlParser};
37
38 use Redirect::*;
39
/// Unwraps a `Result`, panicking with the stringified expression and the
/// error's `Debug` representation if it is an `Err`.
macro_rules! t {
    ($e:expr) => {
        match $e {
            Ok(value) => value,
            Err(err) => panic!("{} failed with {:?}", stringify!($e), err),
        }
    };
}
46
47 fn main() {
48 let docs = env::args().nth(1).unwrap();
49 let docs = env::current_dir().unwrap().join(docs);
50 let mut url = Url::from_file_path(&docs).unwrap();
51 let mut errors = false;
52 walk(&mut HashMap::new(), &docs, &docs, &mut url, &mut errors);
53 if errors {
54 panic!("found some broken links");
55 }
56 }
57
/// Errors that can occur while loading a documentation file in `load_file`.
#[derive(Debug)]
pub enum LoadError {
    /// Underlying I/O failure while opening or reading the file.
    IOError(std::io::Error),
    /// A redirect page pointed at the contained path, but reading that
    /// target failed with the contained I/O error.
    BrokenRedirect(PathBuf, std::io::Error),
    /// The file is a rustdoc redirect page and the caller asked (via
    /// `Redirect::SkipRedirect`) not to follow redirects.
    IsRedirect,
}
64
/// Controls how `load_file` treats rustdoc-generated redirect pages.
enum Redirect {
    /// Return `LoadError::IsRedirect` instead of following a redirect.
    SkipRedirect,
    /// Follow redirects. The flag records whether we already followed one,
    /// so that a missing target can be reported as `BrokenRedirect`.
    FromRedirect(bool),
}
69
/// Cached per-file state: the raw HTML source and the set of `id`
/// attribute values found in it (filled lazily by `parse_ids`).
struct FileEntry {
    source: String,
    ids: HashSet<String>,
}

/// Maps a file's "pretty" (root-relative) path to its cached entry.
type Cache = HashMap<PathBuf, FileEntry>;
76
77 impl FileEntry {
78 fn parse_ids(&mut self,
79 file: &Path,
80 contents: &str,
81 errors: &mut bool)
82 {
83 if self.ids.is_empty() {
84 with_attrs_in_source(contents, " id", |fragment, i| {
85 let frag = fragment.trim_left_matches("#").to_owned();
86 if !self.ids.insert(frag) {
87 *errors = true;
88 println!("{}:{}: id is not unique: `{}`",
89 file.display(), i, fragment);
90 }
91 });
92 }
93 }
94 }
95
/// Recursively walks `dir`, calling `check` on every regular file while
/// keeping `url`'s path segments in sync with the filesystem descent.
/// Sets `*errors` if any broken links are found along the way.
fn walk(cache: &mut Cache,
        root: &Path,
        dir: &Path,
        url: &mut Url,
        errors: &mut bool)
{
    for entry in t!(dir.read_dir()).map(|e| t!(e)) {
        let path = entry.path();
        let kind = t!(entry.file_type());
        // Mirror the descent into this entry in the URL's path segments.
        url.path_mut().unwrap().push(entry.file_name().into_string().unwrap());
        if kind.is_dir() {
            walk(cache, root, &path, url, errors);
        } else {
            let pretty_path = check(cache, root, &path, url, errors);
            if let Some(pretty_path) = pretty_path {
                let entry = cache.get_mut(&pretty_path).unwrap();
                // we don't need the source anymore,
                // so drop it to reduce memory usage
                entry.source = String::new();
            }
        }
        // Undo the push above before moving on to the next sibling entry.
        url.path_mut().unwrap().pop();
    }
}
120
121 fn check(cache: &mut Cache,
122 root: &Path,
123 file: &Path,
124 base: &Url,
125 errors: &mut bool) -> Option<PathBuf>
126 {
127 // ignore js files as they are not prone to errors as the rest of the
128 // documentation is and they otherwise bring up false positives.
129 if file.extension().and_then(|s| s.to_str()) == Some("js") {
130 return None;
131 }
132
133 // Unfortunately we're not 100% full of valid links today to we need a few
134 // whitelists to get this past `make check` today.
135 // FIXME(#32129)
136 if file.ends_with("std/string/struct.String.html") {
137 return None;
138 }
139 // FIXME(#32553)
140 if file.ends_with("collections/string/struct.String.html") {
141 return None;
142 }
143 // FIXME(#32130)
144 if file.ends_with("btree_set/struct.BTreeSet.html") ||
145 file.ends_with("collections/struct.BTreeSet.html") ||
146 file.ends_with("collections/btree_map/struct.BTreeMap.html") ||
147 file.ends_with("collections/hash_map/struct.HashMap.html") {
148 return None;
149 }
150
151 if file.ends_with("std/sys/ext/index.html") {
152 return None;
153 }
154
155 if let Some(file) = file.to_str() {
156 // FIXME(#31948)
157 if file.contains("ParseFloatError") {
158 return None;
159 }
160 // weird reexports, but this module is on its way out, so chalk it up to
161 // "rustdoc weirdness" and move on from there
162 if file.contains("scoped_tls") {
163 return None;
164 }
165 }
166
167 let mut parser = UrlParser::new();
168 parser.base_url(base);
169
170 let res = load_file(cache, root, PathBuf::from(file), SkipRedirect);
171 let (pretty_file, contents) = match res {
172 Ok(res) => res,
173 Err(_) => return None,
174 };
175 {
176 cache.get_mut(&pretty_file).unwrap()
177 .parse_ids(&pretty_file, &contents, errors);
178 }
179
180 // Search for anything that's the regex 'href[ ]*=[ ]*".*?"'
181 with_attrs_in_source(&contents, " href", |url, i| {
182 // Once we've plucked out the URL, parse it using our base url and
183 // then try to extract a file path. If either of these fail then we
184 // just keep going.
185 let (parsed_url, path) = match url_to_file_path(&parser, url) {
186 Some((url, path)) => (url, PathBuf::from(path)),
187 None => return,
188 };
189
190 // Alright, if we've found a file name then this file had better
191 // exist! If it doesn't then we register and print an error.
192 if path.exists() {
193 if path.is_dir() {
194 return;
195 }
196 let res = load_file(cache, root, path.clone(), FromRedirect(false));
197 let (pretty_path, contents) = match res {
198 Ok(res) => res,
199 Err(LoadError::IOError(err)) => panic!(format!("{}", err)),
200 Err(LoadError::BrokenRedirect(target, _)) => {
201 print!("{}:{}: broken redirect to {}",
202 pretty_file.display(), i + 1, target.display());
203 return;
204 }
205 Err(LoadError::IsRedirect) => unreachable!(),
206 };
207
208 if let Some(ref fragment) = parsed_url.fragment {
209 // Fragments like `#1-6` are most likely line numbers to be
210 // interpreted by javascript, so we're ignoring these
211 if fragment.splitn(2, '-')
212 .all(|f| f.chars().all(|c| c.is_numeric())) {
213 return;
214 }
215
216 let entry = &mut cache.get_mut(&pretty_path).unwrap();
217 entry.parse_ids(&pretty_path, &contents, errors);
218
219 if !entry.ids.contains(fragment) {
220 *errors = true;
221 print!("{}:{}: broken link fragment ",
222 pretty_file.display(), i + 1);
223 println!("`#{}` pointing to `{}`",
224 fragment, pretty_path.display());
225 };
226 }
227 } else {
228 *errors = true;
229 print!("{}:{}: broken link - ", pretty_file.display(), i + 1);
230 let pretty_path = path.strip_prefix(root).unwrap_or(&path);
231 println!("{}", pretty_path.display());
232 }
233 });
234 Some(pretty_file)
235 }
236
237 fn load_file(cache: &mut Cache,
238 root: &Path,
239 file: PathBuf,
240 redirect: Redirect) -> Result<(PathBuf, String), LoadError> {
241 let mut contents = String::new();
242 let pretty_file = PathBuf::from(file.strip_prefix(root).unwrap_or(&file));
243
244 let maybe_redirect = match cache.entry(pretty_file.clone()) {
245 Entry::Occupied(entry) => {
246 contents = entry.get().source.clone();
247 None
248 },
249 Entry::Vacant(entry) => {
250 let mut fp = try!(File::open(file.clone()).map_err(|err| {
251 if let FromRedirect(true) = redirect {
252 LoadError::BrokenRedirect(file.clone(), err)
253 } else {
254 LoadError::IOError(err)
255 }
256 }));
257 try!(fp.read_to_string(&mut contents)
258 .map_err(|err| LoadError::IOError(err)));
259
260 let maybe = maybe_redirect(&contents);
261 if maybe.is_some() {
262 if let SkipRedirect = redirect {
263 return Err(LoadError::IsRedirect);
264 }
265 } else {
266 entry.insert(FileEntry {
267 source: contents.clone(),
268 ids: HashSet::new(),
269 });
270 }
271 maybe
272 },
273 };
274 let base = Url::from_file_path(&file).unwrap();
275 let mut parser = UrlParser::new();
276 parser.base_url(&base);
277
278 match maybe_redirect.and_then(|url| url_to_file_path(&parser, &url)) {
279 Some((_, redirect_file)) => {
280 let path = PathBuf::from(redirect_file);
281 load_file(cache, root, path, FromRedirect(true))
282 }
283 None => Ok((pretty_file, contents))
284 }
285 }
286
/// If `source` looks like a rustdoc-generated redirect page, returns the
/// URL it redirects to; otherwise returns `None`.
///
/// Only the seventh line of the page (index 6) is inspected, which is where
/// rustdoc emits the redirect marker.
fn maybe_redirect(source: &str) -> Option<String> {
    const REDIRECT: &'static str = "<p>Redirecting to <a href=";

    source.lines().nth(6).and_then(|line| {
        line.find(REDIRECT).map(|pos| {
            // Skip the marker plus the opening quote, then take everything
            // up to the closing quote.
            let tail = &line[pos + REDIRECT.len() + 1..];
            let end = tail.find('"').unwrap();
            tail[..end].to_owned()
        })
    })
}
302
303 fn url_to_file_path(parser: &UrlParser, url: &str) -> Option<(Url, PathBuf)> {
304 parser.parse(url).ok().and_then(|parsed_url| {
305 parsed_url.to_file_path().ok().map(|f| (parsed_url, f))
306 })
307 }
308
/// Scans `contents` line by line for occurrences of `attr` (e.g. `" href"`
/// or `" id"`) followed by `=` and a quoted value, invoking `f` with each
/// value and the zero-based line number it was found on.
///
/// This is a deliberately simple scanner rather than a real HTML parser:
/// only spaces may surround the `=`, and the value must be delimited by a
/// matching pair of single or double quotes.
fn with_attrs_in_source<F: FnMut(&str, usize)>(contents: &str,
                                               attr: &str,
                                               mut f: F)
{
    for (line_no, mut remaining) in contents.lines().enumerate() {
        while let Some(found) = remaining.find(attr) {
            // Always advance past this occurrence first, so the scan makes
            // progress even when the occurrence is not a real attribute.
            remaining = &remaining[found + attr.len()..];

            let eq = match remaining.find('=') {
                Some(eq) => eq,
                None => continue,
            };
            // Only spaces may sit between the attribute name and `=`.
            if !remaining[..eq].trim_left_matches(' ').is_empty() {
                continue;
            }

            let after_eq = &remaining[eq + 1..];
            let quote = match after_eq.find(&['"', '\''][..]) {
                Some(quote) => quote,
                None => continue,
            };
            // Likewise only spaces between `=` and the opening quote.
            if !after_eq[..quote].trim_left_matches(' ').is_empty() {
                continue;
            }
            let delim = after_eq.as_bytes()[quote] as char;

            // The value runs up to the matching closing quote; an unclosed
            // quote means there is nothing to report on this occurrence.
            let value = &after_eq[quote + 1..];
            match value.find(delim) {
                Some(end) => f(&value[..end], line_no),
                None => continue,
            }
        }
    }
}