1 // Copyright 2016 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
11 //! Script to check the validity of `href` links in our HTML documentation.
13 //! In the past we've been quite error prone to writing in broken links as most
14 //! of them are manually rather than automatically added. As files move over
15 //! time or APIs change, old links become stale or broken. The purpose of this
16 //! script is to check all relative links in our documentation to make sure they
17 //! actually point to a valid place.
19 //! Currently this doesn't actually do any HTML parsing or anything fancy like
20 //! that, it just has a simple "regex" to search for `href` and `id` attributes.
21 //! These values are then translated to file URLs if possible and then the
22 //! destination is asserted to exist.
24 //! A few whitelisted exceptions are allowed as there are known bugs in rustdoc,
25 //! but this should catch the majority of "broken link" cases.
29 use std
::io
::prelude
::*;
30 use std
::path
::{Path, PathBuf, Component}
;
31 use std
::collections
::{HashMap, HashSet}
;
32 use std
::collections
::hash_map
::Entry
;
37 ($e
:expr
) => (match $e
{
39 Err(e
) => panic
!("{} failed with {:?}", stringify
!($e
), e
),
44 let docs
= env
::args_os().nth(1).unwrap();
45 let docs
= env
::current_dir().unwrap().join(docs
);
46 let mut errors
= false;
47 walk(&mut HashMap
::new(), &docs
, &docs
, &mut errors
);
49 panic
!("found some broken links");
55 IOError(std
::io
::Error
),
56 BrokenRedirect(PathBuf
, std
::io
::Error
),
// Cache of the files loaded so far. `load_file` keys each entry by the file's
// path with `root` stripped (falling back to the path as given when it is not
// under `root`) and stores the file's contents in the entry's `source` field.
70 type Cache
= HashMap
<PathBuf
, FileEntry
>;
72 fn small_url_encode(s
: &str) -> String
{
88 fn parse_ids(&mut self, file
: &Path
, contents
: &str, errors
: &mut bool
) {
89 if self.ids
.is_empty() {
90 with_attrs_in_source(contents
, " id", |fragment
, i
, _
| {
91 let frag
= fragment
.trim_left_matches("#").to_owned();
92 let encoded
= small_url_encode(&frag
);
93 if !self.ids
.insert(frag
) {
95 println
!("{}:{}: id is not unique: `{}`", file
.display(), i
, fragment
);
97 // Just in case, we also add the encoded id.
98 self.ids
.insert(encoded
);
104 fn walk(cache
: &mut Cache
, root
: &Path
, dir
: &Path
, errors
: &mut bool
) {
105 for entry
in t
!(dir
.read_dir()).map(|e
| t
!(e
)) {
106 let path
= entry
.path();
107 let kind
= t
!(entry
.file_type());
109 walk(cache
, root
, &path
, errors
);
111 let pretty_path
= check(cache
, root
, &path
, errors
);
112 if let Some(pretty_path
) = pretty_path
{
113 let entry
= cache
.get_mut(&pretty_path
).unwrap();
114 // we don't need the source anymore,
115 // so drop to reduce memory-usage
116 entry
.source
= String
::new();
122 fn check(cache
: &mut Cache
,
127 // Ignore non-HTML files.
128 if file
.extension().and_then(|s
| s
.to_str()) != Some("html") {
132 // Unfortunately we're not 100% full of valid links today, so we need a few
133 // whitelists to get this past `make check` today.
135 if file
.ends_with("std/string/struct.String.html") ||
136 file
.ends_with("interpret/struct.ValTy.html") ||
137 file
.ends_with("symbol/struct.InternedString.html") ||
138 file
.ends_with("ast/struct.ThinVec.html") ||
139 file
.ends_with("util/struct.ThinVec.html") ||
140 file
.ends_with("util/struct.RcSlice.html") ||
141 file
.ends_with("layout/struct.TyLayout.html") ||
142 file
.ends_with("humantime/struct.Timestamp.html") ||
143 file
.ends_with("log/index.html") ||
144 file
.ends_with("ty/struct.Slice.html") ||
145 file
.ends_with("ty/enum.Attributes.html") ||
146 file
.ends_with("ty/struct.SymbolName.html") {
150 if file
.ends_with("string/struct.String.html") {
154 if file
.ends_with("btree_set/struct.BTreeSet.html") ||
155 file
.ends_with("struct.BTreeSet.html") ||
156 file
.ends_with("btree_map/struct.BTreeMap.html") ||
157 file
.ends_with("hash_map/struct.HashMap.html") ||
158 file
.ends_with("hash_set/struct.HashSet.html") ||
159 file
.ends_with("sync/struct.Lrc.html") ||
160 file
.ends_with("sync/struct.RwLock.html") {
164 let res
= load_file(cache
, root
, file
, SkipRedirect
);
165 let (pretty_file
, contents
) = match res
{
167 Err(_
) => return None
,
170 cache
.get_mut(&pretty_file
)
172 .parse_ids(&pretty_file
, &contents
, errors
);
175 // Search for anything that's the regex 'href[ ]*=[ ]*".*?"'
176 with_attrs_in_source(&contents
, " href", |url
, i
, base
| {
177 // Ignore external URLs
178 if url
.starts_with("http:") || url
.starts_with("https:") ||
179 url
.starts_with("javascript:") || url
.starts_with("ftp:") ||
180 url
.starts_with("irc:") || url
.starts_with("data:") {
183 let mut parts
= url
.splitn(2, "#");
184 let url
= parts
.next().unwrap();
185 let fragment
= parts
.next();
186 let mut parts
= url
.splitn(2, "?");
187 let url
= parts
.next().unwrap();
189 // Once we've plucked out the URL, parse it using our base url and
190 // then try to extract a file path.
191 let mut path
= file
.to_path_buf();
192 if !base
.is_empty() || !url
.is_empty() {
194 for part
in Path
::new(base
).join(url
).components() {
196 Component
::Prefix(_
) |
197 Component
::RootDir
=> {
198 // Avoid absolute paths as they make the docs not
199 // relocatable by making assumptions on where the docs
200 // are hosted relative to the site root.
202 println
!("{}:{}: absolute path - {}",
203 pretty_file
.display(),
205 Path
::new(base
).join(url
).display());
208 Component
::CurDir
=> {}
209 Component
::ParentDir
=> { path.pop(); }
210 Component
::Normal(s
) => { path.push(s); }
215 // Alright, if we've found a file name then this file had better
216 // exist! If it doesn't then we register and print an error.
219 // Links to directories show as directory listings when viewing
220 // the docs offline so it's best to avoid them.
222 let pretty_path
= path
.strip_prefix(root
).unwrap_or(&path
);
223 println
!("{}:{}: directory link - {}",
224 pretty_file
.display(),
226 pretty_path
.display());
229 if let Some(extension
) = path
.extension() {
230 // Ignore non-HTML files.
231 if extension
!= "html" {
235 let res
= load_file(cache
, root
, &path
, FromRedirect(false));
236 let (pretty_path
, contents
) = match res
{
238 Err(LoadError
::IOError(err
)) => {
239 panic
!("error loading {}: {}", path
.display(), err
);
241 Err(LoadError
::BrokenRedirect(target
, _
)) => {
243 println
!("{}:{}: broken redirect to {}",
244 pretty_file
.display(),
249 Err(LoadError
::IsRedirect
) => unreachable
!(),
252 if let Some(ref fragment
) = fragment
{
253 // Fragments like `#1-6` are most likely line numbers to be
254 // interpreted by javascript, so we're ignoring these
255 if fragment
.splitn(2, '
-'
)
256 .all(|f
| f
.chars().all(|c
| c
.is_numeric())) {
260 let entry
= &mut cache
.get_mut(&pretty_path
).unwrap();
261 entry
.parse_ids(&pretty_path
, &contents
, errors
);
263 if !entry
.ids
.contains(*fragment
) {
265 print
!("{}:{}: broken link fragment ",
266 pretty_file
.display(),
268 println
!("`#{}` pointing to `{}`", fragment
, pretty_path
.display());
273 print
!("{}:{}: broken link - ", pretty_file
.display(), i
+ 1);
274 let pretty_path
= path
.strip_prefix(root
).unwrap_or(&path
);
275 println
!("{}", pretty_path
.display());
281 fn load_file(cache
: &mut Cache
,
285 -> Result
<(PathBuf
, String
), LoadError
> {
286 let mut contents
= String
::new();
287 let pretty_file
= PathBuf
::from(file
.strip_prefix(root
).unwrap_or(&file
));
289 let maybe_redirect
= match cache
.entry(pretty_file
.clone()) {
290 Entry
::Occupied(entry
) => {
291 contents
= entry
.get().source
.clone();
294 Entry
::Vacant(entry
) => {
295 let mut fp
= File
::open(file
).map_err(|err
| {
296 if let FromRedirect(true) = redirect
{
297 LoadError
::BrokenRedirect(file
.to_path_buf(), err
)
299 LoadError
::IOError(err
)
302 fp
.read_to_string(&mut contents
).map_err(|err
| LoadError
::IOError(err
))?
;
304 let maybe
= maybe_redirect(&contents
);
306 if let SkipRedirect
= redirect
{
307 return Err(LoadError
::IsRedirect
);
310 entry
.insert(FileEntry
{
311 source
: contents
.clone(),
318 match maybe_redirect
.map(|url
| file
.parent().unwrap().join(url
)) {
319 Some(redirect_file
) => {
320 load_file(cache
, root
, &redirect_file
, FromRedirect(true))
322 None
=> Ok((pretty_file
, contents
)),
326 fn maybe_redirect(source
: &str) -> Option
<String
> {
327 const REDIRECT
: &'
static str = "<p>Redirecting to <a href=";
329 let mut lines
= source
.lines();
330 let redirect_line
= match lines
.nth(6) {
335 redirect_line
.find(REDIRECT
).map(|i
| {
336 let rest
= &redirect_line
[(i
+ REDIRECT
.len() + 1)..];
337 let pos_quote
= rest
.find('
"').unwrap();
338 rest[..pos_quote].to_owned()
342 fn with_attrs_in_source<F: FnMut(&str, usize, &str)>(contents: &str, attr: &str, mut f: F) {
344 for (i, mut line) in contents.lines().enumerate() {
345 while let Some(j) = line.find(attr) {
346 let rest = &line[j + attr.len()..];
347 // The base tag should always be the first link in the document so
348 // we can get away with using one pass.
349 let is_base = line[..j].ends_with("<base
");
351 let pos_equals = match rest.find("=") {
355 if rest[..pos_equals].trim_left_matches(" ") != "" {
359 let rest = &rest[pos_equals + 1..];
361 let pos_quote = match rest.find(&['"'
, '
\''
][..]) {
365 let quote_delim
= rest
.as_bytes()[pos_quote
] as char;
367 if rest
[..pos_quote
].trim_left_matches(" ") != "" {
370 let rest
= &rest
[pos_quote
+ 1..];
371 let url
= match rest
.find(quote_delim
) {
372 Some(i
) => &rest
[..i
],