//! Script to check the validity of `href` links in our HTML documentation.
//!
//! In the past we've been quite error prone to writing in broken links as most
//! of them are manually rather than automatically added. As files move over
//! time or APIs change, old links become stale or broken. The purpose of this
//! script is to check all relative links in our documentation to make sure they
//! actually point to a valid place.
//!
//! Currently this doesn't actually do any HTML parsing or anything fancy like
//! that, it just has a simple "regex" to search for `href` and `id` tags.
//! These values are then translated to file URLs if possible and then the
//! destination is asserted to exist.
//!
//! A few exceptions are allowed as there's known bugs in rustdoc, but this
//! should catch the majority of "broken link" cases.

use std::cell::RefCell;
use std::collections::{HashMap, HashSet};
use std::env;
use std::fs;
use std::io::ErrorKind;
use std::path::{Component, Path, PathBuf};
use std::rc::Rc;
use std::time::Instant;

use once_cell::sync::Lazy;
use regex::Regex;

3dfed10e XL |
29 | // Add linkcheck exceptions here |
30 | // If at all possible you should use intra-doc links to avoid linkcheck issues. These | |
31 | // are cases where that does not work | |
32 | // [(generated_documentation_page, &[broken_links])] | |
c295e0f8 | 33 | #[rustfmt::skip] |
3dfed10e | 34 | const LINKCHECK_EXCEPTIONS: &[(&str, &[&str])] = &[ |
3dfed10e XL |
35 | // These try to link to std::collections, but are defined in alloc |
36 | // https://github.com/rust-lang/rust/issues/74481 | |
37 | ("std/collections/btree_map/struct.BTreeMap.html", &["#insert-and-complex-keys"]), | |
38 | ("std/collections/btree_set/struct.BTreeSet.html", &["#insert-and-complex-keys"]), | |
39 | ("alloc/collections/btree_map/struct.BTreeMap.html", &["#insert-and-complex-keys"]), | |
40 | ("alloc/collections/btree_set/struct.BTreeSet.html", &["#insert-and-complex-keys"]), | |
c295e0f8 XL |
41 | |
42 | // These try to link to various things in std, but are defined in core. | |
43 | // The docs in std::primitive use proper intra-doc links, so these seem fine to special-case. | |
44 | // Most these are broken because liballoc uses `#[lang_item]` magic to define things on | |
45 | // primitives that aren't available in core. | |
46 | ("alloc/slice/trait.Join.html", &["#method.join"]), | |
47 | ("alloc/slice/trait.Concat.html", &["#method.concat"]), | |
48 | ("alloc/slice/index.html", &["#method.concat", "#method.join"]), | |
49 | ("alloc/vec/struct.Vec.html", &["#method.sort_by_key", "#method.sort_by_cached_key"]), | |
50 | ("core/primitive.str.html", &["#method.to_ascii_uppercase", "#method.to_ascii_lowercase"]), | |
51 | ("core/primitive.slice.html", &["#method.to_ascii_uppercase", "#method.to_ascii_lowercase", | |
52 | "core/slice::sort_by_key", "core\\slice::sort_by_key", | |
53 | "#method.sort_by_cached_key"]), | |
3dfed10e XL |
54 | ]; |
55 | ||
29967ef6 XL |
56 | #[rustfmt::skip] |
57 | const INTRA_DOC_LINK_EXCEPTIONS: &[(&str, &[&str])] = &[ | |
29967ef6 XL |
58 | // This is being used in the sense of 'inclusive range', not a markdown link |
59 | ("core/ops/struct.RangeInclusive.html", &["begin</code>, <code>end"]), | |
60 | ("std/ops/struct.RangeInclusive.html", &["begin</code>, <code>end"]), | |
61 | ("core/slice/trait.SliceIndex.html", &["begin</code>, <code>end"]), | |
62 | ("alloc/slice/trait.SliceIndex.html", &["begin</code>, <code>end"]), | |
63 | ("std/slice/trait.SliceIndex.html", &["begin</code>, <code>end"]), | |
3c0e092e XL |
64 | ("core/primitive.str.html", &["begin</code>, <code>end"]), |
65 | ("std/primitive.str.html", &["begin</code>, <code>end"]), | |
29967ef6 XL |
66 | |
67 | ]; | |
68 | ||
69 | static BROKEN_INTRA_DOC_LINK: Lazy<Regex> = | |
70 | Lazy::new(|| Regex::new(r#"\[<code>(.*)</code>\]"#).unwrap()); | |
71 | ||
54a0048b | 72 | macro_rules! t { |
dfeec247 XL |
73 | ($e:expr) => { |
74 | match $e { | |
75 | Ok(e) => e, | |
76 | Err(e) => panic!("{} failed with {:?}", stringify!($e), e), | |
77 | } | |
78 | }; | |
54a0048b SL |
79 | } |
80 | ||
81 | fn main() { | |
17df50a5 | 82 | let docs = env::args_os().nth(1).expect("doc path should be first argument"); |
54a0048b | 83 | let docs = env::current_dir().unwrap().join(docs); |
17df50a5 XL |
84 | let mut checker = Checker { root: docs.clone(), cache: HashMap::new() }; |
85 | let mut report = Report { | |
86 | errors: 0, | |
87 | start: Instant::now(), | |
88 | html_files: 0, | |
89 | html_redirects: 0, | |
90 | links_checked: 0, | |
91 | links_ignored_external: 0, | |
92 | links_ignored_exception: 0, | |
93 | intra_doc_exceptions: 0, | |
94 | }; | |
95 | checker.walk(&docs, &mut report); | |
96 | report.report(); | |
97 | if report.errors != 0 { | |
98 | println!("found some broken links"); | |
99 | std::process::exit(1); | |
54a0048b SL |
100 | } |
101 | } | |
102 | ||
17df50a5 XL |
103 | struct Checker { |
104 | root: PathBuf, | |
105 | cache: Cache, | |
54a0048b SL |
106 | } |
107 | ||
17df50a5 XL |
108 | struct Report { |
109 | errors: u32, | |
110 | start: Instant, | |
111 | html_files: u32, | |
112 | html_redirects: u32, | |
113 | links_checked: u32, | |
114 | links_ignored_external: u32, | |
115 | links_ignored_exception: u32, | |
116 | intra_doc_exceptions: u32, | |
54a0048b SL |
117 | } |
118 | ||
17df50a5 XL |
119 | /// A cache entry. |
120 | enum FileEntry { | |
121 | /// An HTML file. | |
122 | /// | |
123 | /// This includes the contents of the HTML file, and an optional set of | |
94222f64 | 124 | /// HTML IDs. The IDs are used for checking fragments. They are computed |
17df50a5 XL |
125 | /// as-needed. The source is discarded (replaced with an empty string) |
126 | /// after the file has been checked, to conserve on memory. | |
127 | HtmlFile { source: Rc<String>, ids: RefCell<HashSet<String>> }, | |
128 | /// This file is an HTML redirect to the given local path. | |
129 | Redirect { target: PathBuf }, | |
130 | /// This is not an HTML file. | |
131 | OtherFile, | |
132 | /// This is a directory. | |
133 | Dir, | |
134 | /// The file doesn't exist. | |
135 | Missing, | |
54a0048b SL |
136 | } |
137 | ||
17df50a5 XL |
138 | /// A cache to speed up file access. |
139 | type Cache = HashMap<String, FileEntry>; | |
54a0048b | 140 | |
abe05a73 XL |
141 | fn small_url_encode(s: &str) -> String { |
142 | s.replace("<", "%3C") | |
dfeec247 XL |
143 | .replace(">", "%3E") |
144 | .replace(" ", "%20") | |
145 | .replace("?", "%3F") | |
146 | .replace("'", "%27") | |
147 | .replace("&", "%26") | |
148 | .replace(",", "%2C") | |
149 | .replace(":", "%3A") | |
150 | .replace(";", "%3B") | |
151 | .replace("[", "%5B") | |
152 | .replace("]", "%5D") | |
153 | .replace("\"", "%22") | |
abe05a73 XL |
154 | } |
155 | ||
17df50a5 XL |
156 | impl Checker { |
157 | /// Primary entry point for walking the filesystem to find HTML files to check. | |
158 | fn walk(&mut self, dir: &Path, report: &mut Report) { | |
159 | for entry in t!(dir.read_dir()).map(|e| t!(e)) { | |
160 | let path = entry.path(); | |
5099ac24 FG |
161 | // Goes through symlinks |
162 | let metadata = t!(fs::metadata(&path)); | |
163 | if metadata.is_dir() { | |
17df50a5 XL |
164 | self.walk(&path, report); |
165 | } else { | |
166 | self.check(&path, report); | |
54a0048b SL |
167 | } |
168 | } | |
54a0048b | 169 | } |
29967ef6 | 170 | |
17df50a5 XL |
171 | /// Checks a single file. |
172 | fn check(&mut self, file: &Path, report: &mut Report) { | |
173 | let (pretty_path, entry) = self.load_file(file, report); | |
174 | let source = match entry { | |
175 | FileEntry::Missing => panic!("missing file {:?} while walking", file), | |
176 | FileEntry::Dir => unreachable!("never with `check` path"), | |
177 | FileEntry::OtherFile => return, | |
178 | FileEntry::Redirect { .. } => return, | |
179 | FileEntry::HtmlFile { source, ids } => { | |
180 | parse_ids(&mut ids.borrow_mut(), &pretty_path, source, report); | |
181 | source.clone() | |
6c58768f | 182 | } |
17df50a5 | 183 | }; |
cc61c64b | 184 | |
17df50a5 XL |
185 | // Search for anything that's the regex 'href[ ]*=[ ]*".*?"' |
186 | with_attrs_in_source(&source, " href", |url, i, base| { | |
187 | // Ignore external URLs | |
188 | if url.starts_with("http:") | |
189 | || url.starts_with("https:") | |
190 | || url.starts_with("javascript:") | |
191 | || url.starts_with("ftp:") | |
192 | || url.starts_with("irc:") | |
193 | || url.starts_with("data:") | |
2b03887a | 194 | || url.starts_with("mailto:") |
17df50a5 XL |
195 | { |
196 | report.links_ignored_external += 1; | |
197 | return; | |
198 | } | |
199 | report.links_checked += 1; | |
200 | let (url, fragment) = match url.split_once('#') { | |
201 | None => (url, None), | |
202 | Some((url, fragment)) => (url, Some(fragment)), | |
203 | }; | |
204 | // NB: the `splitn` always succeeds, even if the delimiter is not present. | |
205 | let url = url.splitn(2, '?').next().unwrap(); | |
206 | ||
207 | // Once we've plucked out the URL, parse it using our base url and | |
208 | // then try to extract a file path. | |
209 | let mut path = file.to_path_buf(); | |
210 | if !base.is_empty() || !url.is_empty() { | |
211 | path.pop(); | |
212 | for part in Path::new(base).join(url).components() { | |
213 | match part { | |
214 | Component::Prefix(_) | Component::RootDir => { | |
215 | // Avoid absolute paths as they make the docs not | |
216 | // relocatable by making assumptions on where the docs | |
217 | // are hosted relative to the site root. | |
218 | report.errors += 1; | |
219 | println!( | |
220 | "{}:{}: absolute path - {}", | |
221 | pretty_path, | |
222 | i + 1, | |
223 | Path::new(base).join(url).display() | |
224 | ); | |
225 | return; | |
226 | } | |
227 | Component::CurDir => {} | |
228 | Component::ParentDir => { | |
229 | path.pop(); | |
230 | } | |
231 | Component::Normal(s) => { | |
232 | path.push(s); | |
233 | } | |
234 | } | |
235 | } | |
236 | } | |
54a0048b | 237 | |
17df50a5 XL |
238 | let (target_pretty_path, target_entry) = self.load_file(&path, report); |
239 | let (target_source, target_ids) = match target_entry { | |
240 | FileEntry::Missing => { | |
241 | if is_exception(file, &target_pretty_path) { | |
242 | report.links_ignored_exception += 1; | |
243 | } else { | |
244 | report.errors += 1; | |
dfeec247 | 245 | println!( |
17df50a5 XL |
246 | "{}:{}: broken link - `{}`", |
247 | pretty_path, | |
dfeec247 | 248 | i + 1, |
17df50a5 | 249 | target_pretty_path |
dfeec247 | 250 | ); |
dfeec247 | 251 | } |
7cac9316 XL |
252 | return; |
253 | } | |
17df50a5 XL |
254 | FileEntry::Dir => { |
255 | // Links to directories show as directory listings when viewing | |
256 | // the docs offline so it's best to avoid them. | |
257 | report.errors += 1; | |
dfeec247 | 258 | println!( |
17df50a5 XL |
259 | "{}:{}: directory link to `{}` \ |
260 | (directory links should use index.html instead)", | |
261 | pretty_path, | |
dfeec247 | 262 | i + 1, |
17df50a5 | 263 | target_pretty_path |
dfeec247 | 264 | ); |
54a0048b SL |
265 | return; |
266 | } | |
17df50a5 XL |
267 | FileEntry::OtherFile => return, |
268 | FileEntry::Redirect { target } => { | |
269 | let t = target.clone(); | |
270 | drop(target); | |
271 | let (target, redir_entry) = self.load_file(&t, report); | |
272 | match redir_entry { | |
273 | FileEntry::Missing => { | |
274 | report.errors += 1; | |
275 | println!( | |
276 | "{}:{}: broken redirect from `{}` to `{}`", | |
277 | pretty_path, | |
278 | i + 1, | |
279 | target_pretty_path, | |
280 | target | |
281 | ); | |
282 | return; | |
283 | } | |
284 | FileEntry::Redirect { target } => { | |
285 | // Redirect to a redirect, this link checker | |
286 | // currently doesn't support this, since it would | |
287 | // require cycle checking, etc. | |
288 | report.errors += 1; | |
289 | println!( | |
290 | "{}:{}: redirect from `{}` to `{}` \ | |
291 | which is also a redirect (not supported)", | |
292 | pretty_path, | |
293 | i + 1, | |
294 | target_pretty_path, | |
295 | target.display() | |
296 | ); | |
297 | return; | |
298 | } | |
299 | FileEntry::Dir => { | |
300 | report.errors += 1; | |
301 | println!( | |
302 | "{}:{}: redirect from `{}` to `{}` \ | |
303 | which is a directory \ | |
304 | (directory links should use index.html instead)", | |
305 | pretty_path, | |
306 | i + 1, | |
307 | target_pretty_path, | |
308 | target | |
309 | ); | |
310 | return; | |
311 | } | |
312 | FileEntry::OtherFile => return, | |
313 | FileEntry::HtmlFile { source, ids } => (source, ids), | |
314 | } | |
315 | } | |
316 | FileEntry::HtmlFile { source, ids } => (source, ids), | |
54a0048b SL |
317 | }; |
318 | ||
17df50a5 XL |
319 | // Alright, if we've found an HTML file for the target link. If |
320 | // this is a fragment link, also check that the `id` exists. | |
476ff2be | 321 | if let Some(ref fragment) = fragment { |
54a0048b SL |
322 | // Fragments like `#1-6` are most likely line numbers to be |
323 | // interpreted by javascript, so we're ignoring these | |
dfeec247 | 324 | if fragment.splitn(2, '-').all(|f| f.chars().all(|c| c.is_numeric())) { |
54a0048b SL |
325 | return; |
326 | } | |
327 | ||
17df50a5 XL |
328 | parse_ids(&mut target_ids.borrow_mut(), &pretty_path, target_source, report); |
329 | ||
330 | if target_ids.borrow().contains(*fragment) { | |
331 | return; | |
332 | } | |
54a0048b | 333 | |
17df50a5 XL |
334 | if is_exception(file, &format!("#{}", fragment)) { |
335 | report.links_ignored_exception += 1; | |
336 | } else { | |
337 | report.errors += 1; | |
338 | print!("{}:{}: broken link fragment ", pretty_path, i + 1); | |
136023e0 | 339 | println!("`#{}` pointing to `{}`", fragment, target_pretty_path); |
54a0048b SL |
340 | }; |
341 | } | |
17df50a5 XL |
342 | }); |
343 | ||
6522a427 EL |
344 | self.check_intra_doc_links(file, &pretty_path, &source, report); |
345 | ||
346 | // we don't need the source anymore, | |
347 | // so drop to reduce memory-usage | |
348 | match self.cache.get_mut(&pretty_path).unwrap() { | |
349 | FileEntry::HtmlFile { source, .. } => *source = Rc::new(String::new()), | |
350 | _ => unreachable!("must be html file"), | |
351 | } | |
352 | } | |
353 | ||
354 | fn check_intra_doc_links( | |
355 | &mut self, | |
356 | file: &Path, | |
357 | pretty_path: &str, | |
358 | source: &str, | |
359 | report: &mut Report, | |
360 | ) { | |
361 | let relative = file.strip_prefix(&self.root).expect("should always be relative to root"); | |
362 | // Don't check the reference. It has several legitimate things that | |
363 | // look like [<code>…</code>]. The reference has its own broken link | |
364 | // checker in its CI which handles this using pulldown_cmark. | |
365 | // | |
366 | // This checks both the end of the root (when checking just the | |
367 | // reference directory) or the beginning (when checking all docs). | |
368 | if self.root.ends_with("reference") || relative.starts_with("reference") { | |
369 | return; | |
370 | } | |
17df50a5 XL |
371 | // Search for intra-doc links that rustdoc didn't warn about |
372 | // FIXME(#77199, 77200) Rustdoc should just warn about these directly. | |
373 | // NOTE: only looks at one line at a time; in practice this should find most links | |
374 | for (i, line) in source.lines().enumerate() { | |
375 | for broken_link in BROKEN_INTRA_DOC_LINK.captures_iter(line) { | |
376 | if is_intra_doc_exception(file, &broken_link[1]) { | |
377 | report.intra_doc_exceptions += 1; | |
378 | } else { | |
379 | report.errors += 1; | |
380 | print!("{}:{}: broken intra-doc link - ", pretty_path, i + 1); | |
381 | println!("{}", &broken_link[0]); | |
382 | } | |
3dfed10e | 383 | } |
54a0048b | 384 | } |
29967ef6 | 385 | } |
54a0048b | 386 | |
17df50a5 XL |
387 | /// Load a file from disk, or from the cache if available. |
388 | fn load_file(&mut self, file: &Path, report: &mut Report) -> (String, &FileEntry) { | |
c295e0f8 XL |
389 | // https://docs.microsoft.com/en-us/windows/win32/debug/system-error-codes--0-499- |
390 | #[cfg(windows)] | |
391 | const ERROR_INVALID_NAME: i32 = 123; | |
392 | ||
17df50a5 XL |
393 | let pretty_path = |
394 | file.strip_prefix(&self.root).unwrap_or(&file).to_str().unwrap().to_string(); | |
395 | ||
396 | let entry = | |
397 | self.cache.entry(pretty_path.clone()).or_insert_with(|| match fs::metadata(file) { | |
398 | Ok(metadata) if metadata.is_dir() => FileEntry::Dir, | |
399 | Ok(_) => { | |
400 | if file.extension().and_then(|s| s.to_str()) != Some("html") { | |
401 | FileEntry::OtherFile | |
a1dfa0c6 | 402 | } else { |
17df50a5 XL |
403 | report.html_files += 1; |
404 | load_html_file(file, report) | |
405 | } | |
54a0048b | 406 | } |
17df50a5 XL |
407 | Err(e) if e.kind() == ErrorKind::NotFound => FileEntry::Missing, |
408 | Err(e) => { | |
c295e0f8 XL |
409 | // If a broken intra-doc link contains `::`, on windows, it will cause `ERROR_INVALID_NAME` rather than `NotFound`. |
410 | // Explicitly check for that so that the broken link can be allowed in `LINKCHECK_EXCEPTIONS`. | |
411 | #[cfg(windows)] | |
412 | if e.raw_os_error() == Some(ERROR_INVALID_NAME) | |
413 | && file.as_os_str().to_str().map_or(false, |s| s.contains("::")) | |
414 | { | |
415 | return FileEntry::Missing; | |
416 | } | |
17df50a5 | 417 | panic!("unexpected read error for {}: {}", file.display(), e); |
54a0048b | 418 | } |
17df50a5 XL |
419 | }); |
420 | (pretty_path, entry) | |
421 | } | |
422 | } | |
423 | ||
424 | impl Report { | |
425 | fn report(&self) { | |
426 | println!("checked links in: {:.1}s", self.start.elapsed().as_secs_f64()); | |
427 | println!("number of HTML files scanned: {}", self.html_files); | |
428 | println!("number of HTML redirects found: {}", self.html_redirects); | |
429 | println!("number of links checked: {}", self.links_checked); | |
430 | println!("number of links ignored due to external: {}", self.links_ignored_external); | |
431 | println!("number of links ignored due to exceptions: {}", self.links_ignored_exception); | |
432 | println!("number of intra doc links ignored: {}", self.intra_doc_exceptions); | |
433 | println!("errors found: {}", self.errors); | |
434 | } | |
435 | } | |
436 | ||
437 | fn load_html_file(file: &Path, report: &mut Report) -> FileEntry { | |
438 | let source = match fs::read_to_string(file) { | |
439 | Ok(s) => Rc::new(s), | |
440 | Err(err) => { | |
441 | // This usually should not fail since `metadata` was already | |
442 | // called successfully on this file. | |
443 | panic!("unexpected read error for {}: {}", file.display(), err); | |
3157f602 | 444 | } |
54a0048b | 445 | }; |
17df50a5 XL |
446 | match maybe_redirect(&source) { |
447 | Some(target) => { | |
448 | report.html_redirects += 1; | |
449 | let target = file.parent().unwrap().join(target); | |
450 | FileEntry::Redirect { target } | |
451 | } | |
452 | None => FileEntry::HtmlFile { source: source.clone(), ids: RefCell::new(HashSet::new()) }, | |
453 | } | |
454 | } | |
455 | ||
456 | fn is_intra_doc_exception(file: &Path, link: &str) -> bool { | |
457 | if let Some(entry) = INTRA_DOC_LINK_EXCEPTIONS.iter().find(|&(f, _)| file.ends_with(f)) { | |
458 | entry.1.is_empty() || entry.1.contains(&link) | |
459 | } else { | |
460 | false | |
54a0048b SL |
461 | } |
462 | } | |
463 | ||
17df50a5 XL |
464 | fn is_exception(file: &Path, link: &str) -> bool { |
465 | if let Some(entry) = LINKCHECK_EXCEPTIONS.iter().find(|&(f, _)| file.ends_with(f)) { | |
466 | entry.1.contains(&link) | |
467 | } else { | |
468 | // FIXME(#63351): Concat trait in alloc/slice reexported in primitive page | |
469 | // | |
470 | // NOTE: This cannot be added to `LINKCHECK_EXCEPTIONS` because the resolved path | |
471 | // calculated in `check` function is outside `build/<triple>/doc` dir. | |
472 | // So the `strip_prefix` method just returns the old absolute broken path. | |
473 | if file.ends_with("std/primitive.slice.html") { | |
474 | if link.ends_with("primitive.slice.html") { | |
475 | return true; | |
476 | } | |
477 | } | |
478 | false | |
479 | } | |
480 | } | |
481 | ||
482 | /// If the given HTML file contents is an HTML redirect, this returns the | |
483 | /// destination path given in the redirect. | |
54a0048b | 484 | fn maybe_redirect(source: &str) -> Option<String> { |
5e7ed085 FG |
485 | const REDIRECT_RUSTDOC: (usize, &str) = (7, "<p>Redirecting to <a href="); |
486 | const REDIRECT_MDBOOK: (usize, &str) = (8 - 7, "<p>Redirecting to... <a href="); | |
54a0048b SL |
487 | |
488 | let mut lines = source.lines(); | |
54a0048b | 489 | |
5e7ed085 FG |
490 | let mut find_redirect = |(line_rel, redirect_pattern): (usize, &str)| { |
491 | let redirect_line = lines.nth(line_rel)?; | |
492 | ||
493 | redirect_line.find(redirect_pattern).map(|i| { | |
494 | let rest = &redirect_line[(i + redirect_pattern.len() + 1)..]; | |
495 | let pos_quote = rest.find('"').unwrap(); | |
496 | rest[..pos_quote].to_owned() | |
497 | }) | |
498 | }; | |
499 | ||
500 | find_redirect(REDIRECT_RUSTDOC).or_else(|| find_redirect(REDIRECT_MDBOOK)) | |
54a0048b SL |
501 | } |
502 | ||
17df50a5 | 503 | fn with_attrs_in_source<F: FnMut(&str, usize, &str)>(source: &str, attr: &str, mut f: F) { |
7cac9316 | 504 | let mut base = ""; |
17df50a5 | 505 | for (i, mut line) in source.lines().enumerate() { |
54a0048b | 506 | while let Some(j) = line.find(attr) { |
3157f602 | 507 | let rest = &line[j + attr.len()..]; |
7cac9316 XL |
508 | // The base tag should always be the first link in the document so |
509 | // we can get away with using one pass. | |
510 | let is_base = line[..j].ends_with("<base"); | |
54a0048b | 511 | line = rest; |
3dfed10e | 512 | let pos_equals = match rest.find('=') { |
54a0048b SL |
513 | Some(i) => i, |
514 | None => continue, | |
515 | }; | |
3dfed10e | 516 | if rest[..pos_equals].trim_start_matches(' ') != "" { |
3157f602 | 517 | continue; |
54a0048b SL |
518 | } |
519 | ||
520 | let rest = &rest[pos_equals + 1..]; | |
521 | ||
522 | let pos_quote = match rest.find(&['"', '\''][..]) { | |
523 | Some(i) => i, | |
524 | None => continue, | |
525 | }; | |
526 | let quote_delim = rest.as_bytes()[pos_quote] as char; | |
527 | ||
3dfed10e | 528 | if rest[..pos_quote].trim_start_matches(' ') != "" { |
3157f602 | 529 | continue; |
54a0048b SL |
530 | } |
531 | let rest = &rest[pos_quote + 1..]; | |
532 | let url = match rest.find(quote_delim) { | |
533 | Some(i) => &rest[..i], | |
534 | None => continue, | |
535 | }; | |
7cac9316 XL |
536 | if is_base { |
537 | base = url; | |
538 | continue; | |
539 | } | |
540 | f(url, i, base) | |
54a0048b SL |
541 | } |
542 | } | |
543 | } | |
17df50a5 XL |
544 | |
545 | fn parse_ids(ids: &mut HashSet<String>, file: &str, source: &str, report: &mut Report) { | |
546 | if ids.is_empty() { | |
547 | with_attrs_in_source(source, " id", |fragment, i, _| { | |
548 | let frag = fragment.trim_start_matches("#").to_owned(); | |
549 | let encoded = small_url_encode(&frag); | |
550 | if !ids.insert(frag) { | |
551 | report.errors += 1; | |
552 | println!("{}:{}: id is not unique: `{}`", file, i, fragment); | |
553 | } | |
554 | // Just in case, we also add the encoded id. | |
555 | ids.insert(encoded); | |
556 | }); | |
557 | } | |
558 | } |