]> git.proxmox.com Git - rustc.git/blame - vendor/pulldown-cmark-0.7.2/src/escape.rs
New upstream version 1.58.1+dfsg1
[rustc.git] / vendor / pulldown-cmark-0.7.2 / src / escape.rs
CommitLineData
1b1a35ee
XL
1// Copyright 2015 Google Inc. All rights reserved.\r
2//\r
3// Permission is hereby granted, free of charge, to any person obtaining a copy\r
4// of this software and associated documentation files (the "Software"), to deal\r
5// in the Software without restriction, including without limitation the rights\r
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\r
7// copies of the Software, and to permit persons to whom the Software is\r
8// furnished to do so, subject to the following conditions:\r
9//\r
10// The above copyright notice and this permission notice shall be included in\r
11// all copies or substantial portions of the Software.\r
12//\r
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\r
19// THE SOFTWARE.\r
20\r
21//! Utility functions for HTML escaping\r
22\r
23use std::io;\r
24use std::str::from_utf8;\r
25\r
26use crate::html::StrWrite;\r
27\r
/// Lookup table indexed by ASCII byte value: `1` means the byte may be written
/// into an href unchanged, `0` means `escape_href` must escape it.
///
/// The table only covers `0x00..=0x7F`; callers must treat bytes `>= 0x80`
/// as unsafe before indexing.
static HREF_SAFE: [u8; 128] = {
    // Every ASCII byte that is passed through verbatim. All control bytes,
    // space, and the punctuation not listed here are marked unsafe.
    const SAFE_BYTES: &[u8] =
        b"!#$%()*+,-./0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz";

    let mut table = [0u8; 128];
    let mut k = 0;
    // `for` loops are not available in constant evaluation, hence `while`.
    while k < SAFE_BYTES.len() {
        table[SAFE_BYTES[k] as usize] = 1;
        k += 1;
    }
    table
};
39\r
/// Upper-case hexadecimal digits, indexed by nibble value; used to build
/// `%XX` percent-encoded triples.
static HEX_CHARS: &[u8] = b"0123456789ABCDEF";
/// HTML character reference written in place of a literal `&` in an href.
/// It must not be the bare character itself, otherwise the "escape" is a
/// no-op and the ampersand reaches the attribute value unescaped.
static AMP_ESCAPE: &str = "&amp;";
/// HTML character reference written in place of a literal `'` in an href, so
/// the value cannot terminate a single-quoted attribute.
static SLASH_ESCAPE: &str = "&#x27;";
43\r
44pub(crate) fn escape_href<W>(mut w: W, s: &str) -> io::Result<()>\r
45where\r
46 W: StrWrite,\r
47{\r
48 let bytes = s.as_bytes();\r
49 let mut mark = 0;\r
50 for i in 0..bytes.len() {\r
51 let c = bytes[i];\r
52 if c >= 0x80 || HREF_SAFE[c as usize] == 0 {\r
53 // character needing escape\r
54\r
55 // write partial substring up to mark\r
56 if mark < i {\r
57 w.write_str(&s[mark..i])?;\r
58 }\r
59 match c {\r
60 b'&' => {\r
61 w.write_str(AMP_ESCAPE)?;\r
62 }\r
63 b'\'' => {\r
64 w.write_str(SLASH_ESCAPE)?;\r
65 }\r
66 _ => {\r
67 let mut buf = [0u8; 3];\r
68 buf[0] = b'%';\r
69 buf[1] = HEX_CHARS[((c as usize) >> 4) & 0xF];\r
70 buf[2] = HEX_CHARS[(c as usize) & 0xF];\r
71 let escaped = from_utf8(&buf).unwrap();\r
72 w.write_str(escaped)?;\r
73 }\r
74 }\r
75 mark = i + 1; // all escaped characters are ASCII\r
76 }\r
77 }\r
78 w.write_str(&s[mark..])\r
79}\r
80\r
/// Builds the byte-indexed table that maps each HTML special byte to the
/// index of its replacement in `HTML_ESCAPES`. Every other byte maps to 0,
/// which means "no escape needed".
const fn create_html_escape_table() -> [u8; 256] {
    let mut escape_index = [0u8; 256];
    escape_index[b'>' as usize] = 4;
    escape_index[b'<' as usize] = 3;
    escape_index[b'&' as usize] = 2;
    escape_index[b'"' as usize] = 1;
    escape_index
}

/// Escape lookup table, evaluated at compile time.
static HTML_ESCAPE_TABLE: [u8; 256] = create_html_escape_table();

/// Replacement strings, indexed by the values stored in `HTML_ESCAPE_TABLE`.
/// Slot 0 is the unused "no escape" entry.
static HTML_ESCAPES: [&str; 5] = ["", "&quot;", "&amp;", "&lt;", "&gt;"];
93\r
94/// Writes the given string to the Write sink, replacing special HTML bytes\r
95/// (<, >, &, ") by escape sequences.\r
96pub(crate) fn escape_html<W: StrWrite>(w: W, s: &str) -> io::Result<()> {\r
97 #[cfg(all(target_arch = "x86_64", feature = "simd"))]\r
98 {\r
99 simd::escape_html(w, s)\r
100 }\r
101 #[cfg(not(all(target_arch = "x86_64", feature = "simd")))]\r
102 {\r
103 escape_html_scalar(w, s)\r
104 }\r
105}\r
106\r
107fn escape_html_scalar<W: StrWrite>(mut w: W, s: &str) -> io::Result<()> {\r
108 let bytes = s.as_bytes();\r
109 let mut mark = 0;\r
110 let mut i = 0;\r
111 while i < s.len() {\r
112 match bytes[i..]\r
113 .iter()\r
114 .position(|&c| HTML_ESCAPE_TABLE[c as usize] != 0)\r
115 {\r
116 Some(pos) => {\r
117 i += pos;\r
118 }\r
119 None => break,\r
120 }\r
121 let c = bytes[i];\r
122 let escape = HTML_ESCAPE_TABLE[c as usize];\r
123 let escape_seq = HTML_ESCAPES[escape as usize];\r
124 w.write_str(&s[mark..i])?;\r
125 w.write_str(escape_seq)?;\r
126 i += 1;\r
127 mark = i; // all escaped characters are ASCII\r
128 }\r
129 w.write_str(&s[mark..])\r
130}\r
131\r
/// SSSE3-accelerated HTML escaping. Only compiled on x86_64 with the `simd`
/// feature enabled.
#[cfg(all(target_arch = "x86_64", feature = "simd"))]
mod simd {
    use crate::html::StrWrite;
    use std::arch::x86_64::*;
    use std::io;
    use std::mem::size_of;

    /// Number of bytes examined per SIMD step (the size of one `__m128i`).
    const VECTOR_SIZE: usize = size_of::<__m128i>();

    /// Shuffle lookup table used by `compute_mask`, evaluated at compile time
    /// so it is not rebuilt on every call.
    static LOOKUP: [u8; 16] = create_lookup();

    /// Writes `s` to `w`, replacing the HTML special bytes (`<`, `>`, `&`, `"`)
    /// by their escape sequences.
    ///
    /// Uses the SIMD scanner when the CPU reports SSSE3 support and `s` is at
    /// least `VECTOR_SIZE` bytes long; otherwise falls back to the scalar code.
    pub(crate) fn escape_html<W: StrWrite>(mut w: W, s: &str) -> io::Result<()> {
        // The SIMD accelerated code uses the PSHUFB instruction, which is part
        // of the SSSE3 instruction set. Further, we can only use this code if
        // the buffer is at least one VECTOR_SIZE in length to prevent reading
        // out of bounds. If either of these conditions is not met, we fall back
        // to scalar code.
        if is_x86_feature_detected!("ssse3") && s.len() >= VECTOR_SIZE {
            let bytes = s.as_bytes();
            let mut mark = 0;

            // SAFETY: SSSE3 availability and `bytes.len() >= VECTOR_SIZE` were
            // both checked above, which is what `foreach_special_simd` requires.
            // The callback only receives indices of HTML special bytes inside
            // `bytes`, so `i` is in bounds. Those bytes are ASCII, hence `i`
            // and `i + 1` are char boundaries and the unchecked `str` slices
            // are valid UTF-8.
            unsafe {
                foreach_special_simd(bytes, 0, |i| {
                    let escape_ix = *bytes.get_unchecked(i) as usize;
                    let replacement =
                        super::HTML_ESCAPES[super::HTML_ESCAPE_TABLE[escape_ix] as usize];
                    w.write_str(s.get_unchecked(mark..i))?;
                    mark = i + 1; // all escaped characters are ASCII
                    w.write_str(replacement)
                })?;
                w.write_str(s.get_unchecked(mark..))
            }
        } else {
            super::escape_html_scalar(w, s)
        }
    }

    /// Creates the lookup table for use in `compute_mask`.
    ///
    /// Each HTML special byte is stored at the index given by its lower
    /// nibble. Slot 0 holds 127 so that bytes with a zero lower nibble (in
    /// particular NUL) can never compare equal to their looked-up value.
    const fn create_lookup() -> [u8; 16] {
        let mut table = [0; 16];
        table[(b'<' & 0x0f) as usize] = b'<';
        table[(b'>' & 0x0f) as usize] = b'>';
        table[(b'&' & 0x0f) as usize] = b'&';
        table[(b'"' & 0x0f) as usize] = b'"';
        table[0] = 0b0111_1111;
        table
    }

    /// Computes a byte mask at given offset in the byte buffer. Its first 16 (least significant)
    /// bits correspond to whether there is an HTML special byte (&, <, ", >) at the 16 bytes
    /// `bytes[offset..]`. For example, the mask `(1 << 3)` states that there is an HTML byte
    /// at `offset + 3`.
    ///
    /// # Safety
    ///
    /// The CPU must support SSSE3 and `bytes.len() >= offset + VECTOR_SIZE`
    /// must hold, otherwise the vector load reads out of bounds.
    #[target_feature(enable = "ssse3")]
    unsafe fn compute_mask(bytes: &[u8], offset: usize) -> i32 {
        debug_assert!(bytes.len() >= offset + VECTOR_SIZE);

        let lookup = _mm_loadu_si128(LOOKUP.as_ptr() as *const __m128i);
        let raw_ptr = bytes.as_ptr().add(offset) as *const __m128i;

        // Load the vector from memory.
        let vector = _mm_loadu_si128(raw_ptr);
        // We take the least significant 4 bits of every byte and use them as indices
        // to map into the lookup vector.
        // Note that the shuffle yields 0 for bytes with their most significant bit set.
        // Bytes that share their lower nibble with an HTML special byte get mapped to that
        // corresponding special byte. Note that all HTML special bytes have distinct lower
        // nibbles. Bytes with a zero lower nibble get mapped to lookup[0] = 127, and all
        // remaining bytes get mapped to 0.
        let expected = _mm_shuffle_epi8(lookup, vector);
        // We compare the original vector to the mapped output. Bytes that shared a lower
        // nibble with an HTML special byte match *only* if they are that special byte. Bytes
        // with a zero lower nibble were mapped to 127, whose lower nibble is non-zero, and
        // will hence never match. Bytes with their most significant bit set are non-zero
        // but were mapped to 0, and all other bytes have non-zero lower nibbles but were
        // also mapped to 0, so neither group matches.
        let matches = _mm_cmpeq_epi8(expected, vector);

        // Translate matches to a bitmask, where every 1 corresponds to a HTML special character
        // and a 0 is a non-HTML byte.
        _mm_movemask_epi8(matches)
    }

    /// Calls the given function with the index of every byte in the given byteslice
    /// that is either ", &, <, or > and for no other byte. Scanning starts at
    /// `offset`; indices are reported in increasing order, and the first error
    /// returned by the callback aborts the scan and is propagated.
    ///
    /// # Safety
    ///
    /// The CPU must support SSSE3 and `bytes.len() >= VECTOR_SIZE` (16) must
    /// hold; undefined behaviour may occur otherwise. `offset` must not exceed
    /// `bytes.len()`.
    #[target_feature(enable = "ssse3")]
    unsafe fn foreach_special_simd<F>(
        bytes: &[u8],
        mut offset: usize,
        mut callback: F,
    ) -> io::Result<()>
    where
        F: FnMut(usize) -> io::Result<()>,
    {
        // The strategy here is to walk the byte buffer in chunks of VECTOR_SIZE (16)
        // bytes at a time starting at the given offset. For each chunk, we compute
        // a bitmask indicating whether the corresponding byte is a HTML special byte.
        // We then iterate over all the 1 bits in this mask and call the callback function
        // with the corresponding index in the buffer.
        // When the number of HTML special bytes in the buffer is relatively low, this
        // allows us to quickly go through the buffer without a lookup for every
        // single byte.

        debug_assert!(bytes.len() >= VECTOR_SIZE);
        debug_assert!(offset <= bytes.len());
        let upperbound = bytes.len() - VECTOR_SIZE;
        while offset < upperbound {
            let mut mask = compute_mask(bytes, offset);
            while mask != 0 {
                let ix = mask.trailing_zeros();
                callback(offset + ix as usize)?;
                // Clear the lowest set bit.
                mask ^= mask & -mask;
            }
            offset += VECTOR_SIZE;
        }

        // Final iteration. We align the read with the end of the slice and
        // shift off the bytes at start we have already scanned.
        let mut mask = compute_mask(bytes, upperbound);
        mask >>= offset - upperbound;
        while mask != 0 {
            let ix = mask.trailing_zeros();
            callback(offset + ix as usize)?;
            // Clear the lowest set bit.
            mask ^= mask & -mask;
        }
        Ok(())
    }

    #[cfg(test)]
    mod html_scan_tests {
        #[test]
        fn multichunk() {
            // Calling a `#[target_feature]` function on a CPU without that
            // feature is undefined behaviour, so skip on such machines.
            if !is_x86_feature_detected!("ssse3") {
                return;
            }
            let mut vec = Vec::new();
            // SAFETY: SSSE3 was detected above and the input is 20 bytes long.
            unsafe {
                super::foreach_special_simd("&aXaaaa.a'aa9a<>aab&".as_bytes(), 0, |ix| {
                    vec.push(ix);
                    Ok(())
                })
                .unwrap();
            }
            assert_eq!(vec, vec![0, 14, 15, 19]);
        }

        // only match these bytes, and when we match them, match them VECTOR_SIZE times
        #[test]
        fn only_right_bytes_matched() {
            if !is_x86_feature_detected!("ssse3") {
                return;
            }
            // Inclusive range so that 0xFF is exercised as well.
            for b in 0..=255u8 {
                let right_byte = b == b'&' || b == b'<' || b == b'>' || b == b'"';
                let vek = vec![b; super::VECTOR_SIZE];
                let mut match_count = 0;
                // SAFETY: SSSE3 was detected above and `vek` holds exactly
                // VECTOR_SIZE bytes.
                unsafe {
                    super::foreach_special_simd(&vek, 0, |_| {
                        match_count += 1;
                        Ok(())
                    })
                    .unwrap();
                }
                assert!((match_count > 0) == (match_count == super::VECTOR_SIZE));
                assert_eq!(
                    (match_count == super::VECTOR_SIZE),
                    right_byte,
                    "match_count: {}, byte: {:?}",
                    match_count,
                    b as char
                );
            }
        }
    }
}