]> git.proxmox.com Git - rustc.git/blob - vendor/pulldown-cmark-0.5.3/src/escape.rs
New upstream version 1.42.0+dfsg1
[rustc.git] / vendor / pulldown-cmark-0.5.3 / src / escape.rs
1 // Copyright 2015 Google Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20
21 //! Utility functions for HTML escaping
22
23 use std::io;
24 use std::str::from_utf8;
25
26 use crate::html::StrWrite;
27
/// Lookup table over the ASCII range used by `escape_href`: an entry of 1
/// means the byte is written to the href verbatim, an entry of 0 means it is
/// replaced by an escape. The verbatim set is the ASCII letters and digits
/// plus `! # $ % ( ) * + , - . / : ; = ? @ _`.
#[rustfmt::skip]
static HREF_SAFE: [u8; 128] = {
    // Markers scoped to this initializer so the rows read as a picture.
    const ESC: u8 = 0; // byte must be escaped
    const LIT: u8 = 1; // byte is copied literally
    [
        // 0x00..=0x0F: control characters
        ESC, ESC, ESC, ESC, ESC, ESC, ESC, ESC, ESC, ESC, ESC, ESC, ESC, ESC, ESC, ESC,
        // 0x10..=0x1F: control characters
        ESC, ESC, ESC, ESC, ESC, ESC, ESC, ESC, ESC, ESC, ESC, ESC, ESC, ESC, ESC, ESC,
        // 0x20..=0x2F: space ! " # $ % & ' ( ) * + , - . /
        ESC, LIT, ESC, LIT, LIT, LIT, ESC, ESC, LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT,
        // 0x30..=0x3F: 0-9 : ; < = > ?
        LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT, ESC, LIT, ESC, LIT,
        // 0x40..=0x4F: @ A-O
        LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT,
        // 0x50..=0x5F: P-Z [ \ ] ^ _
        LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT, ESC, ESC, ESC, ESC, LIT,
        // 0x60..=0x6F: ` a-o
        ESC, LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT,
        // 0x70..=0x7F: p-z { | } ~ DEL
        LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT, LIT, ESC, ESC, ESC, ESC, ESC,
    ]
};
39
/// Uppercase hexadecimal digits indexed by nibble value; used to build the
/// `%XX` percent-encoding of a byte.
static HEX_CHARS: &[u8] = b"0123456789ABCDEF";
/// HTML entity emitted in place of `&` inside an href attribute value.
/// Writing the raw character back would leave the ampersand unescaped.
static AMP_ESCAPE: &str = "&amp;";
/// HTML entity emitted in place of `'` inside an href attribute value, so a
/// single quote in the URL cannot terminate a single-quoted attribute.
static SLASH_ESCAPE: &str = "&#x27;";
43
44 pub(crate) fn escape_href<W>(mut w: W, s: &str) -> io::Result<()>
45 where
46 W: StrWrite,
47 {
48 let bytes = s.as_bytes();
49 let mut mark = 0;
50 for i in 0..bytes.len() {
51 let c = bytes[i];
52 if c >= 0x80 || HREF_SAFE[c as usize] == 0 {
53 // character needing escape
54
55 // write partial substring up to mark
56 if mark < i {
57 w.write_str(&s[mark..i])?;
58 }
59 match c {
60 b'&' => {
61 w.write_str(AMP_ESCAPE)?;
62 }
63 b'\'' => {
64 w.write_str(SLASH_ESCAPE)?;
65 }
66 _ => {
67 let mut buf = [0u8; 3];
68 buf[0] = b'%';
69 buf[1] = HEX_CHARS[((c as usize) >> 4) & 0xF];
70 buf[2] = HEX_CHARS[(c as usize) & 0xF];
71 let escaped = from_utf8(&buf).unwrap();
72 w.write_str(escaped)?;
73 }
74 }
75 mark = i + 1; // all escaped characters are ASCII
76 }
77 }
78 w.write_str(&s[mark..])
79 }
80
/// Builds the byte-indexed table behind `HTML_ESCAPE_TABLE`: every entry is 0
/// except those for the four HTML special bytes, which hold the index of
/// their replacement in `HTML_ESCAPES`.
const fn build_html_escape_table() -> [u8; 256] {
    let mut table = [0u8; 256];
    table[b'"' as usize] = 1;
    table[b'&' as usize] = 2;
    table[b'<' as usize] = 3;
    table[b'>' as usize] = 4;
    table
}

/// Maps each byte value to an index into `HTML_ESCAPES`; 0 means the byte
/// needs no escaping.
static HTML_ESCAPE_TABLE: [u8; 256] = build_html_escape_table();

/// Replacement text for each non-zero code in `HTML_ESCAPE_TABLE`. Slot 0 is
/// an unused placeholder for "no escape".
static HTML_ESCAPES: [&str; 5] = ["", "&quot;", "&amp;", "&lt;", "&gt;"];
102
103 /// Writes the given string to the Write sink, replacing special HTML bytes
104 /// (<, >, &, ") by escape sequences.
105 pub(crate) fn escape_html<W: StrWrite>(w: W, s: &str) -> io::Result<()> {
106 #[cfg(all(target_arch = "x86_64", feature = "simd"))]
107 {
108 simd::escape_html(w, s)
109 }
110 #[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
111 {
112 escape_html_scalar(w, s)
113 }
114 }
115
116 fn escape_html_scalar<W: StrWrite>(mut w: W, s: &str) -> io::Result<()> {
117 let bytes = s.as_bytes();
118 let mut mark = 0;
119 let mut i = 0;
120 while i < s.len() {
121 match bytes[i..]
122 .iter()
123 .position(|&c| HTML_ESCAPE_TABLE[c as usize] != 0)
124 {
125 Some(pos) => {
126 i += pos;
127 }
128 None => break,
129 }
130 let c = bytes[i];
131 let escape = HTML_ESCAPE_TABLE[c as usize];
132 if escape != 0 {
133 let escape_seq = HTML_ESCAPES[escape as usize];
134 w.write_str(&s[mark..i])?;
135 w.write_str(escape_seq)?;
136 mark = i + 1; // all escaped characters are ASCII
137 }
138 i += 1;
139 }
140 w.write_str(&s[mark..])
141 }
142
/// SSSE3-accelerated HTML escaping, compiled only on x86_64 when the `simd`
/// feature is enabled.
#[cfg(all(target_arch = "x86_64", feature = "simd"))]
mod simd {
    use crate::html::StrWrite;
    use std::arch::x86_64::*;
    use std::io;
    use std::mem::size_of;

    /// Number of bytes examined per SIMD step (the size of an `__m128i`).
    const VECTOR_SIZE: usize = size_of::<__m128i>();

    /// Writes `s` to `w`, replacing the HTML special bytes (`<`, `>`, `&`, `"`)
    /// with their escape sequences.
    ///
    /// Uses the vectorised scanner when the CPU reports SSSE3 support and the
    /// input is at least `VECTOR_SIZE` bytes long; otherwise defers to
    /// `escape_html_scalar`.
    pub(crate) fn escape_html<W: StrWrite>(mut w: W, s: &str) -> io::Result<()> {
        // The SIMD accelerated code uses the PSHUFB instruction, which is part
        // of the SSSE3 instruction set. Further, we can only use this code if
        // the buffer is at least one VECTOR_SIZE in length to prevent reading
        // out of bounds. If either of these conditions is not met, we fall back
        // to scalar code.
        if is_x86_feature_detected!("ssse3") && s.len() >= VECTOR_SIZE {
            let bytes = s.as_bytes();
            let mut mark = 0;

            // SAFETY: SSSE3 availability and `bytes.len() >= VECTOR_SIZE` were
            // both checked above, which is what `foreach_special_simd`
            // requires. The callback only receives indices of special bytes
            // inside `bytes`, so `get_unchecked(i)` is in bounds. Those bytes
            // are single-byte ASCII characters, so `mark..i` and `mark..` start
            // and end on character boundaries, with `mark <= i <= s.len()`.
            unsafe {
                foreach_special_simd(bytes, 0, |i| {
                    let escape_ix = *bytes.get_unchecked(i) as usize;
                    let replacement =
                        super::HTML_ESCAPES[super::HTML_ESCAPE_TABLE[escape_ix] as usize];
                    w.write_str(s.get_unchecked(mark..i))?;
                    mark = i + 1; // the escaped byte is a one-byte ASCII character
                    w.write_str(replacement)
                })?;
                w.write_str(s.get_unchecked(mark..))
            }
        } else {
            super::escape_html_scalar(w, s)
        }
    }

    /// Creates the lookup table for use in `compute_mask`.
    ///
    /// Each HTML special byte is stored at the slot given by its lower nibble.
    /// Slot 0 holds 127, a value whose own lower nibble is not 0, so no byte
    /// that selects slot 0 can compare equal to it. All other slots are 0.
    const fn create_lookup() -> [u8; 16] {
        let mut table = [0; 16];
        table[(b'<' & 0x0f) as usize] = b'<';
        table[(b'>' & 0x0f) as usize] = b'>';
        table[(b'&' & 0x0f) as usize] = b'&';
        table[(b'"' & 0x0f) as usize] = b'"';
        table[0] = 0b0111_1111;
        table
    }

    /// Computes a byte mask at given offset in the byte buffer. Its first 16 (least significant)
    /// bits correspond to whether there is an HTML special byte (&, <, ", >) at the 16 bytes
    /// `bytes[offset..]`. For example, the mask `(1 << 3)` states that there is an HTML byte
    /// at `offset + 3`.
    ///
    /// # Safety
    ///
    /// The CPU must support SSSE3 and `bytes.len() >= offset + VECTOR_SIZE`
    /// must hold; otherwise the vector load reads past the end of the slice.
    #[target_feature(enable = "ssse3")]
    unsafe fn compute_mask(bytes: &[u8], offset: usize) -> i32 {
        debug_assert!(bytes.len() >= offset + VECTOR_SIZE);

        let table = create_lookup();
        let lookup = _mm_loadu_si128(table.as_ptr() as *const __m128i);
        let raw_ptr = bytes.as_ptr().add(offset) as *const __m128i;

        // Load the vector from memory (unaligned load, no alignment needed).
        let vector = _mm_loadu_si128(raw_ptr);
        // We take the least significant 4 bits of every byte and use them as indices
        // to map into the lookup vector.
        // Note that the shuffle produces 0 for every byte whose most significant bit
        // is set. Bytes that share their lower nibble with an HTML special byte get
        // mapped to that corresponding special byte. Note that all HTML special bytes
        // have distinct lower nibbles. Other bytes get mapped to either 0 or 127.
        let expected = _mm_shuffle_epi8(lookup, vector);
        // We compare the original vector to the mapped output. Bytes that shared a lower
        // nibble with an HTML special byte match *only* if they are that special byte.
        // Bytes with their most significant bit set were mapped to 0 and, being non-zero
        // themselves, never match. Bytes with a 0 lower nibble were mapped to 127, whose
        // lower nibble is not 0, so they never match either. All other bytes have
        // non-zero lower nibbles but were mapped to 0 and will therefore also not match.
        let matches = _mm_cmpeq_epi8(expected, vector);

        // Translate matches to a bitmask, where every 1 corresponds to a HTML special character
        // and a 0 is a non-HTML byte.
        _mm_movemask_epi8(matches)
    }

    /// Calls the given function with the index of every byte in the given byteslice
    /// that is either ", &, <, or > and for no other byte. Indices are reported in
    /// increasing order, and the first error returned by the callback stops the scan.
    ///
    /// # Safety
    ///
    /// The CPU must support SSSE3 and `bytes.len() >= VECTOR_SIZE` (16) must hold;
    /// undefined behaviour may occur otherwise. The only in-file callers pass
    /// `offset == 0`.
    #[target_feature(enable = "ssse3")]
    unsafe fn foreach_special_simd<F>(
        bytes: &[u8],
        mut offset: usize,
        mut callback: F,
    ) -> io::Result<()>
    where
        F: FnMut(usize) -> io::Result<()>,
    {
        // The strategy here is to walk the byte buffer in chunks of VECTOR_SIZE (16)
        // bytes at a time starting at the given offset. For each chunk, we compute
        // a bitmask indicating whether the corresponding byte is a HTML special byte.
        // We then iterate over all the 1 bits in this mask and call the callback function
        // with the corresponding index in the buffer.
        // When the number of HTML special bytes in the buffer is relatively low, this
        // allows us to quickly go through the buffer without a table lookup for every
        // single byte.

        debug_assert!(bytes.len() >= VECTOR_SIZE);
        let upperbound = bytes.len() - VECTOR_SIZE;
        while offset < upperbound {
            let mut mask = compute_mask(bytes, offset);
            while mask != 0 {
                let ix = mask.trailing_zeros();
                callback(offset + ix as usize)?;
                // Clear the lowest set bit.
                mask ^= mask & -mask;
            }
            offset += VECTOR_SIZE;
        }

        // Final iteration. We align the read with the end of the slice and
        // shift off the bytes at start we have already scanned, so that bit
        // `k` of the shifted mask refers to index `offset + k`.
        let mut mask = compute_mask(bytes, upperbound);
        mask >>= offset - upperbound;
        while mask != 0 {
            let ix = mask.trailing_zeros();
            callback(offset + ix as usize)?;
            mask ^= mask & -mask;
        }
        Ok(())
    }

    #[cfg(test)]
    mod html_scan_tests {
        #[test]
        fn multichunk() {
            let mut vec = Vec::new();
            // NOTE(review): like the original test, this assumes the test
            // machine supports SSSE3. The input is 20 bytes, above VECTOR_SIZE.
            unsafe {
                super::foreach_special_simd("&aXaaaa.a'aa9a<>aab&".as_bytes(), 0, |ix| {
                    vec.push(ix);
                    Ok(())
                })
                .unwrap();
            }
            assert_eq!(vec, vec![0, 14, 15, 19]);
        }

        // only match these bytes, and when we match them, match them VECTOR_SIZE times
        #[test]
        fn only_right_bytes_matched() {
            // Inclusive range so that 0xFF is exercised as well.
            for b in 0..=255u8 {
                let right_byte = b == b'&' || b == b'<' || b == b'>' || b == b'"';
                let vek = vec![b; super::VECTOR_SIZE];
                let mut match_count = 0;
                // NOTE(review): assumes SSSE3 on the test machine; the buffer
                // is exactly VECTOR_SIZE bytes long.
                unsafe {
                    super::foreach_special_simd(&vek, 0, |_| {
                        match_count += 1;
                        Ok(())
                    })
                    .unwrap();
                }
                assert!((match_count > 0) == (match_count == super::VECTOR_SIZE));
                assert_eq!(
                    (match_count == super::VECTOR_SIZE),
                    right_byte,
                    "match_count: {}, byte: {:?}",
                    match_count,
                    b as char
                );
            }
        }
    }
}