]>
Commit | Line | Data |
---|---|---|
1b1a35ee XL |
1 | // Copyright 2015 Google Inc. All rights reserved.\r |
2 | //\r | |
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy\r | |
4 | // of this software and associated documentation files (the "Software"), to deal\r | |
5 | // in the Software without restriction, including without limitation the rights\r | |
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\r | |
7 | // copies of the Software, and to permit persons to whom the Software is\r | |
8 | // furnished to do so, subject to the following conditions:\r | |
9 | //\r | |
10 | // The above copyright notice and this permission notice shall be included in\r | |
11 | // all copies or substantial portions of the Software.\r | |
12 | //\r | |
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r | |
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r | |
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r | |
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r | |
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r | |
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\r | |
19 | // THE SOFTWARE.\r | |
20 | \r | |
21 | //! Utility functions for HTML escaping\r | |
22 | \r | |
23 | use std::io;\r | |
24 | use std::str::from_utf8;\r | |
25 | \r | |
26 | use crate::html::StrWrite;\r | |
27 | \r | |
/// Per-byte classification of the ASCII range for `escape_href`.
///
/// An entry of `1` means the byte is copied to the output unchanged; an entry
/// of `0` means `escape_href` has to replace it with an escape sequence.
#[rustfmt::skip]
static HREF_SAFE: [u8; 128] = {
    // S = safe to copy verbatim, E = must be escaped.
    const S: u8 = 1;
    const E: u8 = 0;
    [
        // 0x00..=0x0F: control characters
        E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E,
        // 0x10..=0x1F: control characters
        E, E, E, E, E, E, E, E, E, E, E, E, E, E, E, E,
        // 0x20..=0x2F: space ! " # $ % & ' ( ) * + , - . /
        E, S, E, S, S, S, E, E, S, S, S, S, S, S, S, S,
        // 0x30..=0x3F: 0-9 : ; < = > ?
        S, S, S, S, S, S, S, S, S, S, S, S, E, S, E, S,
        // 0x40..=0x4F: @ A-O
        S, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,
        // 0x50..=0x5F: P-Z [ \ ] ^ _
        S, S, S, S, S, S, S, S, S, S, S, E, E, E, E, S,
        // 0x60..=0x6F: ` a-o
        E, S, S, S, S, S, S, S, S, S, S, S, S, S, S, S,
        // 0x70..=0x7F: p-z { | } ~ DEL
        S, S, S, S, S, S, S, S, S, S, S, E, E, E, E, E,
    ]
};
39 | \r | |
/// Uppercase hexadecimal digits, indexed by nibble value; used by
/// `escape_href` to build `%XX` percent-escapes.
static HEX_CHARS: &[u8] = b"0123456789ABCDEF";
/// HTML entity that `escape_href` writes in place of a literal `&`.
static AMP_ESCAPE: &str = "&amp;";
/// HTML entity that `escape_href` writes in place of a literal `'`.
/// Despite the name, this escapes the apostrophe (U+0027), not a slash.
static SLASH_ESCAPE: &str = "&#x27;";
43 | \r | |
44 | pub(crate) fn escape_href<W>(mut w: W, s: &str) -> io::Result<()>\r | |
45 | where\r | |
46 | W: StrWrite,\r | |
47 | {\r | |
48 | let bytes = s.as_bytes();\r | |
49 | let mut mark = 0;\r | |
50 | for i in 0..bytes.len() {\r | |
51 | let c = bytes[i];\r | |
52 | if c >= 0x80 || HREF_SAFE[c as usize] == 0 {\r | |
53 | // character needing escape\r | |
54 | \r | |
55 | // write partial substring up to mark\r | |
56 | if mark < i {\r | |
57 | w.write_str(&s[mark..i])?;\r | |
58 | }\r | |
59 | match c {\r | |
60 | b'&' => {\r | |
61 | w.write_str(AMP_ESCAPE)?;\r | |
62 | }\r | |
63 | b'\'' => {\r | |
64 | w.write_str(SLASH_ESCAPE)?;\r | |
65 | }\r | |
66 | _ => {\r | |
67 | let mut buf = [0u8; 3];\r | |
68 | buf[0] = b'%';\r | |
69 | buf[1] = HEX_CHARS[((c as usize) >> 4) & 0xF];\r | |
70 | buf[2] = HEX_CHARS[(c as usize) & 0xF];\r | |
71 | let escaped = from_utf8(&buf).unwrap();\r | |
72 | w.write_str(escaped)?;\r | |
73 | }\r | |
74 | }\r | |
75 | mark = i + 1; // all escaped characters are ASCII\r | |
76 | }\r | |
77 | }\r | |
78 | w.write_str(&s[mark..])\r | |
79 | }\r | |
80 | \r | |
/// Builds the byte-indexed table that tells the HTML escaper what to do with
/// each byte value.
///
/// Bytes that need no escaping map to `0`. The bytes `"`, `&`, `<` and `>`
/// map to `1`, `2`, `3` and `4`, which are indices into `HTML_ESCAPES`.
const fn create_html_escape_table() -> [u8; 256] {
    let mut escape_index = [0u8; 256];
    escape_index[b'>' as usize] = 4;
    escape_index[b'<' as usize] = 3;
    escape_index[b'&' as usize] = 2;
    escape_index[b'"' as usize] = 1;
    escape_index
}

/// For every byte value, the index of its replacement in `HTML_ESCAPES`,
/// or `0` when the byte is written unchanged.
static HTML_ESCAPE_TABLE: [u8; 256] = create_html_escape_table();
91 | \r | |
/// Replacement text for each escape index stored in `HTML_ESCAPE_TABLE`.
///
/// Index `0` means "no escape" and is therefore empty; indices `1` to `4`
/// hold the entities for `"`, `&`, `<` and `>` in that order.
static HTML_ESCAPES: [&str; 5] = ["", "&quot;", "&amp;", "&lt;", "&gt;"];
93 | \r | |
94 | /// Writes the given string to the Write sink, replacing special HTML bytes\r | |
95 | /// (<, >, &, ") by escape sequences.\r | |
96 | pub(crate) fn escape_html<W: StrWrite>(w: W, s: &str) -> io::Result<()> {\r | |
97 | #[cfg(all(target_arch = "x86_64", feature = "simd"))]\r | |
98 | {\r | |
99 | simd::escape_html(w, s)\r | |
100 | }\r | |
101 | #[cfg(not(all(target_arch = "x86_64", feature = "simd")))]\r | |
102 | {\r | |
103 | escape_html_scalar(w, s)\r | |
104 | }\r | |
105 | }\r | |
106 | \r | |
107 | fn escape_html_scalar<W: StrWrite>(mut w: W, s: &str) -> io::Result<()> {\r | |
108 | let bytes = s.as_bytes();\r | |
109 | let mut mark = 0;\r | |
110 | let mut i = 0;\r | |
111 | while i < s.len() {\r | |
112 | match bytes[i..]\r | |
113 | .iter()\r | |
114 | .position(|&c| HTML_ESCAPE_TABLE[c as usize] != 0)\r | |
115 | {\r | |
116 | Some(pos) => {\r | |
117 | i += pos;\r | |
118 | }\r | |
119 | None => break,\r | |
120 | }\r | |
121 | let c = bytes[i];\r | |
122 | let escape = HTML_ESCAPE_TABLE[c as usize];\r | |
123 | let escape_seq = HTML_ESCAPES[escape as usize];\r | |
124 | w.write_str(&s[mark..i])?;\r | |
125 | w.write_str(escape_seq)?;\r | |
126 | i += 1;\r | |
127 | mark = i; // all escaped characters are ASCII\r | |
128 | }\r | |
129 | w.write_str(&s[mark..])\r | |
130 | }\r | |
131 | \r | |
132 | #[cfg(all(target_arch = "x86_64", feature = "simd"))]\r | |
133 | mod simd {\r | |
134 | use crate::html::StrWrite;\r | |
135 | use std::arch::x86_64::*;\r | |
136 | use std::io;\r | |
137 | use std::mem::size_of;\r | |
138 | \r | |
139 | const VECTOR_SIZE: usize = size_of::<__m128i>();\r | |
140 | \r | |
141 | pub(crate) fn escape_html<W: StrWrite>(mut w: W, s: &str) -> io::Result<()> {\r | |
142 | // The SIMD accelerated code uses the PSHUFB instruction, which is part\r | |
143 | // of the SSSE3 instruction set. Further, we can only use this code if\r | |
144 | // the buffer is at least one VECTOR_SIZE in length to prevent reading\r | |
145 | // out of bounds. If either of these conditions is not met, we fall back\r | |
146 | // to scalar code.\r | |
147 | if is_x86_feature_detected!("ssse3") && s.len() >= VECTOR_SIZE {\r | |
148 | let bytes = s.as_bytes();\r | |
149 | let mut mark = 0;\r | |
150 | \r | |
151 | unsafe {\r | |
152 | foreach_special_simd(bytes, 0, |i| {\r | |
153 | let escape_ix = *bytes.get_unchecked(i) as usize;\r | |
154 | let replacement =\r | |
155 | super::HTML_ESCAPES[super::HTML_ESCAPE_TABLE[escape_ix] as usize];\r | |
156 | w.write_str(&s.get_unchecked(mark..i))?;\r | |
157 | mark = i + 1; // all escaped characters are ASCII\r | |
158 | w.write_str(replacement)\r | |
159 | })?;\r | |
160 | w.write_str(&s.get_unchecked(mark..))\r | |
161 | }\r | |
162 | } else {\r | |
163 | super::escape_html_scalar(w, s)\r | |
164 | }\r | |
165 | }\r | |
166 | \r | |
/// Builds the 16-entry table that `compute_mask` loads as its shuffle source.
///
/// Each HTML special byte (`"`, `&`, `<`, `>`) is stored in the slot selected
/// by its own low nibble. Slot 0 holds 127 and every remaining slot holds 0.
const fn create_lookup() -> [u8; 16] {
    let mut lookup = [0u8; 16];
    // None of the special bytes has a zero low nibble, so slot 0 is free for
    // the 127 filler value.
    lookup[0] = 0x7f;
    lookup[(b'"' & 0x0f) as usize] = b'"';
    lookup[(b'&' & 0x0f) as usize] = b'&';
    lookup[(b'<' & 0x0f) as usize] = b'<';
    lookup[(b'>' & 0x0f) as usize] = b'>';
    lookup
}
177 | \r | |
178 | #[target_feature(enable = "ssse3")]\r | |
179 | /// Computes a byte mask at given offset in the byte buffer. Its first 16 (least significant)\r | |
180 | /// bits correspond to whether there is an HTML special byte (&, <, ", >) at the 16 bytes\r | |
181 | /// `bytes[offset..]`. For example, the mask `(1 << 3)` states that there is an HTML byte\r | |
182 | /// at `offset + 3`. It is only safe to call this function when\r | |
183 | /// `bytes.len() >= offset + VECTOR_SIZE`.\r | |
184 | unsafe fn compute_mask(bytes: &[u8], offset: usize) -> i32 {\r | |
185 | debug_assert!(bytes.len() >= offset + VECTOR_SIZE);\r | |
186 | \r | |
187 | let table = create_lookup();\r | |
188 | let lookup = _mm_loadu_si128(table.as_ptr() as *const __m128i);\r | |
189 | let raw_ptr = bytes.as_ptr().offset(offset as isize) as *const __m128i;\r | |
190 | \r | |
191 | // Load the vector from memory.\r | |
192 | let vector = _mm_loadu_si128(raw_ptr);\r | |
193 | // We take the least significant 4 bits of every byte and use them as indices\r | |
194 | // to map into the lookup vector.\r | |
195 | // Note that shuffle maps bytes with their most significant bit set to lookup[0].\r | |
196 | // Bytes that share their lower nibble with an HTML special byte get mapped to that\r | |
197 | // corresponding special byte. Note that all HTML special bytes have distinct lower\r | |
198 | // nibbles. Other bytes either get mapped to 0 or 127.\r | |
199 | let expected = _mm_shuffle_epi8(lookup, vector);\r | |
200 | // We compare the original vector to the mapped output. Bytes that shared a lower\r | |
201 | // nibble with an HTML special byte match *only* if they are that special byte. Bytes\r | |
202 | // that have either a 0 lower nibble or their most significant bit set were mapped to\r | |
203 | // 127 and will hence never match. All other bytes have non-zero lower nibbles but\r | |
204 | // were mapped to 0 and will therefore also not match.\r | |
205 | let matches = _mm_cmpeq_epi8(expected, vector);\r | |
206 | \r | |
207 | // Translate matches to a bitmask, where every 1 corresponds to a HTML special character\r | |
208 | // and a 0 is a non-HTML byte.\r | |
209 | _mm_movemask_epi8(matches)\r | |
210 | }\r | |
211 | \r | |
212 | /// Calls the given function with the index of every byte in the given byteslice\r | |
213 | /// that is either ", &, <, or > and for no other byte.\r | |
214 | /// Make sure to only call this when `bytes.len() >= 16`, undefined behaviour may\r | |
215 | /// occur otherwise.\r | |
216 | #[target_feature(enable = "ssse3")]\r | |
217 | unsafe fn foreach_special_simd<F>(\r | |
218 | bytes: &[u8],\r | |
219 | mut offset: usize,\r | |
220 | mut callback: F,\r | |
221 | ) -> io::Result<()>\r | |
222 | where\r | |
223 | F: FnMut(usize) -> io::Result<()>,\r | |
224 | {\r | |
225 | // The strategy here is to walk the byte buffer in chunks of VECTOR_SIZE (16)\r | |
226 | // bytes at a time starting at the given offset. For each chunk, we compute a\r | |
227 | // a bitmask indicating whether the corresponding byte is a HTML special byte.\r | |
228 | // We then iterate over all the 1 bits in this mask and call the callback function\r | |
229 | // with the corresponding index in the buffer.\r | |
230 | // When the number of HTML special bytes in the buffer is relatively low, this\r | |
231 | // allows us to quickly go through the buffer without a lookup and for every\r | |
232 | // single byte.\r | |
233 | \r | |
234 | debug_assert!(bytes.len() >= VECTOR_SIZE);\r | |
235 | let upperbound = bytes.len() - VECTOR_SIZE;\r | |
236 | while offset < upperbound {\r | |
237 | let mut mask = compute_mask(bytes, offset);\r | |
238 | while mask != 0 {\r | |
239 | let ix = mask.trailing_zeros();\r | |
240 | callback(offset + ix as usize)?;\r | |
241 | mask ^= mask & -mask;\r | |
242 | }\r | |
243 | offset += VECTOR_SIZE;\r | |
244 | }\r | |
245 | \r | |
246 | // Final iteration. We align the read with the end of the slice and\r | |
247 | // shift off the bytes at start we have already scanned.\r | |
248 | let mut mask = compute_mask(bytes, upperbound);\r | |
249 | mask >>= offset - upperbound;\r | |
250 | while mask != 0 {\r | |
251 | let ix = mask.trailing_zeros();\r | |
252 | callback(offset + ix as usize)?;\r | |
253 | mask ^= mask & -mask;\r | |
254 | }\r | |
255 | Ok(())\r | |
256 | }\r | |
257 | \r | |
#[cfg(test)]
mod html_scan_tests {
    /// A 20-byte input needs one full-vector pass plus the overlapping tail
    /// pass; every special byte must be reported exactly once, in order.
    #[test]
    fn multichunk() {
        let mut hits = Vec::new();
        // SAFETY: the input is at least 16 bytes long. This assumes the host
        // running the tests supports SSSE3.
        unsafe {
            super::foreach_special_simd("&aXaaaa.a'aa9a<>aab&".as_bytes(), 0, |ix| {
                hits.push(ix);
                Ok(())
            })
            .unwrap();
        }
        assert_eq!(hits, vec![0, 14, 15, 19]);
    }

    /// Only the four special bytes may match, and when a byte matches it must
    /// match in every one of the VECTOR_SIZE positions.
    #[test]
    fn only_right_bytes_matched() {
        // Inclusive range: an exclusive `0..255` would leave 0xFF untested.
        for b in 0..=255u8 {
            let right_byte = b == b'&' || b == b'<' || b == b'>' || b == b'"';
            let vek = vec![b; super::VECTOR_SIZE];
            let mut match_count = 0;
            // SAFETY: `vek` holds exactly VECTOR_SIZE bytes. This assumes the
            // host running the tests supports SSSE3.
            unsafe {
                super::foreach_special_simd(&vek, 0, |_| {
                    match_count += 1;
                    Ok(())
                })
                .unwrap();
            }
            assert!((match_count > 0) == (match_count == super::VECTOR_SIZE));
            assert_eq!(
                (match_count == super::VECTOR_SIZE),
                right_byte,
                "match_count: {}, byte: {:?}",
                match_count,
                b as char
            );
        }
    }
}
297 | }\r |