]>
Commit | Line | Data |
---|---|---|
8faf50e0 XL |
1 | use unicode_width::UnicodeWidthChar; |
2 | use super::*; | |
3 | ||
9fa01778 | 4 | /// Finds all newlines, multi-byte characters, and non-narrow characters in a |
b7449926 | 5 | /// SourceFile. |
8faf50e0 XL |
6 | /// |
7 | /// This function will use an SSE2 enhanced implementation if hardware support | |
8 | /// is detected at runtime. | |
b7449926 | 9 | pub fn analyze_source_file( |
8faf50e0 | 10 | src: &str, |
b7449926 | 11 | source_file_start_pos: BytePos) |
8faf50e0 XL |
12 | -> (Vec<BytePos>, Vec<MultiByteChar>, Vec<NonNarrowChar>) |
13 | { | |
b7449926 | 14 | let mut lines = vec![source_file_start_pos]; |
8faf50e0 XL |
15 | let mut multi_byte_chars = vec![]; |
16 | let mut non_narrow_chars = vec![]; | |
17 | ||
18 | // Calls the right implementation, depending on hardware support available. | |
b7449926 XL |
19 | analyze_source_file_dispatch(src, |
20 | source_file_start_pos, | |
8faf50e0 XL |
21 | &mut lines, |
22 | &mut multi_byte_chars, | |
23 | &mut non_narrow_chars); | |
24 | ||
25 | // The code above optimistically registers a new line *after* each \n | |
b7449926 | 26 | // it encounters. If that point is already outside the source_file, remove |
8faf50e0 XL |
27 | // it again. |
28 | if let Some(&last_line_start) = lines.last() { | |
a1dfa0c6 XL |
29 | let source_file_end = source_file_start_pos + BytePos::from_usize(src.len()); |
30 | assert!(source_file_end >= last_line_start); | |
31 | if last_line_start == source_file_end { | |
8faf50e0 XL |
32 | lines.pop(); |
33 | } | |
34 | } | |
35 | ||
36 | (lines, multi_byte_chars, non_narrow_chars) | |
37 | } | |
38 | ||
9fa01778 | 39 | cfg_if::cfg_if! { |
0731742a | 40 | if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64")))] { |
b7449926 XL |
41 | fn analyze_source_file_dispatch(src: &str, |
42 | source_file_start_pos: BytePos, | |
8faf50e0 XL |
43 | lines: &mut Vec<BytePos>, |
44 | multi_byte_chars: &mut Vec<MultiByteChar>, | |
45 | non_narrow_chars: &mut Vec<NonNarrowChar>) { | |
46 | if is_x86_feature_detected!("sse2") { | |
47 | unsafe { | |
b7449926 XL |
48 | analyze_source_file_sse2(src, |
49 | source_file_start_pos, | |
8faf50e0 XL |
50 | lines, |
51 | multi_byte_chars, | |
52 | non_narrow_chars); | |
53 | } | |
54 | } else { | |
b7449926 | 55 | analyze_source_file_generic(src, |
8faf50e0 | 56 | src.len(), |
b7449926 | 57 | source_file_start_pos, |
8faf50e0 XL |
58 | lines, |
59 | multi_byte_chars, | |
60 | non_narrow_chars); | |
61 | ||
62 | } | |
63 | } | |
64 | ||
9fa01778 | 65 | /// Checks 16 byte chunks of text at a time. If the chunk contains |
8faf50e0 XL |
66 | /// something other than printable ASCII characters and newlines, the |
67 | /// function falls back to the generic implementation. Otherwise it uses | |
68 | /// SSE2 intrinsics to quickly find all newlines. | |
69 | #[target_feature(enable = "sse2")] | |
b7449926 | 70 | unsafe fn analyze_source_file_sse2(src: &str, |
8faf50e0 XL |
71 | output_offset: BytePos, |
72 | lines: &mut Vec<BytePos>, | |
73 | multi_byte_chars: &mut Vec<MultiByteChar>, | |
74 | non_narrow_chars: &mut Vec<NonNarrowChar>) { | |
75 | #[cfg(target_arch = "x86")] | |
76 | use std::arch::x86::*; | |
77 | #[cfg(target_arch = "x86_64")] | |
78 | use std::arch::x86_64::*; | |
79 | ||
80 | const CHUNK_SIZE: usize = 16; | |
81 | ||
82 | let src_bytes = src.as_bytes(); | |
83 | ||
84 | let chunk_count = src.len() / CHUNK_SIZE; | |
85 | ||
86 | // This variable keeps track of where we should start decoding a | |
87 | // chunk. If a multi-byte character spans across chunk boundaries, | |
88 | // we need to skip that part in the next chunk because we already | |
89 | // handled it. | |
90 | let mut intra_chunk_offset = 0; | |
91 | ||
92 | for chunk_index in 0 .. chunk_count { | |
93 | let ptr = src_bytes.as_ptr() as *const __m128i; | |
94 | // We don't know if the pointer is aligned to 16 bytes, so we | |
95 | // use `loadu`, which supports unaligned loading. | |
96 | let chunk = _mm_loadu_si128(ptr.offset(chunk_index as isize)); | |
97 | ||
98 | // For character in the chunk, see if its byte value is < 0, which | |
99 | // indicates that it's part of a UTF-8 char. | |
100 | let multibyte_test = _mm_cmplt_epi8(chunk, _mm_set1_epi8(0)); | |
101 | // Create a bit mask from the comparison results. | |
102 | let multibyte_mask = _mm_movemask_epi8(multibyte_test); | |
103 | ||
104 | // If the bit mask is all zero, we only have ASCII chars here: | |
105 | if multibyte_mask == 0 { | |
106 | assert!(intra_chunk_offset == 0); | |
107 | ||
108 | // Check if there are any control characters in the chunk. All | |
109 | // control characters that we can encounter at this point have a | |
110 | // byte value less than 32 or ... | |
111 | let control_char_test0 = _mm_cmplt_epi8(chunk, _mm_set1_epi8(32)); | |
112 | let control_char_mask0 = _mm_movemask_epi8(control_char_test0); | |
113 | ||
114 | // ... it's the ASCII 'DEL' character with a value of 127. | |
115 | let control_char_test1 = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(127)); | |
116 | let control_char_mask1 = _mm_movemask_epi8(control_char_test1); | |
117 | ||
118 | let control_char_mask = control_char_mask0 | control_char_mask1; | |
119 | ||
120 | if control_char_mask != 0 { | |
121 | // Check for newlines in the chunk | |
122 | let newlines_test = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8)); | |
123 | let newlines_mask = _mm_movemask_epi8(newlines_test); | |
124 | ||
125 | if control_char_mask == newlines_mask { | |
126 | // All control characters are newlines, record them | |
127 | let mut newlines_mask = 0xFFFF0000 | newlines_mask as u32; | |
128 | let output_offset = output_offset + | |
129 | BytePos::from_usize(chunk_index * CHUNK_SIZE + 1); | |
130 | ||
131 | loop { | |
132 | let index = newlines_mask.trailing_zeros(); | |
133 | ||
134 | if index >= CHUNK_SIZE as u32 { | |
135 | // We have arrived at the end of the chunk. | |
136 | break | |
137 | } | |
138 | ||
139 | lines.push(BytePos(index) + output_offset); | |
140 | ||
141 | // Clear the bit, so we can find the next one. | |
142 | newlines_mask &= (!1) << index; | |
143 | } | |
144 | ||
145 | // We are done for this chunk. All control characters were | |
146 | // newlines and we took care of those. | |
147 | continue | |
148 | } else { | |
149 | // Some of the control characters are not newlines, | |
150 | // fall through to the slow path below. | |
151 | } | |
152 | } else { | |
153 | // No control characters, nothing to record for this chunk | |
154 | continue | |
155 | } | |
156 | } | |
157 | ||
158 | // The slow path. | |
159 | // There are control chars in here, fallback to generic decoding. | |
160 | let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset; | |
b7449926 | 161 | intra_chunk_offset = analyze_source_file_generic( |
8faf50e0 XL |
162 | &src[scan_start .. ], |
163 | CHUNK_SIZE - intra_chunk_offset, | |
164 | BytePos::from_usize(scan_start) + output_offset, | |
165 | lines, | |
166 | multi_byte_chars, | |
167 | non_narrow_chars | |
168 | ); | |
169 | } | |
170 | ||
171 | // There might still be a tail left to analyze | |
172 | let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset; | |
173 | if tail_start < src.len() { | |
b7449926 | 174 | analyze_source_file_generic(&src[tail_start as usize ..], |
8faf50e0 XL |
175 | src.len() - tail_start, |
176 | output_offset + BytePos::from_usize(tail_start), | |
177 | lines, | |
178 | multi_byte_chars, | |
179 | non_narrow_chars); | |
180 | } | |
181 | } | |
182 | } else { | |
183 | ||
184 | // The target (or compiler version) does not support SSE2 ... | |
b7449926 XL |
185 | fn analyze_source_file_dispatch(src: &str, |
186 | source_file_start_pos: BytePos, | |
8faf50e0 XL |
187 | lines: &mut Vec<BytePos>, |
188 | multi_byte_chars: &mut Vec<MultiByteChar>, | |
189 | non_narrow_chars: &mut Vec<NonNarrowChar>) { | |
b7449926 | 190 | analyze_source_file_generic(src, |
8faf50e0 | 191 | src.len(), |
b7449926 | 192 | source_file_start_pos, |
8faf50e0 XL |
193 | lines, |
194 | multi_byte_chars, | |
195 | non_narrow_chars); | |
196 | } | |
197 | } | |
198 | } | |
199 | ||
200 | // `scan_len` determines the number of bytes in `src` to scan. Note that the | |
201 | // function can read past `scan_len` if a multi-byte character start within the | |
202 | // range but extends past it. The overflow is returned by the function. | |
b7449926 | 203 | fn analyze_source_file_generic(src: &str, |
8faf50e0 XL |
204 | scan_len: usize, |
205 | output_offset: BytePos, | |
206 | lines: &mut Vec<BytePos>, | |
207 | multi_byte_chars: &mut Vec<MultiByteChar>, | |
208 | non_narrow_chars: &mut Vec<NonNarrowChar>) | |
209 | -> usize | |
210 | { | |
211 | assert!(src.len() >= scan_len); | |
212 | let mut i = 0; | |
213 | let src_bytes = src.as_bytes(); | |
214 | ||
215 | while i < scan_len { | |
216 | let byte = unsafe { | |
217 | // We verified that i < scan_len <= src.len() | |
218 | *src_bytes.get_unchecked(i as usize) | |
219 | }; | |
220 | ||
221 | // How much to advance in order to get to the next UTF-8 char in the | |
222 | // string. | |
223 | let mut char_len = 1; | |
224 | ||
225 | if byte < 32 { | |
226 | // This is an ASCII control character, it could be one of the cases | |
227 | // that are interesting to us. | |
228 | ||
229 | let pos = BytePos::from_usize(i) + output_offset; | |
230 | ||
231 | match byte { | |
232 | b'\n' => { | |
233 | lines.push(pos + BytePos(1)); | |
234 | } | |
235 | b'\t' => { | |
236 | non_narrow_chars.push(NonNarrowChar::Tab(pos)); | |
237 | } | |
238 | _ => { | |
239 | non_narrow_chars.push(NonNarrowChar::ZeroWidth(pos)); | |
240 | } | |
241 | } | |
242 | } else if byte >= 127 { | |
243 | // The slow path: | |
244 | // This is either ASCII control character "DEL" or the beginning of | |
245 | // a multibyte char. Just decode to `char`. | |
246 | let c = (&src[i..]).chars().next().unwrap(); | |
247 | char_len = c.len_utf8(); | |
248 | ||
249 | let pos = BytePos::from_usize(i) + output_offset; | |
250 | ||
251 | if char_len > 1 { | |
252 | assert!(char_len >=2 && char_len <= 4); | |
253 | let mbc = MultiByteChar { | |
254 | pos, | |
255 | bytes: char_len as u8, | |
256 | }; | |
257 | multi_byte_chars.push(mbc); | |
258 | } | |
259 | ||
260 | // Assume control characters are zero width. | |
261 | // FIXME: How can we decide between `width` and `width_cjk`? | |
262 | let char_width = UnicodeWidthChar::width(c).unwrap_or(0); | |
263 | ||
264 | if char_width != 1 { | |
265 | non_narrow_chars.push(NonNarrowChar::new(pos, char_width)); | |
266 | } | |
267 | } | |
268 | ||
269 | i += char_len; | |
270 | } | |
271 | ||
272 | i - scan_len | |
273 | } | |
274 | ||
275 | ||
276 | ||
277 | macro_rules! test { | |
278 | (case: $test_name:ident, | |
279 | text: $text:expr, | |
b7449926 | 280 | source_file_start_pos: $source_file_start_pos:expr, |
8faf50e0 XL |
281 | lines: $lines:expr, |
282 | multi_byte_chars: $multi_byte_chars:expr, | |
283 | non_narrow_chars: $non_narrow_chars:expr,) => ( | |
284 | ||
285 | #[test] | |
286 | fn $test_name() { | |
287 | ||
288 | let (lines, multi_byte_chars, non_narrow_chars) = | |
b7449926 | 289 | analyze_source_file($text, BytePos($source_file_start_pos)); |
8faf50e0 XL |
290 | |
291 | let expected_lines: Vec<BytePos> = $lines | |
292 | .into_iter() | |
293 | .map(|pos| BytePos(pos)) | |
294 | .collect(); | |
295 | ||
296 | assert_eq!(lines, expected_lines); | |
297 | ||
298 | let expected_mbcs: Vec<MultiByteChar> = $multi_byte_chars | |
299 | .into_iter() | |
300 | .map(|(pos, bytes)| MultiByteChar { | |
301 | pos: BytePos(pos), | |
302 | bytes, | |
303 | }) | |
304 | .collect(); | |
305 | ||
306 | assert_eq!(multi_byte_chars, expected_mbcs); | |
307 | ||
308 | let expected_nncs: Vec<NonNarrowChar> = $non_narrow_chars | |
309 | .into_iter() | |
310 | .map(|(pos, width)| { | |
311 | NonNarrowChar::new(BytePos(pos), width) | |
312 | }) | |
313 | .collect(); | |
314 | ||
315 | assert_eq!(non_narrow_chars, expected_nncs); | |
316 | }) | |
317 | } | |
318 | ||
319 | test!( | |
320 | case: empty_text, | |
321 | text: "", | |
b7449926 | 322 | source_file_start_pos: 0, |
8faf50e0 XL |
323 | lines: vec![], |
324 | multi_byte_chars: vec![], | |
325 | non_narrow_chars: vec![], | |
326 | ); | |
327 | ||
328 | test!( | |
329 | case: newlines_short, | |
330 | text: "a\nc", | |
b7449926 | 331 | source_file_start_pos: 0, |
8faf50e0 XL |
332 | lines: vec![0, 2], |
333 | multi_byte_chars: vec![], | |
334 | non_narrow_chars: vec![], | |
335 | ); | |
336 | ||
337 | test!( | |
338 | case: newlines_long, | |
339 | text: "012345678\nabcdef012345678\na", | |
b7449926 | 340 | source_file_start_pos: 0, |
8faf50e0 XL |
341 | lines: vec![0, 10, 26], |
342 | multi_byte_chars: vec![], | |
343 | non_narrow_chars: vec![], | |
344 | ); | |
345 | ||
346 | test!( | |
347 | case: newline_and_multi_byte_char_in_same_chunk, | |
348 | text: "01234β789\nbcdef0123456789abcdef", | |
b7449926 | 349 | source_file_start_pos: 0, |
8faf50e0 XL |
350 | lines: vec![0, 11], |
351 | multi_byte_chars: vec![(5, 2)], | |
352 | non_narrow_chars: vec![], | |
353 | ); | |
354 | ||
355 | test!( | |
356 | case: newline_and_control_char_in_same_chunk, | |
357 | text: "01234\u{07}6789\nbcdef0123456789abcdef", | |
b7449926 | 358 | source_file_start_pos: 0, |
8faf50e0 XL |
359 | lines: vec![0, 11], |
360 | multi_byte_chars: vec![], | |
361 | non_narrow_chars: vec![(5, 0)], | |
362 | ); | |
363 | ||
364 | test!( | |
365 | case: multi_byte_char_short, | |
366 | text: "aβc", | |
b7449926 | 367 | source_file_start_pos: 0, |
8faf50e0 XL |
368 | lines: vec![0], |
369 | multi_byte_chars: vec![(1, 2)], | |
370 | non_narrow_chars: vec![], | |
371 | ); | |
372 | ||
373 | test!( | |
374 | case: multi_byte_char_long, | |
375 | text: "0123456789abcΔf012345β", | |
b7449926 | 376 | source_file_start_pos: 0, |
8faf50e0 XL |
377 | lines: vec![0], |
378 | multi_byte_chars: vec![(13, 2), (22, 2)], | |
379 | non_narrow_chars: vec![], | |
380 | ); | |
381 | ||
382 | test!( | |
383 | case: multi_byte_char_across_chunk_boundary, | |
384 | text: "0123456789abcdeΔ123456789abcdef01234", | |
b7449926 | 385 | source_file_start_pos: 0, |
8faf50e0 XL |
386 | lines: vec![0], |
387 | multi_byte_chars: vec![(15, 2)], | |
388 | non_narrow_chars: vec![], | |
389 | ); | |
390 | ||
391 | test!( | |
392 | case: multi_byte_char_across_chunk_boundary_tail, | |
393 | text: "0123456789abcdeΔ....", | |
b7449926 | 394 | source_file_start_pos: 0, |
8faf50e0 XL |
395 | lines: vec![0], |
396 | multi_byte_chars: vec![(15, 2)], | |
397 | non_narrow_chars: vec![], | |
398 | ); | |
399 | ||
400 | test!( | |
401 | case: non_narrow_short, | |
402 | text: "0\t2", | |
b7449926 | 403 | source_file_start_pos: 0, |
8faf50e0 XL |
404 | lines: vec![0], |
405 | multi_byte_chars: vec![], | |
406 | non_narrow_chars: vec![(1, 4)], | |
407 | ); | |
408 | ||
409 | test!( | |
410 | case: non_narrow_long, | |
411 | text: "01\t3456789abcdef01234567\u{07}9", | |
b7449926 | 412 | source_file_start_pos: 0, |
8faf50e0 XL |
413 | lines: vec![0], |
414 | multi_byte_chars: vec![], | |
415 | non_narrow_chars: vec![(2, 4), (24, 0)], | |
416 | ); | |
417 | ||
418 | test!( | |
419 | case: output_offset_all, | |
420 | text: "01\t345\n789abcΔf01234567\u{07}9\nbcΔf", | |
b7449926 | 421 | source_file_start_pos: 1000, |
8faf50e0 XL |
422 | lines: vec![0 + 1000, 7 + 1000, 27 + 1000], |
423 | multi_byte_chars: vec![(13 + 1000, 2), (29 + 1000, 2)], | |
424 | non_narrow_chars: vec![(2 + 1000, 4), (24 + 1000, 0)], | |
425 | ); |