]>
Commit | Line | Data |
---|---|---|
8faf50e0 | 1 | use super::*; |
dfeec247 | 2 | use unicode_width::UnicodeWidthChar; |
8faf50e0 | 3 | |
416331ca XL |
4 | #[cfg(test)] |
5 | mod tests; | |
6 | ||
9fa01778 | 7 | /// Finds all newlines, multi-byte characters, and non-narrow characters in a |
b7449926 | 8 | /// SourceFile. |
8faf50e0 XL |
9 | /// |
10 | /// This function will use an SSE2 enhanced implementation if hardware support | |
11 | /// is detected at runtime. | |
b7449926 | 12 | pub fn analyze_source_file( |
8faf50e0 | 13 | src: &str, |
dfeec247 XL |
14 | source_file_start_pos: BytePos, |
15 | ) -> (Vec<BytePos>, Vec<MultiByteChar>, Vec<NonNarrowChar>) { | |
b7449926 | 16 | let mut lines = vec![source_file_start_pos]; |
8faf50e0 XL |
17 | let mut multi_byte_chars = vec![]; |
18 | let mut non_narrow_chars = vec![]; | |
19 | ||
20 | // Calls the right implementation, depending on hardware support available. | |
dfeec247 XL |
21 | analyze_source_file_dispatch( |
22 | src, | |
23 | source_file_start_pos, | |
24 | &mut lines, | |
25 | &mut multi_byte_chars, | |
26 | &mut non_narrow_chars, | |
27 | ); | |
8faf50e0 XL |
28 | |
29 | // The code above optimistically registers a new line *after* each \n | |
b7449926 | 30 | // it encounters. If that point is already outside the source_file, remove |
8faf50e0 XL |
31 | // it again. |
32 | if let Some(&last_line_start) = lines.last() { | |
a1dfa0c6 XL |
33 | let source_file_end = source_file_start_pos + BytePos::from_usize(src.len()); |
34 | assert!(source_file_end >= last_line_start); | |
35 | if last_line_start == source_file_end { | |
8faf50e0 XL |
36 | lines.pop(); |
37 | } | |
38 | } | |
39 | ||
40 | (lines, multi_byte_chars, non_narrow_chars) | |
41 | } | |
42 | ||
9fa01778 | 43 | cfg_if::cfg_if! { |
487cf647 | 44 | if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { |
b7449926 XL |
45 | fn analyze_source_file_dispatch(src: &str, |
46 | source_file_start_pos: BytePos, | |
8faf50e0 XL |
47 | lines: &mut Vec<BytePos>, |
48 | multi_byte_chars: &mut Vec<MultiByteChar>, | |
49 | non_narrow_chars: &mut Vec<NonNarrowChar>) { | |
50 | if is_x86_feature_detected!("sse2") { | |
51 | unsafe { | |
b7449926 XL |
52 | analyze_source_file_sse2(src, |
53 | source_file_start_pos, | |
8faf50e0 XL |
54 | lines, |
55 | multi_byte_chars, | |
56 | non_narrow_chars); | |
57 | } | |
58 | } else { | |
b7449926 | 59 | analyze_source_file_generic(src, |
8faf50e0 | 60 | src.len(), |
b7449926 | 61 | source_file_start_pos, |
8faf50e0 XL |
62 | lines, |
63 | multi_byte_chars, | |
64 | non_narrow_chars); | |
65 | ||
66 | } | |
67 | } | |
68 | ||
9fa01778 | 69 | /// Checks 16 byte chunks of text at a time. If the chunk contains |
8faf50e0 XL |
70 | /// something other than printable ASCII characters and newlines, the |
71 | /// function falls back to the generic implementation. Otherwise it uses | |
72 | /// SSE2 intrinsics to quickly find all newlines. | |
73 | #[target_feature(enable = "sse2")] | |
b7449926 | 74 | unsafe fn analyze_source_file_sse2(src: &str, |
8faf50e0 XL |
75 | output_offset: BytePos, |
76 | lines: &mut Vec<BytePos>, | |
77 | multi_byte_chars: &mut Vec<MultiByteChar>, | |
78 | non_narrow_chars: &mut Vec<NonNarrowChar>) { | |
79 | #[cfg(target_arch = "x86")] | |
80 | use std::arch::x86::*; | |
81 | #[cfg(target_arch = "x86_64")] | |
82 | use std::arch::x86_64::*; | |
83 | ||
84 | const CHUNK_SIZE: usize = 16; | |
85 | ||
86 | let src_bytes = src.as_bytes(); | |
87 | ||
88 | let chunk_count = src.len() / CHUNK_SIZE; | |
89 | ||
90 | // This variable keeps track of where we should start decoding a | |
91 | // chunk. If a multi-byte character spans across chunk boundaries, | |
92 | // we need to skip that part in the next chunk because we already | |
93 | // handled it. | |
94 | let mut intra_chunk_offset = 0; | |
95 | ||
96 | for chunk_index in 0 .. chunk_count { | |
97 | let ptr = src_bytes.as_ptr() as *const __m128i; | |
98 | // We don't know if the pointer is aligned to 16 bytes, so we | |
99 | // use `loadu`, which supports unaligned loading. | |
fc512014 | 100 | let chunk = _mm_loadu_si128(ptr.add(chunk_index)); |
8faf50e0 XL |
101 | |
102 | // For character in the chunk, see if its byte value is < 0, which | |
103 | // indicates that it's part of a UTF-8 char. | |
104 | let multibyte_test = _mm_cmplt_epi8(chunk, _mm_set1_epi8(0)); | |
105 | // Create a bit mask from the comparison results. | |
106 | let multibyte_mask = _mm_movemask_epi8(multibyte_test); | |
107 | ||
108 | // If the bit mask is all zero, we only have ASCII chars here: | |
109 | if multibyte_mask == 0 { | |
110 | assert!(intra_chunk_offset == 0); | |
111 | ||
112 | // Check if there are any control characters in the chunk. All | |
113 | // control characters that we can encounter at this point have a | |
114 | // byte value less than 32 or ... | |
115 | let control_char_test0 = _mm_cmplt_epi8(chunk, _mm_set1_epi8(32)); | |
116 | let control_char_mask0 = _mm_movemask_epi8(control_char_test0); | |
117 | ||
118 | // ... it's the ASCII 'DEL' character with a value of 127. | |
119 | let control_char_test1 = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(127)); | |
120 | let control_char_mask1 = _mm_movemask_epi8(control_char_test1); | |
121 | ||
122 | let control_char_mask = control_char_mask0 | control_char_mask1; | |
123 | ||
124 | if control_char_mask != 0 { | |
125 | // Check for newlines in the chunk | |
126 | let newlines_test = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8)); | |
127 | let newlines_mask = _mm_movemask_epi8(newlines_test); | |
128 | ||
129 | if control_char_mask == newlines_mask { | |
130 | // All control characters are newlines, record them | |
131 | let mut newlines_mask = 0xFFFF0000 | newlines_mask as u32; | |
132 | let output_offset = output_offset + | |
133 | BytePos::from_usize(chunk_index * CHUNK_SIZE + 1); | |
134 | ||
135 | loop { | |
136 | let index = newlines_mask.trailing_zeros(); | |
137 | ||
138 | if index >= CHUNK_SIZE as u32 { | |
139 | // We have arrived at the end of the chunk. | |
140 | break | |
141 | } | |
142 | ||
143 | lines.push(BytePos(index) + output_offset); | |
144 | ||
145 | // Clear the bit, so we can find the next one. | |
146 | newlines_mask &= (!1) << index; | |
147 | } | |
148 | ||
149 | // We are done for this chunk. All control characters were | |
150 | // newlines and we took care of those. | |
151 | continue | |
152 | } else { | |
153 | // Some of the control characters are not newlines, | |
154 | // fall through to the slow path below. | |
155 | } | |
156 | } else { | |
157 | // No control characters, nothing to record for this chunk | |
158 | continue | |
159 | } | |
160 | } | |
161 | ||
162 | // The slow path. | |
163 | // There are control chars in here, fallback to generic decoding. | |
164 | let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset; | |
b7449926 | 165 | intra_chunk_offset = analyze_source_file_generic( |
8faf50e0 XL |
166 | &src[scan_start .. ], |
167 | CHUNK_SIZE - intra_chunk_offset, | |
168 | BytePos::from_usize(scan_start) + output_offset, | |
169 | lines, | |
170 | multi_byte_chars, | |
171 | non_narrow_chars | |
172 | ); | |
173 | } | |
174 | ||
175 | // There might still be a tail left to analyze | |
176 | let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset; | |
177 | if tail_start < src.len() { | |
b7449926 | 178 | analyze_source_file_generic(&src[tail_start as usize ..], |
8faf50e0 XL |
179 | src.len() - tail_start, |
180 | output_offset + BytePos::from_usize(tail_start), | |
181 | lines, | |
182 | multi_byte_chars, | |
183 | non_narrow_chars); | |
184 | } | |
185 | } | |
186 | } else { | |
187 | ||
188 | // The target (or compiler version) does not support SSE2 ... | |
b7449926 XL |
189 | fn analyze_source_file_dispatch(src: &str, |
190 | source_file_start_pos: BytePos, | |
8faf50e0 XL |
191 | lines: &mut Vec<BytePos>, |
192 | multi_byte_chars: &mut Vec<MultiByteChar>, | |
193 | non_narrow_chars: &mut Vec<NonNarrowChar>) { | |
b7449926 | 194 | analyze_source_file_generic(src, |
8faf50e0 | 195 | src.len(), |
b7449926 | 196 | source_file_start_pos, |
8faf50e0 XL |
197 | lines, |
198 | multi_byte_chars, | |
199 | non_narrow_chars); | |
200 | } | |
201 | } | |
202 | } | |
203 | ||
204 | // `scan_len` determines the number of bytes in `src` to scan. Note that the | |
205 | // function can read past `scan_len` if a multi-byte character start within the | |
206 | // range but extends past it. The overflow is returned by the function. | |
dfeec247 XL |
207 | fn analyze_source_file_generic( |
208 | src: &str, | |
209 | scan_len: usize, | |
210 | output_offset: BytePos, | |
211 | lines: &mut Vec<BytePos>, | |
212 | multi_byte_chars: &mut Vec<MultiByteChar>, | |
213 | non_narrow_chars: &mut Vec<NonNarrowChar>, | |
214 | ) -> usize { | |
8faf50e0 XL |
215 | assert!(src.len() >= scan_len); |
216 | let mut i = 0; | |
217 | let src_bytes = src.as_bytes(); | |
218 | ||
219 | while i < scan_len { | |
220 | let byte = unsafe { | |
221 | // We verified that i < scan_len <= src.len() | |
222 | *src_bytes.get_unchecked(i as usize) | |
223 | }; | |
224 | ||
225 | // How much to advance in order to get to the next UTF-8 char in the | |
226 | // string. | |
227 | let mut char_len = 1; | |
228 | ||
229 | if byte < 32 { | |
230 | // This is an ASCII control character, it could be one of the cases | |
231 | // that are interesting to us. | |
232 | ||
233 | let pos = BytePos::from_usize(i) + output_offset; | |
234 | ||
235 | match byte { | |
236 | b'\n' => { | |
237 | lines.push(pos + BytePos(1)); | |
238 | } | |
239 | b'\t' => { | |
240 | non_narrow_chars.push(NonNarrowChar::Tab(pos)); | |
241 | } | |
242 | _ => { | |
243 | non_narrow_chars.push(NonNarrowChar::ZeroWidth(pos)); | |
244 | } | |
245 | } | |
246 | } else if byte >= 127 { | |
247 | // The slow path: | |
248 | // This is either ASCII control character "DEL" or the beginning of | |
249 | // a multibyte char. Just decode to `char`. | |
487cf647 | 250 | let c = src[i..].chars().next().unwrap(); |
8faf50e0 XL |
251 | char_len = c.len_utf8(); |
252 | ||
253 | let pos = BytePos::from_usize(i) + output_offset; | |
254 | ||
255 | if char_len > 1 { | |
fc512014 | 256 | assert!((2..=4).contains(&char_len)); |
dfeec247 | 257 | let mbc = MultiByteChar { pos, bytes: char_len as u8 }; |
8faf50e0 XL |
258 | multi_byte_chars.push(mbc); |
259 | } | |
260 | ||
261 | // Assume control characters are zero width. | |
262 | // FIXME: How can we decide between `width` and `width_cjk`? | |
263 | let char_width = UnicodeWidthChar::width(c).unwrap_or(0); | |
264 | ||
265 | if char_width != 1 { | |
266 | non_narrow_chars.push(NonNarrowChar::new(pos, char_width)); | |
267 | } | |
268 | } | |
269 | ||
270 | i += char_len; | |
271 | } | |
272 | ||
273 | i - scan_len | |
274 | } |