]> git.proxmox.com Git - rustc.git/blame - compiler/rustc_span/src/analyze_source_file.rs
New upstream version 1.67.1+dfsg1
[rustc.git] / compiler / rustc_span / src / analyze_source_file.rs
CommitLineData
8faf50e0 1use super::*;
dfeec247 2use unicode_width::UnicodeWidthChar;
8faf50e0 3
416331ca
XL
4#[cfg(test)]
5mod tests;
6
9fa01778 7/// Finds all newlines, multi-byte characters, and non-narrow characters in a
b7449926 8/// SourceFile.
8faf50e0
XL
9///
10/// This function will use an SSE2 enhanced implementation if hardware support
11/// is detected at runtime.
b7449926 12pub fn analyze_source_file(
8faf50e0 13 src: &str,
dfeec247
XL
14 source_file_start_pos: BytePos,
15) -> (Vec<BytePos>, Vec<MultiByteChar>, Vec<NonNarrowChar>) {
b7449926 16 let mut lines = vec![source_file_start_pos];
8faf50e0
XL
17 let mut multi_byte_chars = vec![];
18 let mut non_narrow_chars = vec![];
19
20 // Calls the right implementation, depending on hardware support available.
dfeec247
XL
21 analyze_source_file_dispatch(
22 src,
23 source_file_start_pos,
24 &mut lines,
25 &mut multi_byte_chars,
26 &mut non_narrow_chars,
27 );
8faf50e0
XL
28
29 // The code above optimistically registers a new line *after* each \n
b7449926 30 // it encounters. If that point is already outside the source_file, remove
8faf50e0
XL
31 // it again.
32 if let Some(&last_line_start) = lines.last() {
a1dfa0c6
XL
33 let source_file_end = source_file_start_pos + BytePos::from_usize(src.len());
34 assert!(source_file_end >= last_line_start);
35 if last_line_start == source_file_end {
8faf50e0
XL
36 lines.pop();
37 }
38 }
39
40 (lines, multi_byte_chars, non_narrow_chars)
41}
42
9fa01778 43cfg_if::cfg_if! {
487cf647 44 if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
b7449926
XL
45 fn analyze_source_file_dispatch(src: &str,
46 source_file_start_pos: BytePos,
8faf50e0
XL
47 lines: &mut Vec<BytePos>,
48 multi_byte_chars: &mut Vec<MultiByteChar>,
49 non_narrow_chars: &mut Vec<NonNarrowChar>) {
50 if is_x86_feature_detected!("sse2") {
51 unsafe {
b7449926
XL
52 analyze_source_file_sse2(src,
53 source_file_start_pos,
8faf50e0
XL
54 lines,
55 multi_byte_chars,
56 non_narrow_chars);
57 }
58 } else {
b7449926 59 analyze_source_file_generic(src,
8faf50e0 60 src.len(),
b7449926 61 source_file_start_pos,
8faf50e0
XL
62 lines,
63 multi_byte_chars,
64 non_narrow_chars);
65
66 }
67 }
68
9fa01778 69 /// Checks 16 byte chunks of text at a time. If the chunk contains
8faf50e0
XL
70 /// something other than printable ASCII characters and newlines, the
71 /// function falls back to the generic implementation. Otherwise it uses
72 /// SSE2 intrinsics to quickly find all newlines.
73 #[target_feature(enable = "sse2")]
b7449926 74 unsafe fn analyze_source_file_sse2(src: &str,
8faf50e0
XL
75 output_offset: BytePos,
76 lines: &mut Vec<BytePos>,
77 multi_byte_chars: &mut Vec<MultiByteChar>,
78 non_narrow_chars: &mut Vec<NonNarrowChar>) {
79 #[cfg(target_arch = "x86")]
80 use std::arch::x86::*;
81 #[cfg(target_arch = "x86_64")]
82 use std::arch::x86_64::*;
83
84 const CHUNK_SIZE: usize = 16;
85
86 let src_bytes = src.as_bytes();
87
88 let chunk_count = src.len() / CHUNK_SIZE;
89
90 // This variable keeps track of where we should start decoding a
91 // chunk. If a multi-byte character spans across chunk boundaries,
92 // we need to skip that part in the next chunk because we already
93 // handled it.
94 let mut intra_chunk_offset = 0;
95
96 for chunk_index in 0 .. chunk_count {
97 let ptr = src_bytes.as_ptr() as *const __m128i;
98 // We don't know if the pointer is aligned to 16 bytes, so we
99 // use `loadu`, which supports unaligned loading.
fc512014 100 let chunk = _mm_loadu_si128(ptr.add(chunk_index));
8faf50e0
XL
101
102 // For character in the chunk, see if its byte value is < 0, which
103 // indicates that it's part of a UTF-8 char.
104 let multibyte_test = _mm_cmplt_epi8(chunk, _mm_set1_epi8(0));
105 // Create a bit mask from the comparison results.
106 let multibyte_mask = _mm_movemask_epi8(multibyte_test);
107
108 // If the bit mask is all zero, we only have ASCII chars here:
109 if multibyte_mask == 0 {
110 assert!(intra_chunk_offset == 0);
111
112 // Check if there are any control characters in the chunk. All
113 // control characters that we can encounter at this point have a
114 // byte value less than 32 or ...
115 let control_char_test0 = _mm_cmplt_epi8(chunk, _mm_set1_epi8(32));
116 let control_char_mask0 = _mm_movemask_epi8(control_char_test0);
117
118 // ... it's the ASCII 'DEL' character with a value of 127.
119 let control_char_test1 = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(127));
120 let control_char_mask1 = _mm_movemask_epi8(control_char_test1);
121
122 let control_char_mask = control_char_mask0 | control_char_mask1;
123
124 if control_char_mask != 0 {
125 // Check for newlines in the chunk
126 let newlines_test = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8));
127 let newlines_mask = _mm_movemask_epi8(newlines_test);
128
129 if control_char_mask == newlines_mask {
130 // All control characters are newlines, record them
131 let mut newlines_mask = 0xFFFF0000 | newlines_mask as u32;
132 let output_offset = output_offset +
133 BytePos::from_usize(chunk_index * CHUNK_SIZE + 1);
134
135 loop {
136 let index = newlines_mask.trailing_zeros();
137
138 if index >= CHUNK_SIZE as u32 {
139 // We have arrived at the end of the chunk.
140 break
141 }
142
143 lines.push(BytePos(index) + output_offset);
144
145 // Clear the bit, so we can find the next one.
146 newlines_mask &= (!1) << index;
147 }
148
149 // We are done for this chunk. All control characters were
150 // newlines and we took care of those.
151 continue
152 } else {
153 // Some of the control characters are not newlines,
154 // fall through to the slow path below.
155 }
156 } else {
157 // No control characters, nothing to record for this chunk
158 continue
159 }
160 }
161
162 // The slow path.
163 // There are control chars in here, fallback to generic decoding.
164 let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
b7449926 165 intra_chunk_offset = analyze_source_file_generic(
8faf50e0
XL
166 &src[scan_start .. ],
167 CHUNK_SIZE - intra_chunk_offset,
168 BytePos::from_usize(scan_start) + output_offset,
169 lines,
170 multi_byte_chars,
171 non_narrow_chars
172 );
173 }
174
175 // There might still be a tail left to analyze
176 let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset;
177 if tail_start < src.len() {
b7449926 178 analyze_source_file_generic(&src[tail_start as usize ..],
8faf50e0
XL
179 src.len() - tail_start,
180 output_offset + BytePos::from_usize(tail_start),
181 lines,
182 multi_byte_chars,
183 non_narrow_chars);
184 }
185 }
186 } else {
187
188 // The target (or compiler version) does not support SSE2 ...
b7449926
XL
189 fn analyze_source_file_dispatch(src: &str,
190 source_file_start_pos: BytePos,
8faf50e0
XL
191 lines: &mut Vec<BytePos>,
192 multi_byte_chars: &mut Vec<MultiByteChar>,
193 non_narrow_chars: &mut Vec<NonNarrowChar>) {
b7449926 194 analyze_source_file_generic(src,
8faf50e0 195 src.len(),
b7449926 196 source_file_start_pos,
8faf50e0
XL
197 lines,
198 multi_byte_chars,
199 non_narrow_chars);
200 }
201 }
202}
203
204// `scan_len` determines the number of bytes in `src` to scan. Note that the
205// function can read past `scan_len` if a multi-byte character start within the
206// range but extends past it. The overflow is returned by the function.
dfeec247
XL
207fn analyze_source_file_generic(
208 src: &str,
209 scan_len: usize,
210 output_offset: BytePos,
211 lines: &mut Vec<BytePos>,
212 multi_byte_chars: &mut Vec<MultiByteChar>,
213 non_narrow_chars: &mut Vec<NonNarrowChar>,
214) -> usize {
8faf50e0
XL
215 assert!(src.len() >= scan_len);
216 let mut i = 0;
217 let src_bytes = src.as_bytes();
218
219 while i < scan_len {
220 let byte = unsafe {
221 // We verified that i < scan_len <= src.len()
222 *src_bytes.get_unchecked(i as usize)
223 };
224
225 // How much to advance in order to get to the next UTF-8 char in the
226 // string.
227 let mut char_len = 1;
228
229 if byte < 32 {
230 // This is an ASCII control character, it could be one of the cases
231 // that are interesting to us.
232
233 let pos = BytePos::from_usize(i) + output_offset;
234
235 match byte {
236 b'\n' => {
237 lines.push(pos + BytePos(1));
238 }
239 b'\t' => {
240 non_narrow_chars.push(NonNarrowChar::Tab(pos));
241 }
242 _ => {
243 non_narrow_chars.push(NonNarrowChar::ZeroWidth(pos));
244 }
245 }
246 } else if byte >= 127 {
247 // The slow path:
248 // This is either ASCII control character "DEL" or the beginning of
249 // a multibyte char. Just decode to `char`.
487cf647 250 let c = src[i..].chars().next().unwrap();
8faf50e0
XL
251 char_len = c.len_utf8();
252
253 let pos = BytePos::from_usize(i) + output_offset;
254
255 if char_len > 1 {
fc512014 256 assert!((2..=4).contains(&char_len));
dfeec247 257 let mbc = MultiByteChar { pos, bytes: char_len as u8 };
8faf50e0
XL
258 multi_byte_chars.push(mbc);
259 }
260
261 // Assume control characters are zero width.
262 // FIXME: How can we decide between `width` and `width_cjk`?
263 let char_width = UnicodeWidthChar::width(c).unwrap_or(0);
264
265 if char_width != 1 {
266 non_narrow_chars.push(NonNarrowChar::new(pos, char_width));
267 }
268 }
269
270 i += char_len;
271 }
272
273 i - scan_len
274}