2 use unicode_width
::UnicodeWidthChar
;
7 /// Finds all newlines, multi-byte characters, and non-narrow characters in a
10 /// This function will use an SSE2 enhanced implementation if hardware support
11 /// is detected at runtime.
12 pub fn analyze_source_file(
14 source_file_start_pos
: BytePos
,
15 ) -> (Vec
<BytePos
>, Vec
<MultiByteChar
>, Vec
<NonNarrowChar
>) {
16 let mut lines
= vec
![source_file_start_pos
];
17 let mut multi_byte_chars
= vec
![];
18 let mut non_narrow_chars
= vec
![];
20 // Calls the right implementation, depending on hardware support available.
21 analyze_source_file_dispatch(
23 source_file_start_pos
,
25 &mut multi_byte_chars
,
26 &mut non_narrow_chars
,
29 // The code above optimistically registers a new line *after* each \n
30 // it encounters. If that point is already outside the source_file, remove
32 if let Some(&last_line_start
) = lines
.last() {
33 let source_file_end
= source_file_start_pos
+ BytePos
::from_usize(src
.len());
34 assert
!(source_file_end
>= last_line_start
);
35 if last_line_start
== source_file_end
{
40 (lines
, multi_byte_chars
, non_narrow_chars
)
44 if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64")))] {
45 fn analyze_source_file_dispatch(src
: &str,
46 source_file_start_pos
: BytePos
,
47 lines
: &mut Vec
<BytePos
>,
48 multi_byte_chars
: &mut Vec
<MultiByteChar
>,
49 non_narrow_chars
: &mut Vec
<NonNarrowChar
>) {
50 if is_x86_feature_detected
!("sse2") {
52 analyze_source_file_sse2(src
,
53 source_file_start_pos
,
59 analyze_source_file_generic(src
,
61 source_file_start_pos
,
69 /// Checks 16 byte chunks of text at a time. If the chunk contains
70 /// something other than printable ASCII characters and newlines, the
71 /// function falls back to the generic implementation. Otherwise it uses
72 /// SSE2 intrinsics to quickly find all newlines.
73 #[target_feature(enable = "sse2")]
74 unsafe fn analyze_source_file_sse2(src
: &str,
75 output_offset
: BytePos
,
76 lines
: &mut Vec
<BytePos
>,
77 multi_byte_chars
: &mut Vec
<MultiByteChar
>,
78 non_narrow_chars
: &mut Vec
<NonNarrowChar
>) {
79 #[cfg(target_arch = "x86")]
80 use std
::arch
::x86
::*;
81 #[cfg(target_arch = "x86_64")]
82 use std
::arch
::x86_64
::*;
84 const CHUNK_SIZE
: usize = 16;
86 let src_bytes
= src
.as_bytes();
88 let chunk_count
= src
.len() / CHUNK_SIZE
;
90 // This variable keeps track of where we should start decoding a
91 // chunk. If a multi-byte character spans across chunk boundaries,
92 // we need to skip that part in the next chunk because we already
94 let mut intra_chunk_offset
= 0;
96 for chunk_index
in 0 .. chunk_count
{
97 let ptr
= src_bytes
.as_ptr() as *const __m128i
;
98 // We don't know if the pointer is aligned to 16 bytes, so we
99 // use `loadu`, which supports unaligned loading.
100 let chunk
= _mm_loadu_si128(ptr
.add(chunk_index
));
102 // For character in the chunk, see if its byte value is < 0, which
103 // indicates that it's part of a UTF-8 char.
104 let multibyte_test
= _mm_cmplt_epi8(chunk
, _mm_set1_epi8(0));
105 // Create a bit mask from the comparison results.
106 let multibyte_mask
= _mm_movemask_epi8(multibyte_test
);
108 // If the bit mask is all zero, we only have ASCII chars here:
109 if multibyte_mask
== 0 {
110 assert
!(intra_chunk_offset
== 0);
112 // Check if there are any control characters in the chunk. All
113 // control characters that we can encounter at this point have a
114 // byte value less than 32 or ...
115 let control_char_test0
= _mm_cmplt_epi8(chunk
, _mm_set1_epi8(32));
116 let control_char_mask0
= _mm_movemask_epi8(control_char_test0
);
118 // ... it's the ASCII 'DEL' character with a value of 127.
119 let control_char_test1
= _mm_cmpeq_epi8(chunk
, _mm_set1_epi8(127));
120 let control_char_mask1
= _mm_movemask_epi8(control_char_test1
);
122 let control_char_mask
= control_char_mask0
| control_char_mask1
;
124 if control_char_mask
!= 0 {
125 // Check for newlines in the chunk
126 let newlines_test
= _mm_cmpeq_epi8(chunk
, _mm_set1_epi8(b'
\n'
as i8));
127 let newlines_mask
= _mm_movemask_epi8(newlines_test
);
129 if control_char_mask
== newlines_mask
{
130 // All control characters are newlines, record them
131 let mut newlines_mask
= 0xFFFF0000 | newlines_mask
as u32;
132 let output_offset
= output_offset
+
133 BytePos
::from_usize(chunk_index
* CHUNK_SIZE
+ 1);
136 let index
= newlines_mask
.trailing_zeros();
138 if index
>= CHUNK_SIZE
as u32 {
139 // We have arrived at the end of the chunk.
143 lines
.push(BytePos(index
) + output_offset
);
145 // Clear the bit, so we can find the next one.
146 newlines_mask
&= (!1) << index
;
149 // We are done for this chunk. All control characters were
150 // newlines and we took care of those.
153 // Some of the control characters are not newlines,
154 // fall through to the slow path below.
157 // No control characters, nothing to record for this chunk
163 // There are control chars in here, fallback to generic decoding.
164 let scan_start
= chunk_index
* CHUNK_SIZE
+ intra_chunk_offset
;
165 intra_chunk_offset
= analyze_source_file_generic(
166 &src
[scan_start
.. ],
167 CHUNK_SIZE
- intra_chunk_offset
,
168 BytePos
::from_usize(scan_start
) + output_offset
,
175 // There might still be a tail left to analyze
176 let tail_start
= chunk_count
* CHUNK_SIZE
+ intra_chunk_offset
;
177 if tail_start
< src
.len() {
178 analyze_source_file_generic(&src
[tail_start
as usize ..],
179 src
.len() - tail_start
,
180 output_offset
+ BytePos
::from_usize(tail_start
),
188 // The target (or compiler version) does not support SSE2 ...
189 fn analyze_source_file_dispatch(src
: &str,
190 source_file_start_pos
: BytePos
,
191 lines
: &mut Vec
<BytePos
>,
192 multi_byte_chars
: &mut Vec
<MultiByteChar
>,
193 non_narrow_chars
: &mut Vec
<NonNarrowChar
>) {
194 analyze_source_file_generic(src
,
196 source_file_start_pos
,
204 // `scan_len` determines the number of bytes in `src` to scan. Note that the
205 // function can read past `scan_len` if a multi-byte character start within the
206 // range but extends past it. The overflow is returned by the function.
207 fn analyze_source_file_generic(
210 output_offset
: BytePos
,
211 lines
: &mut Vec
<BytePos
>,
212 multi_byte_chars
: &mut Vec
<MultiByteChar
>,
213 non_narrow_chars
: &mut Vec
<NonNarrowChar
>,
215 assert
!(src
.len() >= scan_len
);
217 let src_bytes
= src
.as_bytes();
221 // We verified that i < scan_len <= src.len()
222 *src_bytes
.get_unchecked(i
as usize)
225 // How much to advance in order to get to the next UTF-8 char in the
227 let mut char_len
= 1;
230 // This is an ASCII control character, it could be one of the cases
231 // that are interesting to us.
233 let pos
= BytePos
::from_usize(i
) + output_offset
;
237 lines
.push(pos
+ BytePos(1));
240 non_narrow_chars
.push(NonNarrowChar
::Tab(pos
));
243 non_narrow_chars
.push(NonNarrowChar
::ZeroWidth(pos
));
246 } else if byte
>= 127 {
248 // This is either ASCII control character "DEL" or the beginning of
249 // a multibyte char. Just decode to `char`.
250 let c
= (&src
[i
..]).chars().next().unwrap();
251 char_len
= c
.len_utf8();
253 let pos
= BytePos
::from_usize(i
) + output_offset
;
256 assert
!((2..=4).contains(&char_len
));
257 let mbc
= MultiByteChar { pos, bytes: char_len as u8 }
;
258 multi_byte_chars
.push(mbc
);
261 // Assume control characters are zero width.
262 // FIXME: How can we decide between `width` and `width_cjk`?
263 let char_width
= UnicodeWidthChar
::width(c
).unwrap_or(0);
266 non_narrow_chars
.push(NonNarrowChar
::new(pos
, char_width
));