compiler/rustc_span/src/analyze_source_file.rs

   1 use super::*;
   2 use unicode_width::UnicodeWidthChar;
   3
   4 #[cfg(test)]
   5 mod tests;
   6
   7 /// Finds all newlines, multi-byte characters, and non-narrow characters in a
   8 /// SourceFile.
   9 ///
  10 /// This function will use an SSE2 enhanced implementation if hardware support
  11 /// is detected at runtime.
  12 pub fn analyze_source_file(
  13     src: &str,
  14     source_file_start_pos: BytePos,
  15 ) -> (Vec<BytePos>, Vec<MultiByteChar>, Vec<NonNarrowChar>) {
  16     let mut lines = vec![source_file_start_pos];
  17     let mut multi_byte_chars = vec![];
  18     let mut non_narrow_chars = vec![];
  19
  20     // Calls the right implementation, depending on hardware support available.
  21     analyze_source_file_dispatch(
  22         src,
  23         source_file_start_pos,
  24         &mut lines,
  25         &mut multi_byte_chars,
  26         &mut non_narrow_chars,
  27     );
  28
  29     // The code above optimistically registers a new line *after* each \n
  30     // it encounters. If that point is already outside the source_file, remove
  31     // it again.
  32     if let Some(&last_line_start) = lines.last() {
  33         let source_file_end = source_file_start_pos + BytePos::from_usize(src.len());
  34         assert!(source_file_end >= last_line_start);
  35         if last_line_start == source_file_end {
  36             lines.pop();
  37         }
  38     }
  39
  40     (lines, multi_byte_chars, non_narrow_chars)
  41 }
  42
  43 cfg_if::cfg_if! {
  44     if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64")))] {
  45         fn analyze_source_file_dispatch(src: &str,
  46                                     source_file_start_pos: BytePos,
  47                                     lines: &mut Vec<BytePos>,
  48                                     multi_byte_chars: &mut Vec<MultiByteChar>,
  49                                     non_narrow_chars: &mut Vec<NonNarrowChar>) {
  50             if is_x86_feature_detected!("sse2") {
  51                 unsafe {
  52                     analyze_source_file_sse2(src,
  53                                          source_file_start_pos,
  54                                          lines,
  55                                          multi_byte_chars,
  56                                          non_narrow_chars);
  57                 }
  58             } else {
  59                 analyze_source_file_generic(src,
  60                                         src.len(),
  61                                         source_file_start_pos,
  62                                         lines,
  63                                         multi_byte_chars,
  64                                         non_narrow_chars);
  65
  66             }
  67         }
  68
  69         /// Checks 16 byte chunks of text at a time. If the chunk contains
  70         /// something other than printable ASCII characters and newlines, the
  71         /// function falls back to the generic implementation. Otherwise it uses
  72         /// SSE2 intrinsics to quickly find all newlines.
  73         #[target_feature(enable = "sse2")]
  74         unsafe fn analyze_source_file_sse2(src: &str,
  75                                        output_offset: BytePos,
  76                                        lines: &mut Vec<BytePos>,
  77                                        multi_byte_chars: &mut Vec<MultiByteChar>,
  78                                        non_narrow_chars: &mut Vec<NonNarrowChar>) {
  79             #[cfg(target_arch = "x86")]
  80             use std::arch::x86::*;
  81             #[cfg(target_arch = "x86_64")]
  82             use std::arch::x86_64::*;
  83
  84             const CHUNK_SIZE: usize = 16;
  85
  86             let src_bytes = src.as_bytes();
  87
  88             let chunk_count = src.len() / CHUNK_SIZE;
  89
  90             // This variable keeps track of where we should start decoding a
  91             // chunk. If a multi-byte character spans across chunk boundaries,
  92             // we need to skip that part in the next chunk because we already
  93             // handled it.
  94             let mut intra_chunk_offset = 0;
  95
  96             for chunk_index in 0 .. chunk_count {
  97                 let ptr = src_bytes.as_ptr() as *const __m128i;
  98                 // We don't know if the pointer is aligned to 16 bytes, so we
  99                 // use `loadu`, which supports unaligned loading.
 100                 let chunk = _mm_loadu_si128(ptr.add(chunk_index));
 101
 102                 // For character in the chunk, see if its byte value is < 0, which
 103                 // indicates that it's part of a UTF-8 char.
 104                 let multibyte_test = _mm_cmplt_epi8(chunk, _mm_set1_epi8(0));
 105                 // Create a bit mask from the comparison results.
 106                 let multibyte_mask = _mm_movemask_epi8(multibyte_test);
 107
 108                 // If the bit mask is all zero, we only have ASCII chars here:
 109                 if multibyte_mask == 0 {
 110                     assert!(intra_chunk_offset == 0);
 111
 112                     // Check if there are any control characters in the chunk. All
 113                     // control characters that we can encounter at this point have a
 114                     // byte value less than 32 or ...
 115                     let control_char_test0 = _mm_cmplt_epi8(chunk, _mm_set1_epi8(32));
 116                     let control_char_mask0 = _mm_movemask_epi8(control_char_test0);
 117
 118                     // ... it's the ASCII 'DEL' character with a value of 127.
 119                     let control_char_test1 = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(127));
 120                     let control_char_mask1 = _mm_movemask_epi8(control_char_test1);
 121
 122                     let control_char_mask = control_char_mask0 | control_char_mask1;
 123
 124                     if control_char_mask != 0 {
 125                         // Check for newlines in the chunk
 126                         let newlines_test = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8));
 127                         let newlines_mask = _mm_movemask_epi8(newlines_test);
 128
 129                         if control_char_mask == newlines_mask {
 130                             // All control characters are newlines, record them
 131                             let mut newlines_mask = 0xFFFF0000 | newlines_mask as u32;
 132                             let output_offset = output_offset +
 133                                 BytePos::from_usize(chunk_index * CHUNK_SIZE + 1);
 134
 135                             loop {
 136                                 let index = newlines_mask.trailing_zeros();
 137
 138                                 if index >= CHUNK_SIZE as u32 {
 139                                     // We have arrived at the end of the chunk.
 140                                     break
 141                                 }
 142
 143                                 lines.push(BytePos(index) + output_offset);
 144
 145                                 // Clear the bit, so we can find the next one.
 146                                 newlines_mask &= (!1) << index;
 147                             }
 148
 149                             // We are done for this chunk. All control characters were
 150                             // newlines and we took care of those.
 151                             continue
 152                         } else {
 153                             // Some of the control characters are not newlines,
 154                             // fall through to the slow path below.
 155                         }
 156                     } else {
 157                         // No control characters, nothing to record for this chunk
 158                         continue
 159                     }
 160                 }
 161
 162                 // The slow path.
 163                 // There are control chars in here, fallback to generic decoding.
 164                 let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
 165                 intra_chunk_offset = analyze_source_file_generic(
 166                     &src[scan_start .. ],
 167                     CHUNK_SIZE - intra_chunk_offset,
 168                     BytePos::from_usize(scan_start) + output_offset,
 169                     lines,
 170                     multi_byte_chars,
 171                     non_narrow_chars
 172                 );
 173             }
 174
 175             // There might still be a tail left to analyze
 176             let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset;
 177             if tail_start < src.len() {
 178                 analyze_source_file_generic(&src[tail_start as usize ..],
 179                                         src.len() - tail_start,
 180                                         output_offset + BytePos::from_usize(tail_start),
 181                                         lines,
 182                                         multi_byte_chars,
 183                                         non_narrow_chars);
 184             }
 185         }
 186     } else {
 187
 188         // The target (or compiler version) does not support SSE2 ...
 189         fn analyze_source_file_dispatch(src: &str,
 190                                     source_file_start_pos: BytePos,
 191                                     lines: &mut Vec<BytePos>,
 192                                     multi_byte_chars: &mut Vec<MultiByteChar>,
 193                                     non_narrow_chars: &mut Vec<NonNarrowChar>) {
 194             analyze_source_file_generic(src,
 195                                     src.len(),
 196                                     source_file_start_pos,
 197                                     lines,
 198                                     multi_byte_chars,
 199                                     non_narrow_chars);
 200         }
 201     }
 202 }
 203
 204 // `scan_len` determines the number of bytes in `src` to scan. Note that the
 205 // function can read past `scan_len` if a multi-byte character start within the
 206 // range but extends past it. The overflow is returned by the function.
 207 fn analyze_source_file_generic(
 208     src: &str,
 209     scan_len: usize,
 210     output_offset: BytePos,
 211     lines: &mut Vec<BytePos>,
 212     multi_byte_chars: &mut Vec<MultiByteChar>,
 213     non_narrow_chars: &mut Vec<NonNarrowChar>,
 214 ) -> usize {
 215     assert!(src.len() >= scan_len);
 216     let mut i = 0;
 217     let src_bytes = src.as_bytes();
 218
 219     while i < scan_len {
 220         let byte = unsafe {
 221             // We verified that i < scan_len <= src.len()
 222             *src_bytes.get_unchecked(i as usize)
 223         };
 224
 225         // How much to advance in order to get to the next UTF-8 char in the
 226         // string.
 227         let mut char_len = 1;
 228
 229         if byte < 32 {
 230             // This is an ASCII control character, it could be one of the cases
 231             // that are interesting to us.
 232
 233             let pos = BytePos::from_usize(i) + output_offset;
 234
 235             match byte {
 236                 b'\n' => {
 237                     lines.push(pos + BytePos(1));
 238                 }
 239                 b'\t' => {
 240                     non_narrow_chars.push(NonNarrowChar::Tab(pos));
 241                 }
 242                 _ => {
 243                     non_narrow_chars.push(NonNarrowChar::ZeroWidth(pos));
 244                 }
 245             }
 246         } else if byte >= 127 {
 247             // The slow path:
 248             // This is either ASCII control character "DEL" or the beginning of
 249             // a multibyte char. Just decode to `char`.
 250             let c = (&src[i..]).chars().next().unwrap();
 251             char_len = c.len_utf8();
 252
 253             let pos = BytePos::from_usize(i) + output_offset;
 254
 255             if char_len > 1 {
 256                 assert!((2..=4).contains(&char_len));
 257                 let mbc = MultiByteChar { pos, bytes: char_len as u8 };
 258                 multi_byte_chars.push(mbc);
 259             }
 260
 261             // Assume control characters are zero width.
 262             // FIXME: How can we decide between `width` and `width_cjk`?
 263             let char_width = UnicodeWidthChar::width(c).unwrap_or(0);
 264
 265             if char_width != 1 {
 266                 non_narrow_chars.push(NonNarrowChar::new(pos, char_width));
 267             }
 268         }
 269
 270         i += char_len;
 271     }
 272
 273     i - scan_len
 274 }