vendor/imara-diff/src/myers/preprocess.rs

   1 use crate::intern::Token;
   2 use crate::myers::sqrt;
   3 use crate::util::{strip_common_postfix, strip_common_prefix};
   4
   5 pub fn preprocess(
   6     mut file1: &[Token],
   7     mut file2: &[Token],
   8 ) -> (PreprocessedFile, PreprocessedFile) {
   9     let common_prefix = strip_common_prefix(&mut file1, &mut file2);
  10     strip_common_postfix(&mut file1, &mut file2);
  11     let (hdiff1, hdiff2) = token_occurrences(file1, file2);
  12     let file1 = PreprocessedFile::new(common_prefix, &hdiff1, file1);
  13     let file2 = PreprocessedFile::new(common_prefix, &hdiff2, file2);
  14     (file1, file2)
  15 }
  16
  17 /// computes how
  18 fn token_occurrences(file1: &[Token], file2: &[Token]) -> (Vec<Occurances>, Vec<Occurances>) {
  19     const MAX_EQLIMIT: u32 = 1024;
  20
  21     // compute the limit after which tokens are treated as `Occurances::COMMON`
  22     let eqlimit1 = sqrt(file1.len()).min(MAX_EQLIMIT);
  23     let eqlimit2 = sqrt(file2.len()).min(MAX_EQLIMIT);
  24
  25     // first collect how often each token occurs in a file
  26     let mut occurances1 = Vec::new();
  27     for token in file1 {
  28         let bucket = token.0 as usize;
  29         if bucket >= occurances1.len() {
  30             occurances1.resize(bucket + 1, 0u32);
  31         }
  32         occurances1[bucket] += 1;
  33     }
  34
  35     // do the same thing for
  36     let mut occurances2 = Vec::new();
  37     let token_occurances2: Vec<_> = file2
  38         .iter()
  39         .map(|token| {
  40             let bucket = token.0 as usize;
  41             if bucket >= occurances2.len() {
  42                 occurances2.resize(bucket + 1, 0);
  43             }
  44             occurances2[bucket] += 1;
  45             let occurances1 = *occurances1.get(bucket).unwrap_or(&0);
  46             Occurances::from_occurances(occurances1, eqlimit2)
  47         })
  48         .collect();
  49
  50     let token_occurances1: Vec<_> = file1
  51         .iter()
  52         .map(|token| {
  53             let bucket = token.0 as usize;
  54             let occurances2 = *occurances2.get(bucket).unwrap_or(&0);
  55             Occurances::from_occurances(occurances2, eqlimit1)
  56         })
  57         .collect();
  58
  59     (token_occurances1, token_occurances2)
  60 }
  61
  62 #[derive(Clone, Copy, Debug)]
  63 enum Occurances {
  64     /// Token does not occur in this file
  65     None,
  66     /// Token occurs at least once
  67     Some,
  68     /// Token occurs very frequently (exact number depends on file size).
  69     /// Such a tokens are usually empty lines or braces and are often not meaningful to a diff
  70     Common,
  71 }
  72
  73 impl Occurances {
  74     pub fn from_occurances(occurances: u32, eqlimit: u32) -> Occurances {
  75         if occurances == 0 {
  76             Occurances::None
  77         } else if occurances >= eqlimit {
  78             Occurances::Common
  79         } else {
  80             Occurances::Some
  81         }
  82     }
  83 }
  84
  85 #[derive(Debug)]
  86 pub struct PreprocessedFile {
  87     pub offset: u32,
  88     pub is_changed: Vec<bool>,
  89     pub indices: Vec<u32>,
  90     pub tokens: Vec<Token>,
  91 }
  92
  93 impl PreprocessedFile {
  94     fn new(offset: u32, token_diff: &[Occurances], tokens: &[Token]) -> PreprocessedFile {
  95         let mut changed = vec![false; tokens.len()];
  96         let (tokens, indices) = prune_unmatched_tokens(tokens, token_diff, &mut changed);
  97         PreprocessedFile { offset, is_changed: changed, indices, tokens }
  98     }
  99 }
 100
 101 fn prune_unmatched_tokens(
 102     file: &[Token],
 103     token_status: &[Occurances],
 104     changed: &mut [bool],
 105 ) -> (Vec<Token>, Vec<u32>) {
 106     assert_eq!(token_status.len(), file.len());
 107     file.iter()
 108         .zip(token_status)
 109         .enumerate()
 110         .filter_map(|(i, (&token, &status))| {
 111             let prune = match status {
 112                 Occurances::None => true,
 113                 Occurances::Some => false,
 114                 Occurances::Common => should_prune_common_line(token_status, i),
 115             };
 116             if prune {
 117                 changed[i] = true;
 118                 None
 119             } else {
 120                 Some((token, i as u32))
 121             }
 122         })
 123         .unzip()
 124 }
 125
 126 // TODO do not unnecessarily rescan lines
 127 fn should_prune_common_line(token_status: &[Occurances], pos: usize) -> bool {
 128     const WINDOW_SIZE: usize = 100;
 129
 130     let mut unmatched_before = 0;
 131     let mut common_before = 0;
 132
 133     let start = if pos > WINDOW_SIZE { WINDOW_SIZE } else { 0 };
 134     for status in token_status[start..pos].iter().rev() {
 135         match status {
 136             Occurances::None => {
 137                 unmatched_before += 1;
 138             }
 139             Occurances::Common => {
 140                 common_before += 1;
 141             }
 142             Occurances::Some => break,
 143         }
 144     }
 145
 146     if unmatched_before == 0 {
 147         return false;
 148     }
 149
 150     let end = token_status.len().min(pos + WINDOW_SIZE);
 151     let mut unmatched_after = 0;
 152     let mut common_after = 0;
 153     for status in token_status[pos..end].iter() {
 154         match status {
 155             Occurances::None => {
 156                 unmatched_after += 1;
 157             }
 158             Occurances::Common => {
 159                 common_after += 1;
 160             }
 161             Occurances::Some => break,
 162         }
 163     }
 164
 165     if unmatched_after == 0 {
 166         return false;
 167     }
 168
 169     let common = common_before + common_after;
 170     let unmatched = unmatched_before + unmatched_after;
 171
 172     unmatched > 3 * common
 173 }