]> git.proxmox.com Git - rustc.git/blob - vendor/imara-diff/src/myers/preprocess.rs
New upstream version 1.70.0+dfsg2
[rustc.git] / vendor / imara-diff / src / myers / preprocess.rs
1 use crate::intern::Token;
2 use crate::myers::sqrt;
3 use crate::util::{strip_common_postfix, strip_common_prefix};
4
5 pub fn preprocess(
6 mut file1: &[Token],
7 mut file2: &[Token],
8 ) -> (PreprocessedFile, PreprocessedFile) {
9 let common_prefix = strip_common_prefix(&mut file1, &mut file2);
10 strip_common_postfix(&mut file1, &mut file2);
11 let (hdiff1, hdiff2) = token_occurrences(file1, file2);
12 let file1 = PreprocessedFile::new(common_prefix, &hdiff1, file1);
13 let file2 = PreprocessedFile::new(common_prefix, &hdiff2, file2);
14 (file1, file2)
15 }
16
17 /// computes how
18 fn token_occurrences(file1: &[Token], file2: &[Token]) -> (Vec<Occurances>, Vec<Occurances>) {
19 const MAX_EQLIMIT: u32 = 1024;
20
21 // compute the limit after which tokens are treated as `Occurances::COMMON`
22 let eqlimit1 = sqrt(file1.len()).min(MAX_EQLIMIT);
23 let eqlimit2 = sqrt(file2.len()).min(MAX_EQLIMIT);
24
25 // first collect how often each token occurs in a file
26 let mut occurances1 = Vec::new();
27 for token in file1 {
28 let bucket = token.0 as usize;
29 if bucket >= occurances1.len() {
30 occurances1.resize(bucket + 1, 0u32);
31 }
32 occurances1[bucket] += 1;
33 }
34
35 // do the same thing for
36 let mut occurances2 = Vec::new();
37 let token_occurances2: Vec<_> = file2
38 .iter()
39 .map(|token| {
40 let bucket = token.0 as usize;
41 if bucket >= occurances2.len() {
42 occurances2.resize(bucket + 1, 0);
43 }
44 occurances2[bucket] += 1;
45 let occurances1 = *occurances1.get(bucket).unwrap_or(&0);
46 Occurances::from_occurances(occurances1, eqlimit2)
47 })
48 .collect();
49
50 let token_occurances1: Vec<_> = file1
51 .iter()
52 .map(|token| {
53 let bucket = token.0 as usize;
54 let occurances2 = *occurances2.get(bucket).unwrap_or(&0);
55 Occurances::from_occurances(occurances2, eqlimit1)
56 })
57 .collect();
58
59 (token_occurances1, token_occurances2)
60 }
61
62 #[derive(Clone, Copy, Debug)]
63 enum Occurances {
64 /// Token does not occur in this file
65 None,
66 /// Token occurs at least once
67 Some,
68 /// Token occurs very frequently (exact number depends on file size).
69 /// Such a tokens are usually empty lines or braces and are often not meaningful to a diff
70 Common,
71 }
72
73 impl Occurances {
74 pub fn from_occurances(occurances: u32, eqlimit: u32) -> Occurances {
75 if occurances == 0 {
76 Occurances::None
77 } else if occurances >= eqlimit {
78 Occurances::Common
79 } else {
80 Occurances::Some
81 }
82 }
83 }
84
85 #[derive(Debug)]
86 pub struct PreprocessedFile {
87 pub offset: u32,
88 pub is_changed: Vec<bool>,
89 pub indices: Vec<u32>,
90 pub tokens: Vec<Token>,
91 }
92
93 impl PreprocessedFile {
94 fn new(offset: u32, token_diff: &[Occurances], tokens: &[Token]) -> PreprocessedFile {
95 let mut changed = vec![false; tokens.len()];
96 let (tokens, indices) = prune_unmatched_tokens(tokens, token_diff, &mut changed);
97 PreprocessedFile { offset, is_changed: changed, indices, tokens }
98 }
99 }
100
101 fn prune_unmatched_tokens(
102 file: &[Token],
103 token_status: &[Occurances],
104 changed: &mut [bool],
105 ) -> (Vec<Token>, Vec<u32>) {
106 assert_eq!(token_status.len(), file.len());
107 file.iter()
108 .zip(token_status)
109 .enumerate()
110 .filter_map(|(i, (&token, &status))| {
111 let prune = match status {
112 Occurances::None => true,
113 Occurances::Some => false,
114 Occurances::Common => should_prune_common_line(token_status, i),
115 };
116 if prune {
117 changed[i] = true;
118 None
119 } else {
120 Some((token, i as u32))
121 }
122 })
123 .unzip()
124 }
125
126 // TODO do not unnecessarily rescan lines
127 fn should_prune_common_line(token_status: &[Occurances], pos: usize) -> bool {
128 const WINDOW_SIZE: usize = 100;
129
130 let mut unmatched_before = 0;
131 let mut common_before = 0;
132
133 let start = if pos > WINDOW_SIZE { WINDOW_SIZE } else { 0 };
134 for status in token_status[start..pos].iter().rev() {
135 match status {
136 Occurances::None => {
137 unmatched_before += 1;
138 }
139 Occurances::Common => {
140 common_before += 1;
141 }
142 Occurances::Some => break,
143 }
144 }
145
146 if unmatched_before == 0 {
147 return false;
148 }
149
150 let end = token_status.len().min(pos + WINDOW_SIZE);
151 let mut unmatched_after = 0;
152 let mut common_after = 0;
153 for status in token_status[pos..end].iter() {
154 match status {
155 Occurances::None => {
156 unmatched_after += 1;
157 }
158 Occurances::Common => {
159 common_after += 1;
160 }
161 Occurances::Some => break,
162 }
163 }
164
165 if unmatched_after == 0 {
166 return false;
167 }
168
169 let common = common_before + common_after;
170 let unmatched = unmatched_before + unmatched_after;
171
172 unmatched > 3 * common
173 }