]>
Commit | Line | Data |
---|---|---|
4710c53d | 1 | #! /usr/bin/env python\r |
2 | \r | |
3 | """\r | |
4 | Module difflib -- helpers for computing deltas between objects.\r | |
5 | \r | |
6 | Function get_close_matches(word, possibilities, n=3, cutoff=0.6):\r | |
7 | Use SequenceMatcher to return list of the best "good enough" matches.\r | |
8 | \r | |
9 | Function context_diff(a, b):\r | |
10 | For two lists of strings, return a delta in context diff format.\r | |
11 | \r | |
12 | Function ndiff(a, b):\r | |
13 | Return a delta: the difference between `a` and `b` (lists of strings).\r | |
14 | \r | |
15 | Function restore(delta, which):\r | |
16 | Return one of the two sequences that generated an ndiff delta.\r | |
17 | \r | |
18 | Function unified_diff(a, b):\r | |
19 | For two lists of strings, return a delta in unified diff format.\r | |
20 | \r | |
21 | Class SequenceMatcher:\r | |
22 | A flexible class for comparing pairs of sequences of any type.\r | |
23 | \r | |
24 | Class Differ:\r | |
25 | For producing human-readable deltas from sequences of lines of text.\r | |
26 | \r | |
27 | Class HtmlDiff:\r | |
28 | For producing HTML side by side comparison with change highlights.\r | |
29 | """\r | |
30 | \r | |
# Names exported by "from difflib import *"; everything else in this
# module is an implementation detail.
__all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher',
           'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff',
           'unified_diff', 'HtmlDiff', 'Match']
34 | \r | |
35 | import heapq\r | |
36 | from collections import namedtuple as _namedtuple\r | |
37 | from functools import reduce\r | |
38 | \r | |
39 | Match = _namedtuple('Match', 'a b size')\r | |
40 | \r | |
41 | def _calculate_ratio(matches, length):\r | |
42 | if length:\r | |
43 | return 2.0 * matches / length\r | |
44 | return 1.0\r | |
45 | \r | |
46 | class SequenceMatcher:\r | |
47 | \r | |
48 | """\r | |
49 | SequenceMatcher is a flexible class for comparing pairs of sequences of\r | |
50 | any type, so long as the sequence elements are hashable. The basic\r | |
51 | algorithm predates, and is a little fancier than, an algorithm\r | |
52 | published in the late 1980's by Ratcliff and Obershelp under the\r | |
53 | hyperbolic name "gestalt pattern matching". The basic idea is to find\r | |
54 | the longest contiguous matching subsequence that contains no "junk"\r | |
55 | elements (R-O doesn't address junk). The same idea is then applied\r | |
56 | recursively to the pieces of the sequences to the left and to the right\r | |
57 | of the matching subsequence. This does not yield minimal edit\r | |
58 | sequences, but does tend to yield matches that "look right" to people.\r | |
59 | \r | |
60 | SequenceMatcher tries to compute a "human-friendly diff" between two\r | |
61 | sequences. Unlike e.g. UNIX(tm) diff, the fundamental notion is the\r | |
62 | longest *contiguous* & junk-free matching subsequence. That's what\r | |
63 | catches peoples' eyes. The Windows(tm) windiff has another interesting\r | |
64 | notion, pairing up elements that appear uniquely in each sequence.\r | |
65 | That, and the method here, appear to yield more intuitive difference\r | |
66 | reports than does diff. This method appears to be the least vulnerable\r | |
67 | to synching up on blocks of "junk lines", though (like blank lines in\r | |
68 | ordinary text files, or maybe "<P>" lines in HTML files). That may be\r | |
69 | because this is the only method of the 3 that has a *concept* of\r | |
70 | "junk" <wink>.\r | |
71 | \r | |
72 | Example, comparing two strings, and considering blanks to be "junk":\r | |
73 | \r | |
74 | >>> s = SequenceMatcher(lambda x: x == " ",\r | |
75 | ... "private Thread currentThread;",\r | |
76 | ... "private volatile Thread currentThread;")\r | |
77 | >>>\r | |
78 | \r | |
79 | .ratio() returns a float in [0, 1], measuring the "similarity" of the\r | |
80 | sequences. As a rule of thumb, a .ratio() value over 0.6 means the\r | |
81 | sequences are close matches:\r | |
82 | \r | |
83 | >>> print round(s.ratio(), 3)\r | |
84 | 0.866\r | |
85 | >>>\r | |
86 | \r | |
87 | If you're only interested in where the sequences match,\r | |
88 | .get_matching_blocks() is handy:\r | |
89 | \r | |
90 | >>> for block in s.get_matching_blocks():\r | |
91 | ... print "a[%d] and b[%d] match for %d elements" % block\r | |
92 | a[0] and b[0] match for 8 elements\r | |
93 | a[8] and b[17] match for 21 elements\r | |
94 | a[29] and b[38] match for 0 elements\r | |
95 | \r | |
96 | Note that the last tuple returned by .get_matching_blocks() is always a\r | |
97 | dummy, (len(a), len(b), 0), and this is the only case in which the last\r | |
98 | tuple element (number of elements matched) is 0.\r | |
99 | \r | |
100 | If you want to know how to change the first sequence into the second,\r | |
101 | use .get_opcodes():\r | |
102 | \r | |
103 | >>> for opcode in s.get_opcodes():\r | |
104 | ... print "%6s a[%d:%d] b[%d:%d]" % opcode\r | |
105 | equal a[0:8] b[0:8]\r | |
106 | insert a[8:8] b[8:17]\r | |
107 | equal a[8:29] b[17:38]\r | |
108 | \r | |
109 | See the Differ class for a fancy human-friendly file differencer, which\r | |
110 | uses SequenceMatcher both to compare sequences of lines, and to compare\r | |
111 | sequences of characters within similar (near-matching) lines.\r | |
112 | \r | |
113 | See also function get_close_matches() in this module, which shows how\r | |
114 | simple code building on SequenceMatcher can be used to do useful work.\r | |
115 | \r | |
116 | Timing: Basic R-O is cubic time worst case and quadratic time expected\r | |
117 | case. SequenceMatcher is quadratic time for the worst case and has\r | |
118 | expected-case behavior dependent in a complicated way on how many\r | |
119 | elements the sequences have in common; best case time is linear.\r | |
120 | \r | |
121 | Methods:\r | |
122 | \r | |
123 | __init__(isjunk=None, a='', b='')\r | |
124 | Construct a SequenceMatcher.\r | |
125 | \r | |
126 | set_seqs(a, b)\r | |
127 | Set the two sequences to be compared.\r | |
128 | \r | |
129 | set_seq1(a)\r | |
130 | Set the first sequence to be compared.\r | |
131 | \r | |
132 | set_seq2(b)\r | |
133 | Set the second sequence to be compared.\r | |
134 | \r | |
135 | find_longest_match(alo, ahi, blo, bhi)\r | |
136 | Find longest matching block in a[alo:ahi] and b[blo:bhi].\r | |
137 | \r | |
138 | get_matching_blocks()\r | |
139 | Return list of triples describing matching subsequences.\r | |
140 | \r | |
141 | get_opcodes()\r | |
142 | Return list of 5-tuples describing how to turn a into b.\r | |
143 | \r | |
144 | ratio()\r | |
145 | Return a measure of the sequences' similarity (float in [0,1]).\r | |
146 | \r | |
147 | quick_ratio()\r | |
148 | Return an upper bound on .ratio() relatively quickly.\r | |
149 | \r | |
150 | real_quick_ratio()\r | |
151 | Return an upper bound on ratio() very quickly.\r | |
152 | """\r | |
153 | \r | |
154 | def __init__(self, isjunk=None, a='', b='', autojunk=True):\r | |
155 | """Construct a SequenceMatcher.\r | |
156 | \r | |
157 | Optional arg isjunk is None (the default), or a one-argument\r | |
158 | function that takes a sequence element and returns true iff the\r | |
159 | element is junk. None is equivalent to passing "lambda x: 0", i.e.\r | |
160 | no elements are considered to be junk. For example, pass\r | |
161 | lambda x: x in " \\t"\r | |
162 | if you're comparing lines as sequences of characters, and don't\r | |
163 | want to synch up on blanks or hard tabs.\r | |
164 | \r | |
165 | Optional arg a is the first of two sequences to be compared. By\r | |
166 | default, an empty string. The elements of a must be hashable. See\r | |
167 | also .set_seqs() and .set_seq1().\r | |
168 | \r | |
169 | Optional arg b is the second of two sequences to be compared. By\r | |
170 | default, an empty string. The elements of b must be hashable. See\r | |
171 | also .set_seqs() and .set_seq2().\r | |
172 | \r | |
173 | Optional arg autojunk should be set to False to disable the\r | |
174 | "automatic junk heuristic" that treats popular elements as junk\r | |
175 | (see module documentation for more information).\r | |
176 | """\r | |
177 | \r | |
178 | # Members:\r | |
179 | # a\r | |
180 | # first sequence\r | |
181 | # b\r | |
182 | # second sequence; differences are computed as "what do\r | |
183 | # we need to do to 'a' to change it into 'b'?"\r | |
184 | # b2j\r | |
185 | # for x in b, b2j[x] is a list of the indices (into b)\r | |
186 | # at which x appears; junk elements do not appear\r | |
187 | # fullbcount\r | |
188 | # for x in b, fullbcount[x] == the number of times x\r | |
189 | # appears in b; only materialized if really needed (used\r | |
190 | # only for computing quick_ratio())\r | |
191 | # matching_blocks\r | |
192 | # a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];\r | |
193 | # ascending & non-overlapping in i and in j; terminated by\r | |
194 | # a dummy (len(a), len(b), 0) sentinel\r | |
195 | # opcodes\r | |
196 | # a list of (tag, i1, i2, j1, j2) tuples, where tag is\r | |
197 | # one of\r | |
198 | # 'replace' a[i1:i2] should be replaced by b[j1:j2]\r | |
199 | # 'delete' a[i1:i2] should be deleted\r | |
200 | # 'insert' b[j1:j2] should be inserted\r | |
201 | # 'equal' a[i1:i2] == b[j1:j2]\r | |
202 | # isjunk\r | |
203 | # a user-supplied function taking a sequence element and\r | |
204 | # returning true iff the element is "junk" -- this has\r | |
205 | # subtle but helpful effects on the algorithm, which I'll\r | |
206 | # get around to writing up someday <0.9 wink>.\r | |
207 | # DON'T USE! Only __chain_b uses this. Use isbjunk.\r | |
208 | # isbjunk\r | |
209 | # for x in b, isbjunk(x) == isjunk(x) but much faster;\r | |
210 | # it's really the __contains__ method of a hidden dict.\r | |
211 | # DOES NOT WORK for x in a!\r | |
212 | # isbpopular\r | |
213 | # for x in b, isbpopular(x) is true iff b is reasonably long\r | |
214 | # (at least 200 elements) and x accounts for more than 1 + 1% of\r | |
215 | # its elements (when autojunk is enabled).\r | |
216 | # DOES NOT WORK for x in a!\r | |
217 | \r | |
218 | self.isjunk = isjunk\r | |
219 | self.a = self.b = None\r | |
220 | self.autojunk = autojunk\r | |
221 | self.set_seqs(a, b)\r | |
222 | \r | |
223 | def set_seqs(self, a, b):\r | |
224 | """Set the two sequences to be compared.\r | |
225 | \r | |
226 | >>> s = SequenceMatcher()\r | |
227 | >>> s.set_seqs("abcd", "bcde")\r | |
228 | >>> s.ratio()\r | |
229 | 0.75\r | |
230 | """\r | |
231 | \r | |
232 | self.set_seq1(a)\r | |
233 | self.set_seq2(b)\r | |
234 | \r | |
235 | def set_seq1(self, a):\r | |
236 | """Set the first sequence to be compared.\r | |
237 | \r | |
238 | The second sequence to be compared is not changed.\r | |
239 | \r | |
240 | >>> s = SequenceMatcher(None, "abcd", "bcde")\r | |
241 | >>> s.ratio()\r | |
242 | 0.75\r | |
243 | >>> s.set_seq1("bcde")\r | |
244 | >>> s.ratio()\r | |
245 | 1.0\r | |
246 | >>>\r | |
247 | \r | |
248 | SequenceMatcher computes and caches detailed information about the\r | |
249 | second sequence, so if you want to compare one sequence S against\r | |
250 | many sequences, use .set_seq2(S) once and call .set_seq1(x)\r | |
251 | repeatedly for each of the other sequences.\r | |
252 | \r | |
253 | See also set_seqs() and set_seq2().\r | |
254 | """\r | |
255 | \r | |
256 | if a is self.a:\r | |
257 | return\r | |
258 | self.a = a\r | |
259 | self.matching_blocks = self.opcodes = None\r | |
260 | \r | |
261 | def set_seq2(self, b):\r | |
262 | """Set the second sequence to be compared.\r | |
263 | \r | |
264 | The first sequence to be compared is not changed.\r | |
265 | \r | |
266 | >>> s = SequenceMatcher(None, "abcd", "bcde")\r | |
267 | >>> s.ratio()\r | |
268 | 0.75\r | |
269 | >>> s.set_seq2("abcd")\r | |
270 | >>> s.ratio()\r | |
271 | 1.0\r | |
272 | >>>\r | |
273 | \r | |
274 | SequenceMatcher computes and caches detailed information about the\r | |
275 | second sequence, so if you want to compare one sequence S against\r | |
276 | many sequences, use .set_seq2(S) once and call .set_seq1(x)\r | |
277 | repeatedly for each of the other sequences.\r | |
278 | \r | |
279 | See also set_seqs() and set_seq1().\r | |
280 | """\r | |
281 | \r | |
282 | if b is self.b:\r | |
283 | return\r | |
284 | self.b = b\r | |
285 | self.matching_blocks = self.opcodes = None\r | |
286 | self.fullbcount = None\r | |
287 | self.__chain_b()\r | |
288 | \r | |
289 | # For each element x in b, set b2j[x] to a list of the indices in\r | |
290 | # b where x appears; the indices are in increasing order; note that\r | |
291 | # the number of times x appears in b is len(b2j[x]) ...\r | |
292 | # when self.isjunk is defined, junk elements don't show up in this\r | |
293 | # map at all, which stops the central find_longest_match method\r | |
294 | # from starting any matching block at a junk element ...\r | |
295 | # also creates the fast isbjunk function ...\r | |
296 | # b2j also does not contain entries for "popular" elements, meaning\r | |
297 | # elements that account for more than 1 + 1% of the total elements, and\r | |
298 | # when the sequence is reasonably large (>= 200 elements); this can\r | |
299 | # be viewed as an adaptive notion of semi-junk, and yields an enormous\r | |
300 | # speedup when, e.g., comparing program files with hundreds of\r | |
301 | # instances of "return NULL;" ...\r | |
302 | # note that this is only called when b changes; so for cross-product\r | |
303 | # kinds of matches, it's best to call set_seq2 once, then set_seq1\r | |
304 | # repeatedly\r | |
305 | \r | |
306 | def __chain_b(self):\r | |
307 | # Because isjunk is a user-defined (not C) function, and we test\r | |
308 | # for junk a LOT, it's important to minimize the number of calls.\r | |
309 | # Before the tricks described here, __chain_b was by far the most\r | |
310 | # time-consuming routine in the whole module! If anyone sees\r | |
311 | # Jim Roskind, thank him again for profile.py -- I never would\r | |
312 | # have guessed that.\r | |
313 | # The first trick is to build b2j ignoring the possibility\r | |
314 | # of junk. I.e., we don't call isjunk at all yet. Throwing\r | |
315 | # out the junk later is much cheaper than building b2j "right"\r | |
316 | # from the start.\r | |
317 | b = self.b\r | |
318 | self.b2j = b2j = {}\r | |
319 | \r | |
320 | for i, elt in enumerate(b):\r | |
321 | indices = b2j.setdefault(elt, [])\r | |
322 | indices.append(i)\r | |
323 | \r | |
324 | # Purge junk elements\r | |
325 | junk = set()\r | |
326 | isjunk = self.isjunk\r | |
327 | if isjunk:\r | |
328 | for elt in list(b2j.keys()): # using list() since b2j is modified\r | |
329 | if isjunk(elt):\r | |
330 | junk.add(elt)\r | |
331 | del b2j[elt]\r | |
332 | \r | |
333 | # Purge popular elements that are not junk\r | |
334 | popular = set()\r | |
335 | n = len(b)\r | |
336 | if self.autojunk and n >= 200:\r | |
337 | ntest = n // 100 + 1\r | |
338 | for elt, idxs in list(b2j.items()):\r | |
339 | if len(idxs) > ntest:\r | |
340 | popular.add(elt)\r | |
341 | del b2j[elt]\r | |
342 | \r | |
343 | # Now for x in b, isjunk(x) == x in junk, but the latter is much faster.\r | |
344 | # Sicne the number of *unique* junk elements is probably small, the\r | |
345 | # memory burden of keeping this set alive is likely trivial compared to\r | |
346 | # the size of b2j.\r | |
347 | self.isbjunk = junk.__contains__\r | |
348 | self.isbpopular = popular.__contains__\r | |
349 | \r | |
350 | def find_longest_match(self, alo, ahi, blo, bhi):\r | |
351 | """Find longest matching block in a[alo:ahi] and b[blo:bhi].\r | |
352 | \r | |
353 | If isjunk is not defined:\r | |
354 | \r | |
355 | Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where\r | |
356 | alo <= i <= i+k <= ahi\r | |
357 | blo <= j <= j+k <= bhi\r | |
358 | and for all (i',j',k') meeting those conditions,\r | |
359 | k >= k'\r | |
360 | i <= i'\r | |
361 | and if i == i', j <= j'\r | |
362 | \r | |
363 | In other words, of all maximal matching blocks, return one that\r | |
364 | starts earliest in a, and of all those maximal matching blocks that\r | |
365 | start earliest in a, return the one that starts earliest in b.\r | |
366 | \r | |
367 | >>> s = SequenceMatcher(None, " abcd", "abcd abcd")\r | |
368 | >>> s.find_longest_match(0, 5, 0, 9)\r | |
369 | Match(a=0, b=4, size=5)\r | |
370 | \r | |
371 | If isjunk is defined, first the longest matching block is\r | |
372 | determined as above, but with the additional restriction that no\r | |
373 | junk element appears in the block. Then that block is extended as\r | |
374 | far as possible by matching (only) junk elements on both sides. So\r | |
375 | the resulting block never matches on junk except as identical junk\r | |
376 | happens to be adjacent to an "interesting" match.\r | |
377 | \r | |
378 | Here's the same example as before, but considering blanks to be\r | |
379 | junk. That prevents " abcd" from matching the " abcd" at the tail\r | |
380 | end of the second sequence directly. Instead only the "abcd" can\r | |
381 | match, and matches the leftmost "abcd" in the second sequence:\r | |
382 | \r | |
383 | >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")\r | |
384 | >>> s.find_longest_match(0, 5, 0, 9)\r | |
385 | Match(a=1, b=0, size=4)\r | |
386 | \r | |
387 | If no blocks match, return (alo, blo, 0).\r | |
388 | \r | |
389 | >>> s = SequenceMatcher(None, "ab", "c")\r | |
390 | >>> s.find_longest_match(0, 2, 0, 1)\r | |
391 | Match(a=0, b=0, size=0)\r | |
392 | """\r | |
393 | \r | |
394 | # CAUTION: stripping common prefix or suffix would be incorrect.\r | |
395 | # E.g.,\r | |
396 | # ab\r | |
397 | # acab\r | |
398 | # Longest matching block is "ab", but if common prefix is\r | |
399 | # stripped, it's "a" (tied with "b"). UNIX(tm) diff does so\r | |
400 | # strip, so ends up claiming that ab is changed to acab by\r | |
401 | # inserting "ca" in the middle. That's minimal but unintuitive:\r | |
402 | # "it's obvious" that someone inserted "ac" at the front.\r | |
403 | # Windiff ends up at the same place as diff, but by pairing up\r | |
404 | # the unique 'b's and then matching the first two 'a's.\r | |
405 | \r | |
406 | a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk\r | |
407 | besti, bestj, bestsize = alo, blo, 0\r | |
408 | # find longest junk-free match\r | |
409 | # during an iteration of the loop, j2len[j] = length of longest\r | |
410 | # junk-free match ending with a[i-1] and b[j]\r | |
411 | j2len = {}\r | |
412 | nothing = []\r | |
413 | for i in xrange(alo, ahi):\r | |
414 | # look at all instances of a[i] in b; note that because\r | |
415 | # b2j has no junk keys, the loop is skipped if a[i] is junk\r | |
416 | j2lenget = j2len.get\r | |
417 | newj2len = {}\r | |
418 | for j in b2j.get(a[i], nothing):\r | |
419 | # a[i] matches b[j]\r | |
420 | if j < blo:\r | |
421 | continue\r | |
422 | if j >= bhi:\r | |
423 | break\r | |
424 | k = newj2len[j] = j2lenget(j-1, 0) + 1\r | |
425 | if k > bestsize:\r | |
426 | besti, bestj, bestsize = i-k+1, j-k+1, k\r | |
427 | j2len = newj2len\r | |
428 | \r | |
429 | # Extend the best by non-junk elements on each end. In particular,\r | |
430 | # "popular" non-junk elements aren't in b2j, which greatly speeds\r | |
431 | # the inner loop above, but also means "the best" match so far\r | |
432 | # doesn't contain any junk *or* popular non-junk elements.\r | |
433 | while besti > alo and bestj > blo and \\r | |
434 | not isbjunk(b[bestj-1]) and \\r | |
435 | a[besti-1] == b[bestj-1]:\r | |
436 | besti, bestj, bestsize = besti-1, bestj-1, bestsize+1\r | |
437 | while besti+bestsize < ahi and bestj+bestsize < bhi and \\r | |
438 | not isbjunk(b[bestj+bestsize]) and \\r | |
439 | a[besti+bestsize] == b[bestj+bestsize]:\r | |
440 | bestsize += 1\r | |
441 | \r | |
442 | # Now that we have a wholly interesting match (albeit possibly\r | |
443 | # empty!), we may as well suck up the matching junk on each\r | |
444 | # side of it too. Can't think of a good reason not to, and it\r | |
445 | # saves post-processing the (possibly considerable) expense of\r | |
446 | # figuring out what to do with it. In the case of an empty\r | |
447 | # interesting match, this is clearly the right thing to do,\r | |
448 | # because no other kind of match is possible in the regions.\r | |
449 | while besti > alo and bestj > blo and \\r | |
450 | isbjunk(b[bestj-1]) and \\r | |
451 | a[besti-1] == b[bestj-1]:\r | |
452 | besti, bestj, bestsize = besti-1, bestj-1, bestsize+1\r | |
453 | while besti+bestsize < ahi and bestj+bestsize < bhi and \\r | |
454 | isbjunk(b[bestj+bestsize]) and \\r | |
455 | a[besti+bestsize] == b[bestj+bestsize]:\r | |
456 | bestsize = bestsize + 1\r | |
457 | \r | |
458 | return Match(besti, bestj, bestsize)\r | |
459 | \r | |
460 | def get_matching_blocks(self):\r | |
461 | """Return list of triples describing matching subsequences.\r | |
462 | \r | |
463 | Each triple is of the form (i, j, n), and means that\r | |
464 | a[i:i+n] == b[j:j+n]. The triples are monotonically increasing in\r | |
465 | i and in j. New in Python 2.5, it's also guaranteed that if\r | |
466 | (i, j, n) and (i', j', n') are adjacent triples in the list, and\r | |
467 | the second is not the last triple in the list, then i+n != i' or\r | |
468 | j+n != j'. IOW, adjacent triples never describe adjacent equal\r | |
469 | blocks.\r | |
470 | \r | |
471 | The last triple is a dummy, (len(a), len(b), 0), and is the only\r | |
472 | triple with n==0.\r | |
473 | \r | |
474 | >>> s = SequenceMatcher(None, "abxcd", "abcd")\r | |
475 | >>> s.get_matching_blocks()\r | |
476 | [Match(a=0, b=0, size=2), Match(a=3, b=2, size=2), Match(a=5, b=4, size=0)]\r | |
477 | """\r | |
478 | \r | |
479 | if self.matching_blocks is not None:\r | |
480 | return self.matching_blocks\r | |
481 | la, lb = len(self.a), len(self.b)\r | |
482 | \r | |
483 | # This is most naturally expressed as a recursive algorithm, but\r | |
484 | # at least one user bumped into extreme use cases that exceeded\r | |
485 | # the recursion limit on their box. So, now we maintain a list\r | |
486 | # ('queue`) of blocks we still need to look at, and append partial\r | |
487 | # results to `matching_blocks` in a loop; the matches are sorted\r | |
488 | # at the end.\r | |
489 | queue = [(0, la, 0, lb)]\r | |
490 | matching_blocks = []\r | |
491 | while queue:\r | |
492 | alo, ahi, blo, bhi = queue.pop()\r | |
493 | i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)\r | |
494 | # a[alo:i] vs b[blo:j] unknown\r | |
495 | # a[i:i+k] same as b[j:j+k]\r | |
496 | # a[i+k:ahi] vs b[j+k:bhi] unknown\r | |
497 | if k: # if k is 0, there was no matching block\r | |
498 | matching_blocks.append(x)\r | |
499 | if alo < i and blo < j:\r | |
500 | queue.append((alo, i, blo, j))\r | |
501 | if i+k < ahi and j+k < bhi:\r | |
502 | queue.append((i+k, ahi, j+k, bhi))\r | |
503 | matching_blocks.sort()\r | |
504 | \r | |
505 | # It's possible that we have adjacent equal blocks in the\r | |
506 | # matching_blocks list now. Starting with 2.5, this code was added\r | |
507 | # to collapse them.\r | |
508 | i1 = j1 = k1 = 0\r | |
509 | non_adjacent = []\r | |
510 | for i2, j2, k2 in matching_blocks:\r | |
511 | # Is this block adjacent to i1, j1, k1?\r | |
512 | if i1 + k1 == i2 and j1 + k1 == j2:\r | |
513 | # Yes, so collapse them -- this just increases the length of\r | |
514 | # the first block by the length of the second, and the first\r | |
515 | # block so lengthened remains the block to compare against.\r | |
516 | k1 += k2\r | |
517 | else:\r | |
518 | # Not adjacent. Remember the first block (k1==0 means it's\r | |
519 | # the dummy we started with), and make the second block the\r | |
520 | # new block to compare against.\r | |
521 | if k1:\r | |
522 | non_adjacent.append((i1, j1, k1))\r | |
523 | i1, j1, k1 = i2, j2, k2\r | |
524 | if k1:\r | |
525 | non_adjacent.append((i1, j1, k1))\r | |
526 | \r | |
527 | non_adjacent.append( (la, lb, 0) )\r | |
528 | self.matching_blocks = non_adjacent\r | |
529 | return map(Match._make, self.matching_blocks)\r | |
530 | \r | |
531 | def get_opcodes(self):\r | |
532 | """Return list of 5-tuples describing how to turn a into b.\r | |
533 | \r | |
534 | Each tuple is of the form (tag, i1, i2, j1, j2). The first tuple\r | |
535 | has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the\r | |
536 | tuple preceding it, and likewise for j1 == the previous j2.\r | |
537 | \r | |
538 | The tags are strings, with these meanings:\r | |
539 | \r | |
540 | 'replace': a[i1:i2] should be replaced by b[j1:j2]\r | |
541 | 'delete': a[i1:i2] should be deleted.\r | |
542 | Note that j1==j2 in this case.\r | |
543 | 'insert': b[j1:j2] should be inserted at a[i1:i1].\r | |
544 | Note that i1==i2 in this case.\r | |
545 | 'equal': a[i1:i2] == b[j1:j2]\r | |
546 | \r | |
547 | >>> a = "qabxcd"\r | |
548 | >>> b = "abycdf"\r | |
549 | >>> s = SequenceMatcher(None, a, b)\r | |
550 | >>> for tag, i1, i2, j1, j2 in s.get_opcodes():\r | |
551 | ... print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %\r | |
552 | ... (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))\r | |
553 | delete a[0:1] (q) b[0:0] ()\r | |
554 | equal a[1:3] (ab) b[0:2] (ab)\r | |
555 | replace a[3:4] (x) b[2:3] (y)\r | |
556 | equal a[4:6] (cd) b[3:5] (cd)\r | |
557 | insert a[6:6] () b[5:6] (f)\r | |
558 | """\r | |
559 | \r | |
560 | if self.opcodes is not None:\r | |
561 | return self.opcodes\r | |
562 | i = j = 0\r | |
563 | self.opcodes = answer = []\r | |
564 | for ai, bj, size in self.get_matching_blocks():\r | |
565 | # invariant: we've pumped out correct diffs to change\r | |
566 | # a[:i] into b[:j], and the next matching block is\r | |
567 | # a[ai:ai+size] == b[bj:bj+size]. So we need to pump\r | |
568 | # out a diff to change a[i:ai] into b[j:bj], pump out\r | |
569 | # the matching block, and move (i,j) beyond the match\r | |
570 | tag = ''\r | |
571 | if i < ai and j < bj:\r | |
572 | tag = 'replace'\r | |
573 | elif i < ai:\r | |
574 | tag = 'delete'\r | |
575 | elif j < bj:\r | |
576 | tag = 'insert'\r | |
577 | if tag:\r | |
578 | answer.append( (tag, i, ai, j, bj) )\r | |
579 | i, j = ai+size, bj+size\r | |
580 | # the list of matching blocks is terminated by a\r | |
581 | # sentinel with size 0\r | |
582 | if size:\r | |
583 | answer.append( ('equal', ai, i, bj, j) )\r | |
584 | return answer\r | |
585 | \r | |
586 | def get_grouped_opcodes(self, n=3):\r | |
587 | """ Isolate change clusters by eliminating ranges with no changes.\r | |
588 | \r | |
589 | Return a generator of groups with upto n lines of context.\r | |
590 | Each group is in the same format as returned by get_opcodes().\r | |
591 | \r | |
592 | >>> from pprint import pprint\r | |
593 | >>> a = map(str, range(1,40))\r | |
594 | >>> b = a[:]\r | |
595 | >>> b[8:8] = ['i'] # Make an insertion\r | |
596 | >>> b[20] += 'x' # Make a replacement\r | |
597 | >>> b[23:28] = [] # Make a deletion\r | |
598 | >>> b[30] += 'y' # Make another replacement\r | |
599 | >>> pprint(list(SequenceMatcher(None,a,b).get_grouped_opcodes()))\r | |
600 | [[('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)],\r | |
601 | [('equal', 16, 19, 17, 20),\r | |
602 | ('replace', 19, 20, 20, 21),\r | |
603 | ('equal', 20, 22, 21, 23),\r | |
604 | ('delete', 22, 27, 23, 23),\r | |
605 | ('equal', 27, 30, 23, 26)],\r | |
606 | [('equal', 31, 34, 27, 30),\r | |
607 | ('replace', 34, 35, 30, 31),\r | |
608 | ('equal', 35, 38, 31, 34)]]\r | |
609 | """\r | |
610 | \r | |
611 | codes = self.get_opcodes()\r | |
612 | if not codes:\r | |
613 | codes = [("equal", 0, 1, 0, 1)]\r | |
614 | # Fixup leading and trailing groups if they show no changes.\r | |
615 | if codes[0][0] == 'equal':\r | |
616 | tag, i1, i2, j1, j2 = codes[0]\r | |
617 | codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2\r | |
618 | if codes[-1][0] == 'equal':\r | |
619 | tag, i1, i2, j1, j2 = codes[-1]\r | |
620 | codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n)\r | |
621 | \r | |
622 | nn = n + n\r | |
623 | group = []\r | |
624 | for tag, i1, i2, j1, j2 in codes:\r | |
625 | # End the current group and start a new one whenever\r | |
626 | # there is a large range with no changes.\r | |
627 | if tag == 'equal' and i2-i1 > nn:\r | |
628 | group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n)))\r | |
629 | yield group\r | |
630 | group = []\r | |
631 | i1, j1 = max(i1, i2-n), max(j1, j2-n)\r | |
632 | group.append((tag, i1, i2, j1 ,j2))\r | |
633 | if group and not (len(group)==1 and group[0][0] == 'equal'):\r | |
634 | yield group\r | |
635 | \r | |
636 | def ratio(self):\r | |
637 | """Return a measure of the sequences' similarity (float in [0,1]).\r | |
638 | \r | |
639 | Where T is the total number of elements in both sequences, and\r | |
640 | M is the number of matches, this is 2.0*M / T.\r | |
641 | Note that this is 1 if the sequences are identical, and 0 if\r | |
642 | they have nothing in common.\r | |
643 | \r | |
644 | .ratio() is expensive to compute if you haven't already computed\r | |
645 | .get_matching_blocks() or .get_opcodes(), in which case you may\r | |
646 | want to try .quick_ratio() or .real_quick_ratio() first to get an\r | |
647 | upper bound.\r | |
648 | \r | |
649 | >>> s = SequenceMatcher(None, "abcd", "bcde")\r | |
650 | >>> s.ratio()\r | |
651 | 0.75\r | |
652 | >>> s.quick_ratio()\r | |
653 | 0.75\r | |
654 | >>> s.real_quick_ratio()\r | |
655 | 1.0\r | |
656 | """\r | |
657 | \r | |
658 | matches = reduce(lambda sum, triple: sum + triple[-1],\r | |
659 | self.get_matching_blocks(), 0)\r | |
660 | return _calculate_ratio(matches, len(self.a) + len(self.b))\r | |
661 | \r | |
662 | def quick_ratio(self):\r | |
663 | """Return an upper bound on ratio() relatively quickly.\r | |
664 | \r | |
665 | This isn't defined beyond that it is an upper bound on .ratio(), and\r | |
666 | is faster to compute.\r | |
667 | """\r | |
668 | \r | |
669 | # viewing a and b as multisets, set matches to the cardinality\r | |
670 | # of their intersection; this counts the number of matches\r | |
671 | # without regard to order, so is clearly an upper bound\r | |
672 | if self.fullbcount is None:\r | |
673 | self.fullbcount = fullbcount = {}\r | |
674 | for elt in self.b:\r | |
675 | fullbcount[elt] = fullbcount.get(elt, 0) + 1\r | |
676 | fullbcount = self.fullbcount\r | |
677 | # avail[x] is the number of times x appears in 'b' less the\r | |
678 | # number of times we've seen it in 'a' so far ... kinda\r | |
679 | avail = {}\r | |
680 | availhas, matches = avail.__contains__, 0\r | |
681 | for elt in self.a:\r | |
682 | if availhas(elt):\r | |
683 | numb = avail[elt]\r | |
684 | else:\r | |
685 | numb = fullbcount.get(elt, 0)\r | |
686 | avail[elt] = numb - 1\r | |
687 | if numb > 0:\r | |
688 | matches = matches + 1\r | |
689 | return _calculate_ratio(matches, len(self.a) + len(self.b))\r | |
690 | \r | |
691 | def real_quick_ratio(self):\r | |
692 | """Return an upper bound on ratio() very quickly.\r | |
693 | \r | |
694 | This isn't defined beyond that it is an upper bound on .ratio(), and\r | |
695 | is faster to compute than either .ratio() or .quick_ratio().\r | |
696 | """\r | |
697 | \r | |
698 | la, lb = len(self.a), len(self.b)\r | |
699 | # can't have more matches than the number of elements in the\r | |
700 | # shorter sequence\r | |
701 | return _calculate_ratio(min(la, lb), la + lb)\r | |
702 | \r | |
def get_close_matches(word, possibilities, n=3, cutoff=0.6):
    """Use SequenceMatcher to return list of the best "good enough" matches.

    word is a sequence for which close matches are desired (typically a
    string).

    possibilities is a list of sequences against which to match word
    (typically a list of strings).

    Optional arg n (default 3) is the maximum number of close matches to
    return.  n must be > 0.

    Optional arg cutoff (default 0.6) is a float in [0, 1].  Possibilities
    that don't score at least that similar to word are ignored.

    The best (no more than n) matches among the possibilities are returned
    in a list, sorted by similarity score, most similar first.

    >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
    ['apple', 'ape']
    >>> import keyword as _keyword
    >>> get_close_matches("wheel", _keyword.kwlist)
    ['while']
    >>> get_close_matches("apple", _keyword.kwlist)
    []
    >>> get_close_matches("accept", _keyword.kwlist)
    ['except']
    """

    if not n >  0:
        raise ValueError("n must be > 0: %r" % (n,))
    if not 0.0 <= cutoff <= 1.0:
        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
    matcher = SequenceMatcher()
    matcher.set_seq2(word)
    scored = []
    for candidate in possibilities:
        matcher.set_seq1(candidate)
        # Cheap upper bounds first: only fall through to the expensive
        # ratio() when the candidate still has a chance of passing cutoff.
        if (matcher.real_quick_ratio() >= cutoff
                and matcher.quick_ratio() >= cutoff
                and matcher.ratio() >= cutoff):
            scored.append((matcher.ratio(), candidate))

    # Keep only the n highest scorers, best first, then discard the scores.
    return [candidate for score, candidate in heapq.nlargest(n, scored)]
750 | \r | |
751 | def _count_leading(line, ch):\r | |
752 | """\r | |
753 | Return number of `ch` characters at the start of `line`.\r | |
754 | \r | |
755 | Example:\r | |
756 | \r | |
757 | >>> _count_leading(' abc', ' ')\r | |
758 | 3\r | |
759 | """\r | |
760 | \r | |
761 | i, n = 0, len(line)\r | |
762 | while i < n and line[i] == ch:\r | |
763 | i += 1\r | |
764 | return i\r | |
765 | \r | |
class Differ:
    r"""
    Differ is a class for comparing sequences of lines of text, and
    producing human-readable differences or deltas.  Differ uses
    SequenceMatcher both to compare sequences of lines, and to compare
    sequences of characters within similar (near-matching) lines.

    Each line of a Differ delta begins with a two-letter code:

        '- '    line unique to sequence 1
        '+ '    line unique to sequence 2
        '  '    line common to both sequences
        '? '    line not present in either input sequence

    Lines beginning with '? ' attempt to guide the eye to intraline
    differences, and were not present in either input sequence.  These lines
    can be confusing if the sequences contain tab characters.

    Note that Differ makes no claim to produce a *minimal* diff.  To the
    contrary, minimal diffs are often counter-intuitive, because they synch
    up anywhere possible, sometimes accidental matches 100 pages apart.
    Restricting synch points to contiguous matches preserves some notion of
    locality, at the occasional cost of producing a longer diff.

    Example: Comparing two texts.

    First we set up the texts, sequences of individual single-line strings
    ending with newlines (such sequences can also be obtained from the
    `readlines()` method of file-like objects):

    >>> text1 = '''  1. Beautiful is better than ugly.
    ...   2. Explicit is better than implicit.
    ...   3. Simple is better than complex.
    ...   4. Complex is better than complicated.
    ... '''.splitlines(1)
    >>> len(text1)
    4
    >>> text1[0][-1]
    '\n'
    >>> text2 = '''  1. Beautiful is better than ugly.
    ...   3.   Simple is better than complex.
    ...   4. Complicated is better than complex.
    ...   5. Flat is better than nested.
    ... '''.splitlines(1)

    Next we instantiate a Differ object:

    >>> d = Differ()

    Note that when instantiating a Differ object we may pass functions to
    filter out line and character 'junk'.  See Differ.__init__ for details.

    Finally, we compare the two:

    >>> result = list(d.compare(text1, text2))

    'result' is a list of strings, so let's pretty-print it:

    >>> from pprint import pprint as _pprint
    >>> _pprint(result)
    ['    1. Beautiful is better than ugly.\n',
     '-   2. Explicit is better than implicit.\n',
     '-   3. Simple is better than complex.\n',
     '+   3.   Simple is better than complex.\n',
     '?     ++\n',
     '-   4. Complex is better than complicated.\n',
     '?            ^                     ---- ^\n',
     '+   4. Complicated is better than complex.\n',
     '?           ++++ ^                      ^\n',
     '+   5. Flat is better than nested.\n']

    As a single multi-line string it looks like this:

    >>> print ''.join(result),
        1. Beautiful is better than ugly.
    -   2. Explicit is better than implicit.
    -   3. Simple is better than complex.
    +   3.   Simple is better than complex.
    ?     ++
    -   4. Complex is better than complicated.
    ?            ^                     ---- ^
    +   4. Complicated is better than complex.
    ?           ++++ ^                      ^
    +   5. Flat is better than nested.

    Methods:

    __init__(linejunk=None, charjunk=None)
        Construct a text differencer, with optional filters.

    compare(a, b)
        Compare two sequences of lines; generate the resulting delta.
    """

    def __init__(self, linejunk=None, charjunk=None):
        """
        Construct a text differencer, with optional filters.

        The two optional keyword parameters are for filter functions:

        - `linejunk`: A function that should accept a single string argument,
          and return true iff the string is junk.  The module-level function
          `IS_LINE_JUNK` may be used to filter out lines without visible
          characters, except for at most one splat ('#').  It is recommended
          to leave linejunk None; as of Python 2.3, the underlying
          SequenceMatcher class has grown an adaptive notion of "noise" lines
          that's better than any static definition the author has ever been
          able to craft.

        - `charjunk`: A function that should accept a string of length 1. The
          module-level function `IS_CHARACTER_JUNK` may be used to filter out
          whitespace characters (a blank or tab; **note**: bad idea to include
          newline in this!).  Use of IS_CHARACTER_JUNK is recommended.
        """

        # Both filters are simply stored; they are handed to SequenceMatcher
        # (linejunk) and used for intraline comparisons (charjunk) later.
        self.linejunk = linejunk
        self.charjunk = charjunk

    def compare(self, a, b):
        r"""
        Compare two sequences of lines; generate the resulting delta.

        Each sequence must contain individual single-line strings ending with
        newlines. Such sequences can be obtained from the `readlines()` method
        of file-like objects.  The delta generated also consists of newline-
        terminated strings, ready to be printed as-is via the writeline()
        method of a file-like object.

        Example:

        >>> print ''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(1),
        ...                                'ore\ntree\nemu\n'.splitlines(1))),
        - one
        ?  ^
        + ore
        ?  ^
        - two
        - three
        ?  -
        + tree
        + emu
        """

        cruncher = SequenceMatcher(self.linejunk, a, b)
        # Map each top-level opcode onto a generator of delta lines;
        # 'replace' regions get the expensive intraline treatment.
        for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
            if tag == 'replace':
                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
            elif tag == 'delete':
                g = self._dump('-', a, alo, ahi)
            elif tag == 'insert':
                g = self._dump('+', b, blo, bhi)
            elif tag == 'equal':
                g = self._dump(' ', a, alo, ahi)
            else:
                raise ValueError, 'unknown tag %r' % (tag,)

            for line in g:
                yield line

    def _dump(self, tag, x, lo, hi):
        """Generate comparison results for a same-tagged range."""
        for i in xrange(lo, hi):
            yield '%s %s' % (tag, x[i])

    def _plain_replace(self, a, alo, ahi, b, blo, bhi):
        """Generate a straight '+'/'-' dump for a replace region
        (no intraline markup)."""
        assert alo < ahi and blo < bhi
        # dump the shorter block first -- reduces the burden on short-term
        # memory if the blocks are of very different sizes
        if bhi - blo < ahi - alo:
            first  = self._dump('+', b, blo, bhi)
            second = self._dump('-', a, alo, ahi)
        else:
            first  = self._dump('-', a, alo, ahi)
            second = self._dump('+', b, blo, bhi)

        for g in first, second:
            for line in g:
                yield line

    def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
        r"""
        When replacing one block of lines with another, search the blocks
        for *similar* lines; the best-matching pair (if any) is used as a
        synch point, and intraline difference marking is done on the
        similar pair. Lots of work, but often worth it.

        Example:

        >>> d = Differ()
        >>> results = d._fancy_replace(['abcDefghiJkl\n'], 0, 1,
        ...                            ['abcdefGhijkl\n'], 0, 1)
        >>> print ''.join(results),
        - abcDefghiJkl
        ?    ^  ^  ^
        + abcdefGhijkl
        ?    ^  ^  ^
        """

        # don't synch up unless the lines have a similarity score of at
        # least cutoff; best_ratio tracks the best score seen so far
        best_ratio, cutoff = 0.74, 0.75
        cruncher = SequenceMatcher(self.charjunk)
        eqi, eqj = None, None   # 1st indices of equal lines (if any)

        # search for the pair that matches best without being identical
        # (identical lines must be junk lines, & we don't want to synch up
        # on junk -- unless we have to)
        for j in xrange(blo, bhi):
            bj = b[j]
            cruncher.set_seq2(bj)
            for i in xrange(alo, ahi):
                ai = a[i]
                if ai == bj:
                    if eqi is None:
                        eqi, eqj = i, j
                    continue
                cruncher.set_seq1(ai)
                # computing similarity is expensive, so use the quick
                # upper bounds first -- have seen this speed up messy
                # compares by a factor of 3.
                # note that ratio() is only expensive to compute the first
                # time it's called on a sequence pair; the expensive part
                # of the computation is cached by cruncher
                if cruncher.real_quick_ratio() > best_ratio and \
                      cruncher.quick_ratio() > best_ratio and \
                      cruncher.ratio() > best_ratio:
                    best_ratio, best_i, best_j = cruncher.ratio(), i, j
        if best_ratio < cutoff:
            # no non-identical "pretty close" pair
            if eqi is None:
                # no identical pair either -- treat it as a straight replace
                for line in self._plain_replace(a, alo, ahi, b, blo, bhi):
                    yield line
                return
            # no close pair, but an identical pair -- synch up on that
            best_i, best_j, best_ratio = eqi, eqj, 1.0
        else:
            # there's a close pair, so forget the identical pair (if any)
            eqi = None

        # a[best_i] very similar to b[best_j]; eqi is None iff they're not
        # identical

        # pump out diffs from before the synch point
        for line in self._fancy_helper(a, alo, best_i, b, blo, best_j):
            yield line

        # do intraline marking on the synch pair
        aelt, belt = a[best_i], b[best_j]
        if eqi is None:
            # pump out a '-', '?', '+', '?' quad for the synched lines
            atags = btags = ""
            cruncher.set_seqs(aelt, belt)
            for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
                la, lb = ai2 - ai1, bj2 - bj1
                if tag == 'replace':
                    atags += '^' * la
                    btags += '^' * lb
                elif tag == 'delete':
                    atags += '-' * la
                elif tag == 'insert':
                    btags += '+' * lb
                elif tag == 'equal':
                    atags += ' ' * la
                    btags += ' ' * lb
                else:
                    raise ValueError, 'unknown tag %r' % (tag,)
            for line in self._qformat(aelt, belt, atags, btags):
                yield line
        else:
            # the synch pair is identical
            yield '  ' + aelt

        # pump out diffs from after the synch point
        for line in self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi):
            yield line

    def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
        """Recurse on the leftover ranges around a synch point: another
        fancy replace if both sides are non-empty, otherwise a plain dump
        of whichever side remains."""
        g = []
        if alo < ahi:
            if blo < bhi:
                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
            else:
                g = self._dump('-', a, alo, ahi)
        elif blo < bhi:
            g = self._dump('+', b, blo, bhi)

        for line in g:
            yield line

    def _qformat(self, aline, bline, atags, btags):
        r"""
        Format "?" output and deal with leading tabs.

        Example:

        >>> d = Differ()
        >>> results = d._qformat('\tabcDefghiJkl\n', '\tabcdefGhijkl\n',
        ...                      '  ^ ^  ^      ', '  ^ ^  ^      ')
        >>> for line in results: print repr(line)
        ...
        '- \tabcDefghiJkl\n'
        '? \t ^ ^  ^\n'
        '+ \tabcdefGhijkl\n'
        '? \t ^ ^  ^\n'
        """

        # Can hurt, but will probably help most of the time.
        # `common` counts leading tabs shared by both lines whose tag
        # columns are blanks, so the '?' guide lines can reuse real tabs
        # and stay visually aligned under tab-indented text.
        common = min(_count_leading(aline, "\t"),
                     _count_leading(bline, "\t"))
        common = min(common, _count_leading(atags[:common], " "))
        common = min(common, _count_leading(btags[:common], " "))
        atags = atags[common:].rstrip()
        btags = btags[common:].rstrip()

        yield "- " + aline
        if atags:
            yield "? %s%s\n" % ("\t" * common, atags)

        yield "+ " + bline
        if btags:
            yield "? %s%s\n" % ("\t" * common, btags)
1088 | \r | |
1089 | # With respect to junk, an earlier version of ndiff simply refused to\r | |
1090 | # *start* a match with a junk element. The result was cases like this:\r | |
1091 | # before: private Thread currentThread;\r | |
1092 | # after: private volatile Thread currentThread;\r | |
1093 | # If you consider whitespace to be junk, the longest contiguous match\r | |
1094 | # not starting with junk is "e Thread currentThread". So ndiff reported\r | |
1095 | # that "e volatil" was inserted between the 't' and the 'e' in "private".\r | |
1096 | # While an accurate view, to people that's absurd. The current version\r | |
1097 | # looks for matching blocks that are entirely junk-free, then extends the\r | |
1098 | # longest one of those as far as possible but only with matching junk.\r | |
1099 | # So now "currentThread" is matched, then extended to suck up the\r | |
1100 | # preceding blank; then "private" is matched, and extended to suck up the\r | |
1101 | # following blank; then "Thread" is matched; and finally ndiff reports\r | |
1102 | # that "volatile " was inserted before "Thread". The only quibble\r | |
1103 | # remaining is that perhaps it was really the case that " volatile"\r | |
1104 | # was inserted after "private". I can live with that <wink>.\r | |
1105 | \r | |
1106 | import re\r | |
1107 | \r | |
def IS_LINE_JUNK(line, pat=re.compile(r"\s*#?\s*$").match):
    r"""
    Return 1 for ignorable line: iff `line` is blank or contains a single '#'.

    Examples:

    >>> IS_LINE_JUNK('\n')
    True
    >>> IS_LINE_JUNK('  #   \n')
    True
    >>> IS_LINE_JUNK('hello\n')
    False
    """

    # `pat` is bound once, at definition time, to the compiled pattern's
    # match method -- no regex lookup or compilation happens per call.
    match = pat(line)
    return match is not None
1123 | \r | |
def IS_CHARACTER_JUNK(ch, ws=" \t"):
    r"""
    Return 1 for ignorable character: iff `ch` is a space or tab.

    Examples:

    >>> IS_CHARACTER_JUNK(' ')
    True
    >>> IS_CHARACTER_JUNK('\t')
    True
    >>> IS_CHARACTER_JUNK('\n')
    False
    >>> IS_CHARACTER_JUNK('x')
    False
    """

    # Membership in the default whitespace string (blank + tab); note that
    # newline is deliberately excluded from `ws`.
    return ws.find(ch) != -1
1141 | \r | |
1142 | \r | |
1143 | ########################################################################\r | |
1144 | ### Unified Diff\r | |
1145 | ########################################################################\r | |
1146 | \r | |
1147 | def _format_range_unified(start, stop):\r | |
1148 | 'Convert range to the "ed" format'\r | |
1149 | # Per the diff spec at http://www.unix.org/single_unix_specification/\r | |
1150 | beginning = start + 1 # lines start numbering with one\r | |
1151 | length = stop - start\r | |
1152 | if length == 1:\r | |
1153 | return '{}'.format(beginning)\r | |
1154 | if not length:\r | |
1155 | beginning -= 1 # empty ranges begin at line just before the range\r | |
1156 | return '{},{}'.format(beginning, length)\r | |
1157 | \r | |
def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
                 tofiledate='', n=3, lineterm='\n'):
    r"""
    Compare two sequences of lines; generate the delta as a unified diff.

    Unified diffs are a compact way of showing line changes plus a few
    lines of context.  'n' (default three) sets how many context lines
    surround each change.

    By default, the diff control lines (those with ---, +++, or @@) are
    created with a trailing newline.  This matches inputs produced by
    file.readlines(), so the output is directly suitable for
    file.writelines().  For inputs without trailing newlines, pass
    lineterm="" to get uniformly newline-free output.

    The unidiff format normally has a header for filenames and modification
    times.  Any or all of these may be specified using strings for
    'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
    The modification times are normally expressed in the ISO 8601 format.

    Example:

    >>> for line in unified_diff('one two three four'.split(),
    ...             'zero one tree four'.split(), 'Original', 'Current',
    ...             '2005-01-26 23:30:50', '2010-04-02 10:20:52',
    ...             lineterm=''):
    ...     print line                  # doctest: +NORMALIZE_WHITESPACE
    --- Original        2005-01-26 23:30:50
    +++ Current         2010-04-02 10:20:52
    @@ -1,4 +1,4 @@
    +zero
     one
    -two
    -three
    +tree
     four
    """

    header_emitted = False
    for opcode_group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
        # The file header is written lazily, just before the first hunk,
        # so comparing identical sequences yields nothing at all.
        if not header_emitted:
            header_emitted = True
            fromdate = '\t{}'.format(fromfiledate) if fromfiledate else ''
            todate = '\t{}'.format(tofiledate) if tofiledate else ''
            yield '--- {}{}{}'.format(fromfile, fromdate, lineterm)
            yield '+++ {}{}{}'.format(tofile, todate, lineterm)

        first, last = opcode_group[0], opcode_group[-1]
        yield '@@ -{} +{} @@{}'.format(
            _format_range_unified(first[1], last[2]),
            _format_range_unified(first[3], last[4]), lineterm)

        for tag, i1, i2, j1, j2 in opcode_group:
            if tag == 'equal':
                for line in a[i1:i2]:
                    yield ' ' + line
            else:
                if tag != 'insert':
                    # 'replace' or 'delete': old lines go out with '-'.
                    for line in a[i1:i2]:
                        yield '-' + line
                if tag != 'delete':
                    # 'replace' or 'insert': new lines go out with '+'.
                    for line in b[j1:j2]:
                        yield '+' + line
1224 | \r | |
1225 | \r | |
1226 | ########################################################################\r | |
1227 | ### Context Diff\r | |
1228 | ########################################################################\r | |
1229 | \r | |
1230 | def _format_range_context(start, stop):\r | |
1231 | 'Convert range to the "ed" format'\r | |
1232 | # Per the diff spec at http://www.unix.org/single_unix_specification/\r | |
1233 | beginning = start + 1 # lines start numbering with one\r | |
1234 | length = stop - start\r | |
1235 | if not length:\r | |
1236 | beginning -= 1 # empty ranges begin at line just before the range\r | |
1237 | if length <= 1:\r | |
1238 | return '{}'.format(beginning)\r | |
1239 | return '{},{}'.format(beginning, beginning + length - 1)\r | |
1240 | \r | |
1241 | # See http://www.unix.org/single_unix_specification/\r | |
def context_diff(a, b, fromfile='', tofile='',
                 fromfiledate='', tofiledate='', n=3, lineterm='\n'):
    r"""
    Compare two sequences of lines; generate the delta as a context diff.

    Context diffs are a compact way of showing line changes plus a few
    lines of context.  'n' (default three) sets how many context lines
    surround each change.

    By default, the diff control lines (those with *** or ---) are
    created with a trailing newline.  This matches inputs produced by
    file.readlines(), so the output is directly suitable for
    file.writelines().  For inputs without trailing newlines, pass
    lineterm="" to get uniformly newline-free output.

    The context diff format normally has a header for filenames and
    modification times.  Any or all of these may be specified using
    strings for 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
    The modification times are normally expressed in the ISO 8601 format.
    If not specified, the strings default to blanks.

    Example:

    >>> print ''.join(context_diff('one\ntwo\nthree\nfour\n'.splitlines(1),
    ...       'zero\none\ntree\nfour\n'.splitlines(1), 'Original', 'Current')),
    *** Original
    --- Current
    ***************
    *** 1,4 ****
      one
    ! two
    ! three
      four
    --- 1,4 ----
    + zero
      one
    ! tree
      four
    """

    prefix = dict(insert='+ ', delete='- ', replace='! ', equal='  ')
    header_emitted = False
    for opcode_group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
        # The file header is written lazily, just before the first hunk,
        # so comparing identical sequences yields nothing at all.
        if not header_emitted:
            header_emitted = True
            fromdate = '\t{}'.format(fromfiledate) if fromfiledate else ''
            todate = '\t{}'.format(tofiledate) if tofiledate else ''
            yield '*** {}{}{}'.format(fromfile, fromdate, lineterm)
            yield '--- {}{}{}'.format(tofile, todate, lineterm)

        first, last = opcode_group[0], opcode_group[-1]
        yield '***************' + lineterm

        yield '*** {} ****{}'.format(
            _format_range_context(first[1], last[2]), lineterm)

        tags = [op[0] for op in opcode_group]
        # Emit the "from" half only when something was deleted or changed.
        if 'replace' in tags or 'delete' in tags:
            for tag, i1, i2, _, _ in opcode_group:
                if tag != 'insert':
                    for line in a[i1:i2]:
                        yield prefix[tag] + line

        yield '--- {} ----{}'.format(
            _format_range_context(first[3], last[4]), lineterm)

        # Emit the "to" half only when something was inserted or changed.
        if 'replace' in tags or 'insert' in tags:
            for tag, _, _, j1, j2 in opcode_group:
                if tag != 'delete':
                    for line in b[j1:j2]:
                        yield prefix[tag] + line
1315 | \r | |
def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK):
    r"""
    Compare `a` and `b` (lists of strings); return a `Differ`-style delta.

    Optional keyword parameters `linejunk` and `charjunk` are for filter
    functions (or None):

    - linejunk: A function that should accept a single string argument, and
      return true iff the string is junk.  The default is None, and is
      recommended; as of Python 2.3, an adaptive notion of "noise" lines is
      used that does a good job on its own.

    - charjunk: A function that should accept a string of length 1.  The
      default is module-level function IS_CHARACTER_JUNK, which filters out
      whitespace characters (a blank or tab; note: bad idea to include newline
      in this!).

    Tools/scripts/ndiff.py is a command-line front-end to this function.

    Example:

    >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
    ...              'ore\ntree\nemu\n'.splitlines(1))
    >>> print ''.join(diff),
    - one
    ?  ^
    + ore
    ?  ^
    - two
    - three
    ?  -
    + tree
    + emu
    """
    # Thin convenience wrapper: delegate straight to Differ.compare,
    # which returns a generator of delta lines.
    differ = Differ(linejunk, charjunk)
    return differ.compare(a, b)
1351 | \r | |
1352 | def _mdiff(fromlines, tolines, context=None, linejunk=None,\r | |
1353 | charjunk=IS_CHARACTER_JUNK):\r | |
1354 | r"""Returns generator yielding marked up from/to side by side differences.\r | |
1355 | \r | |
1356 | Arguments:\r | |
1357 | fromlines -- list of text lines to compared to tolines\r | |
1358 | tolines -- list of text lines to be compared to fromlines\r | |
1359 | context -- number of context lines to display on each side of difference,\r | |
1360 | if None, all from/to text lines will be generated.\r | |
1361 | linejunk -- passed on to ndiff (see ndiff documentation)\r | |
1362 | charjunk -- passed on to ndiff (see ndiff documentation)\r | |
1363 | \r | |
1364 | This function returns an interator which returns a tuple:\r | |
1365 | (from line tuple, to line tuple, boolean flag)\r | |
1366 | \r | |
1367 | from/to line tuple -- (line num, line text)\r | |
1368 | line num -- integer or None (to indicate a context separation)\r | |
1369 | line text -- original line text with following markers inserted:\r | |
1370 | '\0+' -- marks start of added text\r | |
1371 | '\0-' -- marks start of deleted text\r | |
1372 | '\0^' -- marks start of changed text\r | |
1373 | '\1' -- marks end of added/deleted/changed text\r | |
1374 | \r | |
1375 | boolean flag -- None indicates context separation, True indicates\r | |
1376 | either "from" or "to" line contains a change, otherwise False.\r | |
1377 | \r | |
1378 | This function/iterator was originally developed to generate side by side\r | |
1379 | file difference for making HTML pages (see HtmlDiff class for example\r | |
1380 | usage).\r | |
1381 | \r | |
1382 | Note, this function utilizes the ndiff function to generate the side by\r | |
1383 | side difference markup. Optional ndiff arguments may be passed to this\r | |
1384 | function and they in turn will be passed to ndiff.\r | |
1385 | """\r | |
1386 | import re\r | |
1387 | \r | |
1388 | # regular expression for finding intraline change indices\r | |
1389 | change_re = re.compile('(\++|\-+|\^+)')\r | |
1390 | \r | |
1391 | # create the difference iterator to generate the differences\r | |
1392 | diff_lines_iterator = ndiff(fromlines,tolines,linejunk,charjunk)\r | |
1393 | \r | |
1394 | def _make_line(lines, format_key, side, num_lines=[0,0]):\r | |
1395 | """Returns line of text with user's change markup and line formatting.\r | |
1396 | \r | |
1397 | lines -- list of lines from the ndiff generator to produce a line of\r | |
1398 | text from. When producing the line of text to return, the\r | |
1399 | lines used are removed from this list.\r | |
1400 | format_key -- '+' return first line in list with "add" markup around\r | |
1401 | the entire line.\r | |
1402 | '-' return first line in list with "delete" markup around\r | |
1403 | the entire line.\r | |
1404 | '?' return first line in list with add/delete/change\r | |
1405 | intraline markup (indices obtained from second line)\r | |
1406 | None return first line in list with no markup\r | |
1407 | side -- indice into the num_lines list (0=from,1=to)\r | |
1408 | num_lines -- from/to current line number. This is NOT intended to be a\r | |
1409 | passed parameter. It is present as a keyword argument to\r | |
1410 | maintain memory of the current line numbers between calls\r | |
1411 | of this function.\r | |
1412 | \r | |
1413 | Note, this function is purposefully not defined at the module scope so\r | |
1414 | that data it needs from its parent function (within whose context it\r | |
1415 | is defined) does not need to be of module scope.\r | |
1416 | """\r | |
1417 | num_lines[side] += 1\r | |
1418 | # Handle case where no user markup is to be added, just return line of\r | |
1419 | # text with user's line format to allow for usage of the line number.\r | |
1420 | if format_key is None:\r | |
1421 | return (num_lines[side],lines.pop(0)[2:])\r | |
1422 | # Handle case of intraline changes\r | |
1423 | if format_key == '?':\r | |
1424 | text, markers = lines.pop(0), lines.pop(0)\r | |
1425 | # find intraline changes (store change type and indices in tuples)\r | |
1426 | sub_info = []\r | |
1427 | def record_sub_info(match_object,sub_info=sub_info):\r | |
1428 | sub_info.append([match_object.group(1)[0],match_object.span()])\r | |
1429 | return match_object.group(1)\r | |
1430 | change_re.sub(record_sub_info,markers)\r | |
1431 | # process each tuple inserting our special marks that won't be\r | |
1432 | # noticed by an xml/html escaper.\r | |
1433 | for key,(begin,end) in sub_info[::-1]:\r | |
1434 | text = text[0:begin]+'\0'+key+text[begin:end]+'\1'+text[end:]\r | |
1435 | text = text[2:]\r | |
1436 | # Handle case of add/delete entire line\r | |
1437 | else:\r | |
1438 | text = lines.pop(0)[2:]\r | |
1439 | # if line of text is just a newline, insert a space so there is\r | |
1440 | # something for the user to highlight and see.\r | |
1441 | if not text:\r | |
1442 | text = ' '\r | |
1443 | # insert marks that won't be noticed by an xml/html escaper.\r | |
1444 | text = '\0' + format_key + text + '\1'\r | |
1445 | # Return line of text, first allow user's line formatter to do its\r | |
1446 | # thing (such as adding the line number) then replace the special\r | |
1447 | # marks with what the user's change markup.\r | |
1448 | return (num_lines[side],text)\r | |
1449 | \r | |
1450 | def _line_iterator():\r | |
1451 | """Yields from/to lines of text with a change indication.\r | |
1452 | \r | |
1453 | This function is an iterator. It itself pulls lines from a\r | |
1454 | differencing iterator, processes them and yields them. When it can\r | |
1455 | it yields both a "from" and a "to" line, otherwise it will yield one\r | |
1456 | or the other. In addition to yielding the lines of from/to text, a\r | |
1457 | boolean flag is yielded to indicate if the text line(s) have\r | |
1458 | differences in them.\r | |
1459 | \r | |
1460 | Note, this function is purposefully not defined at the module scope so\r | |
1461 | that data it needs from its parent function (within whose context it\r | |
1462 | is defined) does not need to be of module scope.\r | |
1463 | """\r | |
1464 | lines = []\r | |
1465 | num_blanks_pending, num_blanks_to_yield = 0, 0\r | |
1466 | while True:\r | |
1467 | # Load up next 4 lines so we can look ahead, create strings which\r | |
1468 | # are a concatenation of the first character of each of the 4 lines\r | |
1469 | # so we can do some very readable comparisons.\r | |
1470 | while len(lines) < 4:\r | |
1471 | try:\r | |
1472 | lines.append(diff_lines_iterator.next())\r | |
1473 | except StopIteration:\r | |
1474 | lines.append('X')\r | |
1475 | s = ''.join([line[0] for line in lines])\r | |
1476 | if s.startswith('X'):\r | |
1477 | # When no more lines, pump out any remaining blank lines so the\r | |
1478 | # corresponding add/delete lines get a matching blank line so\r | |
1479 | # all line pairs get yielded at the next level.\r | |
1480 | num_blanks_to_yield = num_blanks_pending\r | |
1481 | elif s.startswith('-?+?'):\r | |
1482 | # simple intraline change\r | |
1483 | yield _make_line(lines,'?',0), _make_line(lines,'?',1), True\r | |
1484 | continue\r | |
1485 | elif s.startswith('--++'):\r | |
1486 | # in delete block, add block coming: we do NOT want to get\r | |
1487 | # caught up on blank lines yet, just process the delete line\r | |
1488 | num_blanks_pending -= 1\r | |
1489 | yield _make_line(lines,'-',0), None, True\r | |
1490 | continue\r | |
1491 | elif s.startswith(('--?+', '--+', '- ')):\r | |
1492 | # in delete block and see a intraline change or unchanged line\r | |
1493 | # coming: yield the delete line and then blanks\r | |
1494 | from_line,to_line = _make_line(lines,'-',0), None\r | |
1495 | num_blanks_to_yield,num_blanks_pending = num_blanks_pending-1,0\r | |
1496 | elif s.startswith('-+?'):\r | |
1497 | # intraline change\r | |
1498 | yield _make_line(lines,None,0), _make_line(lines,'?',1), True\r | |
1499 | continue\r | |
1500 | elif s.startswith('-?+'):\r | |
1501 | # intraline change\r | |
1502 | yield _make_line(lines,'?',0), _make_line(lines,None,1), True\r | |
1503 | continue\r | |
1504 | elif s.startswith('-'):\r | |
1505 | # delete FROM line\r | |
1506 | num_blanks_pending -= 1\r | |
1507 | yield _make_line(lines,'-',0), None, True\r | |
1508 | continue\r | |
1509 | elif s.startswith('+--'):\r | |
1510 | # in add block, delete block coming: we do NOT want to get\r | |
1511 | # caught up on blank lines yet, just process the add line\r | |
1512 | num_blanks_pending += 1\r | |
1513 | yield None, _make_line(lines,'+',1), True\r | |
1514 | continue\r | |
1515 | elif s.startswith(('+ ', '+-')):\r | |
1516 | # will be leaving an add block: yield blanks then add line\r | |
1517 | from_line, to_line = None, _make_line(lines,'+',1)\r | |
1518 | num_blanks_to_yield,num_blanks_pending = num_blanks_pending+1,0\r | |
1519 | elif s.startswith('+'):\r | |
1520 | # inside an add block, yield the add line\r | |
1521 | num_blanks_pending += 1\r | |
1522 | yield None, _make_line(lines,'+',1), True\r | |
1523 | continue\r | |
1524 | elif s.startswith(' '):\r | |
1525 | # unchanged text, yield it to both sides\r | |
1526 | yield _make_line(lines[:],None,0),_make_line(lines,None,1),False\r | |
1527 | continue\r | |
1528 | # Catch up on the blank lines so when we yield the next from/to\r | |
1529 | # pair, they are lined up.\r | |
1530 | while(num_blanks_to_yield < 0):\r | |
1531 | num_blanks_to_yield += 1\r | |
1532 | yield None,('','\n'),True\r | |
1533 | while(num_blanks_to_yield > 0):\r | |
1534 | num_blanks_to_yield -= 1\r | |
1535 | yield ('','\n'),None,True\r | |
1536 | if s.startswith('X'):\r | |
1537 | raise StopIteration\r | |
1538 | else:\r | |
1539 | yield from_line,to_line,True\r | |
1540 | \r | |
1541 | def _line_pair_iterator():\r | |
1542 | """Yields from/to lines of text with a change indication.\r | |
1543 | \r | |
1544 | This function is an iterator. It itself pulls lines from the line\r | |
1545 | iterator. Its difference from that iterator is that this function\r | |
1546 | always yields a pair of from/to text lines (with the change\r | |
1547 | indication). If necessary it will collect single from/to lines\r | |
1548 | until it has a matching pair from/to pair to yield.\r | |
1549 | \r | |
1550 | Note, this function is purposefully not defined at the module scope so\r | |
1551 | that data it needs from its parent function (within whose context it\r | |
1552 | is defined) does not need to be of module scope.\r | |
1553 | """\r | |
1554 | line_iterator = _line_iterator()\r | |
1555 | fromlines,tolines=[],[]\r | |
1556 | while True:\r | |
1557 | # Collecting lines of text until we have a from/to pair\r | |
1558 | while (len(fromlines)==0 or len(tolines)==0):\r | |
1559 | from_line, to_line, found_diff =line_iterator.next()\r | |
1560 | if from_line is not None:\r | |
1561 | fromlines.append((from_line,found_diff))\r | |
1562 | if to_line is not None:\r | |
1563 | tolines.append((to_line,found_diff))\r | |
1564 | # Once we have a pair, remove them from the collection and yield it\r | |
1565 | from_line, fromDiff = fromlines.pop(0)\r | |
1566 | to_line, to_diff = tolines.pop(0)\r | |
1567 | yield (from_line,to_line,fromDiff or to_diff)\r | |
1568 | \r | |
1569 | # Handle case where user does not want context differencing, just yield\r | |
1570 | # them up without doing anything else with them.\r | |
1571 | line_pair_iterator = _line_pair_iterator()\r | |
1572 | if context is None:\r | |
1573 | while True:\r | |
1574 | yield line_pair_iterator.next()\r | |
1575 | # Handle case where user wants context differencing. We must do some\r | |
1576 | # storage of lines until we know for sure that they are to be yielded.\r | |
1577 | else:\r | |
1578 | context += 1\r | |
1579 | lines_to_write = 0\r | |
1580 | while True:\r | |
1581 | # Store lines up until we find a difference, note use of a\r | |
1582 | # circular queue because we only need to keep around what\r | |
1583 | # we need for context.\r | |
1584 | index, contextLines = 0, [None]*(context)\r | |
1585 | found_diff = False\r | |
1586 | while(found_diff is False):\r | |
1587 | from_line, to_line, found_diff = line_pair_iterator.next()\r | |
1588 | i = index % context\r | |
1589 | contextLines[i] = (from_line, to_line, found_diff)\r | |
1590 | index += 1\r | |
1591 | # Yield lines that we have collected so far, but first yield\r | |
1592 | # the user's separator.\r | |
1593 | if index > context:\r | |
1594 | yield None, None, None\r | |
1595 | lines_to_write = context\r | |
1596 | else:\r | |
1597 | lines_to_write = index\r | |
1598 | index = 0\r | |
1599 | while(lines_to_write):\r | |
1600 | i = index % context\r | |
1601 | index += 1\r | |
1602 | yield contextLines[i]\r | |
1603 | lines_to_write -= 1\r | |
1604 | # Now yield the context lines after the change\r | |
1605 | lines_to_write = context-1\r | |
1606 | while(lines_to_write):\r | |
1607 | from_line, to_line, found_diff = line_pair_iterator.next()\r | |
1608 | # If another change within the context, extend the context\r | |
1609 | if found_diff:\r | |
1610 | lines_to_write = context-1\r | |
1611 | else:\r | |
1612 | lines_to_write -= 1\r | |
1613 | yield from_line, to_line, found_diff\r | |
1614 | \r | |
1615 | \r | |
# HTML boilerplate used by HtmlDiff.  The "%(name)s" placeholders are filled
# in with old-style string formatting.  "&nbsp;" entities are used where a
# literal space would be collapsed or line-wrapped by the browser.
_file_template = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
          "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html>

<head>
    <meta http-equiv="Content-Type"
          content="text/html; charset=ISO-8859-1" />
    <title></title>
    <style type="text/css">%(styles)s
    </style>
</head>

<body>
    %(table)s%(legend)s
</body>

</html>"""

_styles = """
        table.diff {font-family:Courier; border:medium;}
        .diff_header {background-color:#e0e0e0}
        td.diff_header {text-align:right}
        .diff_next {background-color:#c0c0c0}
        .diff_add {background-color:#aaffaa}
        .diff_chg {background-color:#ffff77}
        .diff_sub {background-color:#ffaaaa}"""

_table_template = """
    <table class="diff" id="difflib_chg_%(prefix)s_top"
           cellspacing="0" cellpadding="0" rules="groups" >
        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
        %(header_row)s
        <tbody>
%(data_rows)s        </tbody>
    </table>"""

_legend = """
    <table class="diff" summary="Legends">
        <tr> <th colspan="2"> Legends </th> </tr>
        <tr> <td> <table border="" summary="Colors">
                      <tr><th> Colors </th> </tr>
                      <tr><td class="diff_add">&nbsp;Added&nbsp;</td></tr>
                      <tr><td class="diff_chg">Changed</td> </tr>
                      <tr><td class="diff_sub">Deleted</td> </tr>
                  </table></td>
             <td> <table border="" summary="Links">
                      <tr><th colspan="2"> Links </th> </tr>
                      <tr><td>(f)irst change</td> </tr>
                      <tr><td>(n)ext change</td> </tr>
                      <tr><td>(t)op</td> </tr>
                  </table></td> </tr>
    </table>"""
1671 | \r | |
class HtmlDiff(object):
    """For producing HTML side by side comparison with change highlights.

    This class can be used to create an HTML table (or a complete HTML file
    containing the table) showing a side by side, line by line comparison
    of text with inter-line and intra-line change highlights.  The table can
    be generated in either full or contextual difference mode.

    The following methods are provided for HTML generation:

    make_table -- generates HTML for a single side by side table
    make_file -- generates complete HTML file with a single side by side table

    See tools/scripts/diff.py for an example usage of this class.
    """

    _file_template = _file_template
    _styles = _styles
    _table_template = _table_template
    _legend = _legend
    # class-level counter used to build unique anchor prefixes per table
    _default_prefix = 0

    def __init__(self,tabsize=8,wrapcolumn=None,linejunk=None,
                 charjunk=IS_CHARACTER_JUNK):
        """HtmlDiff instance initializer

        Arguments:
        tabsize -- tab stop spacing, defaults to 8.
        wrapcolumn -- column number where lines are broken and wrapped,
            defaults to None where lines are not wrapped.
        linejunk,charjunk -- keyword arguments passed into ndiff() (used to by
            HtmlDiff() to generate the side by side HTML differences).  See
            ndiff() documentation for argument default values and descriptions.
        """
        self._tabsize = tabsize
        self._wrapcolumn = wrapcolumn
        self._linejunk = linejunk
        self._charjunk = charjunk

    def make_file(self,fromlines,tolines,fromdesc='',todesc='',context=False,
                  numlines=5):
        """Returns HTML file of side by side comparison with change highlights

        Arguments:
        fromlines -- list of "from" lines
        tolines -- list of "to" lines
        fromdesc -- "from" file column header string
        todesc -- "to" file column header string
        context -- set to True for contextual differences (defaults to False
            which shows full differences).
        numlines -- number of context lines.  When context is set True,
            controls number of lines displayed before and after the change.
            When context is False, controls the number of lines to place
            the "next" link anchors before the next change (so click of
            "next" link jumps to just before the change).
        """

        return self._file_template % dict(
            styles = self._styles,
            legend = self._legend,
            table = self.make_table(fromlines,tolines,fromdesc,todesc,
                                    context=context,numlines=numlines))

    def _tab_newline_replace(self,fromlines,tolines):
        """Returns from/to line lists with tabs expanded and newlines removed.

        Instead of tab characters being replaced by the number of spaces
        needed to fill in to the next tab stop, this function will fill
        the space with tab characters.  This is done so that the difference
        algorithms can identify changes in a file when tabs are replaced by
        spaces and vice versa.  At the end of the HTML generation, the tab
        characters will be replaced with a nonbreakable space.
        """
        def expand_tabs(line):
            # hide real spaces
            line = line.replace(' ','\0')
            # expand tabs into spaces
            line = line.expandtabs(self._tabsize)
            # replace spaces from expanded tabs back into tab characters
            # (we'll replace them with markup after we do differencing)
            line = line.replace(' ','\t')
            return line.replace('\0',' ').rstrip('\n')
        fromlines = [expand_tabs(line) for line in fromlines]
        tolines = [expand_tabs(line) for line in tolines]
        return fromlines,tolines

    def _split_line(self,data_list,line_num,text):
        """Builds list of text lines by splitting text lines at wrap point

        This function will determine if the input text line needs to be
        wrapped (split) into separate lines.  If so, the first wrap point
        will be determined and the first line appended to the output
        text line list.  This function is used recursively to handle
        the second part of the split line to further split it.
        """
        # if blank line or context separator, just add it to the output list
        if not line_num:
            data_list.append((line_num,text))
            return

        # if line text doesn't need wrapping, just add it to the output list
        size = len(text)
        # renamed from "max" so the builtin is not shadowed
        max_len = self._wrapcolumn
        if (size <= max_len) or ((size -(text.count('\0')*3)) <= max_len):
            data_list.append((line_num,text))
            return

        # scan text looking for the wrap point, keeping track if the wrap
        # point is inside markers
        i = 0
        n = 0
        mark = ''
        while n < max_len and i < size:
            if text[i] == '\0':
                i += 1
                mark = text[i]
                i += 1
            elif text[i] == '\1':
                i += 1
                mark = ''
            else:
                i += 1
                n += 1

        # wrap point is inside text, break it up into separate lines
        line1 = text[:i]
        line2 = text[i:]

        # if wrap point is inside markers, place end marker at end of first
        # line and start marker at beginning of second line because each
        # line will have its own table tag markup around it.
        if mark:
            line1 = line1 + '\1'
            line2 = '\0' + mark + line2

        # tack on first line onto the output list
        data_list.append((line_num,line1))

        # use this routine again to wrap the remaining text
        self._split_line(data_list,'>',line2)

    def _line_wrapper(self,diffs):
        """Returns iterator that splits (wraps) mdiff text lines"""

        # pull from/to data and flags from mdiff iterator
        for fromdata,todata,flag in diffs:
            # check for context separators and pass them through
            if flag is None:
                yield fromdata,todata,flag
                continue
            (fromline,fromtext),(toline,totext) = fromdata,todata
            # for each from/to line split it at the wrap column to form
            # list of text lines.
            fromlist,tolist = [],[]
            self._split_line(fromlist,fromline,fromtext)
            self._split_line(tolist,toline,totext)
            # yield from/to line in pairs inserting blank lines as
            # necessary when one side has more wrapped lines
            while fromlist or tolist:
                if fromlist:
                    fromdata = fromlist.pop(0)
                else:
                    fromdata = ('',' ')
                if tolist:
                    todata = tolist.pop(0)
                else:
                    todata = ('',' ')
                yield fromdata,todata,flag

    def _collect_lines(self,diffs):
        """Collects mdiff output into separate lists

        Before storing the mdiff from/to data into a list, it is converted
        into a single line of text with HTML markup.
        """

        fromlist,tolist,flaglist = [],[],[]
        # pull from/to data and flags from mdiff style iterator
        for fromdata,todata,flag in diffs:
            try:
                # store HTML markup of the lines into the lists
                fromlist.append(self._format_line(0,flag,*fromdata))
                tolist.append(self._format_line(1,flag,*todata))
            except TypeError:
                # exceptions occur for lines where context separators go
                fromlist.append(None)
                tolist.append(None)
            flaglist.append(flag)
        return fromlist,tolist,flaglist

    def _format_line(self,side,flag,linenum,text):
        """Returns HTML markup of "from" / "to" text lines

        side -- 0 or 1 indicating "from" or "to" text
        flag -- indicates if difference on line
        linenum -- line number (used for line number column)
        text -- line text to be marked up
        """
        try:
            linenum = '%d' % linenum
            # renamed from "id" so the builtin is not shadowed
            id_attr = ' id="%s%s"' % (self._prefix[side],linenum)
        except TypeError:
            # handle blank lines where linenum is '>' or ''
            id_attr = ''
        # replace those things that would get confused with HTML symbols
        text=text.replace("&","&amp;").replace(">","&gt;").replace("<","&lt;")

        # make space non-breakable so they don't get compressed or line wrapped
        text = text.replace(' ','&nbsp;').rstrip()

        return '<td class="diff_header"%s>%s</td><td nowrap="nowrap">%s</td>' \
               % (id_attr,linenum,text)

    def _make_prefix(self):
        """Create unique anchor prefixes"""

        # Generate a unique anchor prefix so multiple tables
        # can exist on the same HTML page without conflicts.
        fromprefix = "from%d_" % HtmlDiff._default_prefix
        toprefix = "to%d_" % HtmlDiff._default_prefix
        HtmlDiff._default_prefix += 1
        # store prefixes so line format method has access
        self._prefix = [fromprefix,toprefix]

    def _convert_flags(self,fromlist,tolist,flaglist,context,numlines):
        """Makes list of "next" links"""

        # all anchor names will be generated using the unique "to" prefix
        toprefix = self._prefix[1]

        # process change flags, generating middle column of next anchors/links
        next_id = ['']*len(flaglist)
        next_href = ['']*len(flaglist)
        num_chg, in_change = 0, False
        last = 0
        for i,flag in enumerate(flaglist):
            if flag:
                if not in_change:
                    in_change = True
                    last = i
                    # at the beginning of a change, drop an anchor a few lines
                    # (the context lines) before the change for the previous
                    # link
                    i = max([0,i-numlines])
                    next_id[i] = ' id="difflib_chg_%s_%d"' % (toprefix,num_chg)
                    # at the beginning of a change, drop a link to the next
                    # change
                    num_chg += 1
                    next_href[last] = '<a href="#difflib_chg_%s_%d">n</a>' % (
                         toprefix,num_chg)
            else:
                in_change = False
        # check for cases where there is no content to avoid exceptions
        if not flaglist:
            flaglist = [False]
            next_id = ['']
            next_href = ['']
            last = 0
            if context:
                fromlist = ['<td></td><td>&nbsp;No Differences Found&nbsp;</td>']
                tolist = fromlist
            else:
                fromlist = tolist = ['<td></td><td>&nbsp;Empty File&nbsp;</td>']
        # if not a change on first line, drop a link
        if not flaglist[0]:
            next_href[0] = '<a href="#difflib_chg_%s_0">f</a>' % toprefix
        # redo the last link to link to the top
        next_href[last] = '<a href="#difflib_chg_%s_top">t</a>' % (toprefix)

        return fromlist,tolist,flaglist,next_href,next_id

    def make_table(self,fromlines,tolines,fromdesc='',todesc='',context=False,
                   numlines=5):
        """Returns HTML table of side by side comparison with change highlights

        Arguments:
        fromlines -- list of "from" lines
        tolines -- list of "to" lines
        fromdesc -- "from" file column header string
        todesc -- "to" file column header string
        context -- set to True for contextual differences (defaults to False
            which shows full differences).
        numlines -- number of context lines.  When context is set True,
            controls number of lines displayed before and after the change.
            When context is False, controls the number of lines to place
            the "next" link anchors before the next change (so click of
            "next" link jumps to just before the change).
        """

        # make unique anchor prefixes so that multiple tables may exist
        # on the same page without conflict.
        self._make_prefix()

        # change tabs to spaces before it gets more difficult after we insert
        # markup
        fromlines,tolines = self._tab_newline_replace(fromlines,tolines)

        # create diffs iterator which generates side by side from/to data
        if context:
            context_lines = numlines
        else:
            context_lines = None
        diffs = _mdiff(fromlines,tolines,context_lines,linejunk=self._linejunk,
                      charjunk=self._charjunk)

        # set up iterator to wrap lines that exceed desired width
        if self._wrapcolumn:
            diffs = self._line_wrapper(diffs)

        # collect up from/to lines and flags into lists (also format the lines)
        fromlist,tolist,flaglist = self._collect_lines(diffs)

        # process change flags, generating middle column of next anchors/links
        fromlist,tolist,flaglist,next_href,next_id = self._convert_flags(
            fromlist,tolist,flaglist,context,numlines)

        s = []
        fmt = '            <tr><td class="diff_next"%s>%s</td>%s' + \
              '<td class="diff_next">%s</td>%s</tr>\n'
        for i in range(len(flaglist)):
            if flaglist[i] is None:
                # mdiff yields None on separator lines skip the bogus ones
                # generated for the first line
                if i > 0:
                    s.append('        </tbody>        \n        <tbody>\n')
            else:
                s.append( fmt % (next_id[i],next_href[i],fromlist[i],
                                 next_href[i],tolist[i]))
        if fromdesc or todesc:
            header_row = '<thead><tr>%s%s%s%s</tr></thead>' % (
                '<th class="diff_next"><br /></th>',
                '<th colspan="2" class="diff_header">%s</th>' % fromdesc,
                '<th class="diff_next"><br /></th>',
                '<th colspan="2" class="diff_header">%s</th>' % todesc)
        else:
            header_row = ''

        table = self._table_template % dict(
            data_rows=''.join(s),
            header_row=header_row,
            prefix=self._prefix[1])

        # swap the escape-proof placeholder marks for real markup now that
        # HTML escaping is done; tabs stand in for expanded-tab spaces and
        # become non-breakable spaces (see _tab_newline_replace).
        return table.replace('\0+','<span class="diff_add">'). \
                     replace('\0-','<span class="diff_sub">'). \
                     replace('\0^','<span class="diff_chg">'). \
                     replace('\1','</span>'). \
                     replace('\t','&nbsp;')
2019 | \r | |
# presumably 're' was needed only to compile the module-level regexes used
# above; drop the name from the module namespace -- TODO confirm against the
# import at the top of the file
del re
2021 | \r | |
def restore(delta, which):
    r"""
    Generate one of the two sequences that generated a delta.

    Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract
    lines originating from file 1 or 2 (parameter `which`), stripping off line
    prefixes.

    Examples:

    >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
    ...              'ore\ntree\nemu\n'.splitlines(1))
    >>> diff = list(diff)
    >>> print ''.join(restore(diff, 1)),
    one
    two
    three
    >>> print ''.join(restore(diff, 2)),
    ore
    tree
    emu
    """
    try:
        tag = {1: "- ", 2: "+ "}[int(which)]
    except KeyError:
        # use the call form of raise (valid in Python 2 and 3) rather than
        # the Python 2-only "raise ValueError, ..." statement form
        raise ValueError('unknown delta choice (must be 1 or 2): %r'
                         % which)
    # prefixes are two characters wide, matching the line[:2] slice below:
    # "  " marks unchanged lines, tag marks lines from the chosen side
    prefixes = ("  ", tag)
    for line in delta:
        if line[:2] in prefixes:
            yield line[2:]
2053 | \r | |
def _test():
    """Run this module's doctests and return the doctest summary."""
    import doctest
    import difflib
    return doctest.testmod(difflib)

if __name__ == "__main__":
    _test()