Line Flow Count Block(s) Source
1 - /*
2 - * LibXDiff by Davide Libenzi ( File Differential Library )
3 - * Copyright (C) 2003-2016 Davide Libenzi, Johannes E. Schindelin
4 - *
5 - * This library is free software; you can redistribute it and/or
6 - * modify it under the terms of the GNU Lesser General Public
7 - * License as published by the Free Software Foundation; either
8 - * version 2.1 of the License, or (at your option) any later version.
9 - *
10 - * This library is distributed in the hope that it will be useful,
11 - * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 - * Lesser General Public License for more details.
14 - *
15 - * You should have received a copy of the GNU Lesser General Public
16 - * License along with this library; if not, see
17 - * <http://www.gnu.org/licenses/>.
18 - *
19 - * Davide Libenzi <davidel@xmailserver.org>
20 - *
21 - */
22 - #include "xinclude.h"
23 - #include "xtypes.h"
24 - #include "xdiff.h"
25 -
26 - /*
27 - * The basic idea of patience diff is to find lines that are unique in
28 - * both files. These are intuitively the ones that we want to see as
29 - * common lines.
30 - *
31 - * The maximal ordered sequence of such line pairs (where ordered means
32 - * that the order in the sequence agrees with the order of the lines in
33 - * both files) naturally defines an initial set of common lines.
34 - *
35 - * Now, the algorithm tries to extend the set of common lines by growing
36 - * the line ranges where the files have identical lines.
37 - *
38 - * Between those common lines, the patience diff algorithm is applied
39 - * recursively, until no unique line pairs can be found; these line ranges
40 - * are handled by the well-known Myers algorithm.
41 - */
42 -
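/*
 * Illustrative sketch (not part of xpatience.c): the "unique in both
 * files" idea from the comment above, reduced to NUL-terminated C
 * strings compared with strcmp().  The sample lines and the count_in()
 * helper are made up for this example; the real code below works on
 * hashed xrecord_t lines via a hashmap and xdl_recmatch().
 */
#include <stdio.h>
#include <string.h>

static int count_in(const char *needle, const char **lines, int n)
{
	int i, count = 0;
	for (i = 0; i < n; i++)
		if (!strcmp(needle, lines[i]))
			count++;
	return count;
}

int main(void)
{
	const char *a[] = { "if (x)", "{", "foo();", "}",
			    "else", "{", "bar();", "}" };
	const char *b[] = { "if (x)", "{", "foo2();", "}",
			    "else", "{", "bar();", "}" };
	int na = 8, nb = 8, i;

	/* candidate pairs are lines occurring exactly once in each file */
	for (i = 0; i < na; i++)
		if (count_in(a[i], a, na) == 1 && count_in(a[i], b, nb) == 1)
			printf("common unique line: %s\n", a[i]);
	/* prints "if (x)", "else" and "bar();"; the braces repeat, so
	 * they are skipped even though they appear in both files */
	return 0;
}
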
43 - #define NON_UNIQUE ULONG_MAX
44 -
45 - /*
46 - * This is a hash mapping from line hash to line numbers in the first and
47 - * second file.
48 - */
49 - struct hashmap {
50 - int nr, alloc;
51 - struct entry {
52 - unsigned long hash;
53 - /*
54 - * 0 = unused entry, 1 = first line, 2 = second, etc.
55 - * line2 is NON_UNIQUE if the line is not unique
56 - * in either the first or the second file.
57 - */
58 - unsigned long line1, line2;
59 - /*
60 - * "next" & "previous" are used for the longest common
61 - * sequence;
62 - * initially, "next" reflects only the order in file1.
63 - */
64 - struct entry *next, *previous;
65 -
66 - /*
67 - * If 1, this entry can serve as an anchor. See
68 - * Documentation/diff-options.txt for more information.
69 - */
70 - unsigned anchor : 1;
71 - } *entries, *first, *last;
72 - /* were common records found? */
73 - unsigned long has_matches;
74 - mmfile_t *file1, *file2;
75 - xdfenv_t *env;
76 - xpparam_t const *xpp;
77 - };
78 -
79 6 2 static int is_anchor(xpparam_t const *xpp, const char *line)
80 - {
81 - unsigned long i;
82 6 2,5,6 for (i = 0; i < xpp->anchors_nr; i++) {
83 ##### 3 if (!strncmp(line, xpp->anchors[i], strlen(xpp->anchors[i])))
84 ##### 4 return 1;
85 - }
86 6 7 return 0;
87 - }
88 -
89 - /* The argument "pass" is 1 for the first file, 2 for the second. */
90 17 2 static void insert_record(xpparam_t const *xpp, int line, struct hashmap *map,
91 - int pass)
92 - {
93 17 5 xrecord_t **records = pass == 1 ?
94 17 2-4 map->env->xdf1.recs : map->env->xdf2.recs;
95 17 5 xrecord_t *record = records[line - 1], *other;
96 - /*
97 - * After xdl_prepare_env() (or more precisely, due to
98 - * xdl_classify_record()), the "ha" member of the records (AKA lines)
99 - * is _not_ the hash anymore, but a linearized version of it. In
100 - * other words, the "ha" member is guaranteed to start with 0 and
101 - * the second record's ha can only be 0 or 1, etc.
102 - *
103 - * So we multiply ha by 2 in the hope that the hashing was
104 - * "unique enough".
105 - */
106 17 5 int index = (int)((record->ha << 1) % map->alloc);
107 -
108 17 5,19 while (map->entries[index].line1) {
109 11 6 other = map->env->xdf1.recs[map->entries[index].line1 - 1];
110 11 6,8 if (map->entries[index].hash != record->ha ||
111 11 7 !xdl_recmatch(record->ptr, record->size,
112 - other->ptr, other->size,
113 11 7 map->xpp->flags)) {
114 ##### 9 if (++index >= map->alloc)
115 ##### 10 index = 0;
116 ##### 11 continue;
117 - }
118 11 12 if (pass == 2)
119 7 13 map->has_matches = 1;
120 11 14,15 if (pass == 1 || map->entries[index].line2)
121 7 16 map->entries[index].line2 = NON_UNIQUE;
122 - else
123 4 17 map->entries[index].line2 = line;
124 11 18 return;
125 - }
126 6 20 if (pass == 2)
127 ##### 21 return;
128 6 22 map->entries[index].line1 = line;
129 6 22 map->entries[index].hash = record->ha;
130 6 22 map->entries[index].anchor = is_anchor(xpp, map->env->xdf1.recs[line - 1]->ptr);
131 6 23 if (!map->first)
132 1 24 map->first = map->entries + index;
133 6 25 if (map->last) {
134 5 26 map->last->next = map->entries + index;
135 5 26 map->entries[index].previous = map->last;
136 - }
137 6 27 map->last = map->entries + index;
138 6 27 map->nr++;
139 - }
140 -
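/*
 * Illustrative sketch (not part of xpatience.c): the slot computation
 * and linear probing used by insert_record() above, on made-up hash
 * values and a made-up table size.  The real code probes on to the
 * next slot only when the stored record's content differs; here any
 * occupied slot counts as a collision.
 */
#include <stdio.h>

int main(void)
{
	unsigned long alloc = 8;		/* 2 * count1 for a 4-line file1 */
	unsigned long ha[] = { 0, 1, 3, 5 };	/* "linearized" hashes */
	unsigned long used = 0;			/* bitmask of occupied slots */
	int i;

	for (i = 0; i < 4; i++) {
		unsigned long index = (ha[i] << 1) % alloc;

		while (used & (1UL << index))	/* probe, wrapping around */
			index = (index + 1) % alloc;
		used |= 1UL << index;
		printf("ha=%lu -> slot %lu\n", ha[i], index);
	}
	/* ha=5 wants slot 2, which ha=1 already took, and ends up in slot 3 */
	return 0;
}
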
141 - /*
142 - * This function has to be called for each recursion into the inter-hunk
143 - * parts, as previously non-unique lines can become unique when being
144 - * restricted to a smaller part of the files.
145 - *
146 - * It is assumed that env has been prepared using xdl_prepare().
147 - */
148 1 2 static int fill_hashmap(mmfile_t *file1, mmfile_t *file2,
149 - xpparam_t const *xpp, xdfenv_t *env,
150 - struct hashmap *result,
151 - int line1, int count1, int line2, int count2)
152 - {
153 1 2 result->file1 = file1;
154 1 2 result->file2 = file2;
155 1 2 result->xpp = xpp;
156 1 2 result->env = env;
157 -
158 - /* We know exactly how large we want the hash map */
159 1 2 result->alloc = count1 * 2;
160 1 3 result->entries = (struct entry *)
161 1 2 xdl_malloc(result->alloc * sizeof(struct entry));
162 1 3 if (!result->entries)
163 ##### 4 return -1;
164 1 5 memset(result->entries, 0, result->alloc * sizeof(struct entry));
165 -
166 - /* First, fill with entries from the first file */
167 11 5,7 while (count1--)
168 10 6 insert_record(xpp, line1++, result, 1);
169 -
170 - /* Then search for matches in the second file */
171 8 8,10 while (count2--)
172 7 9 insert_record(xpp, line2++, result, 2);
173 -
174 1 11 return 0;
175 - }
176 -
177 - /*
178 - * Find the longest sequence with a smaller last element (meaning a smaller
179 - * line2, as we construct the sequence with entries ordered by line1).
180 - */
181 4 2 static int binary_search(struct entry **sequence, int longest,
182 - struct entry *entry)
183 - {
184 4 2 int left = -1, right = longest;
185 -
186 9 2,6 while (left + 1 < right) {
187 5 3 int middle = left + (right - left) / 2;
188 - /* by construction, no two entries can be equal */
189 5 3 if (sequence[middle]->line2 > entry->line2)
190 ##### 4 right = middle;
191 - else
192 5 5 left = middle;
193 - }
194 - /* return the index in "sequence", _not_ the sequence length */
195 4 7 return left;
196 - }
197 -
198 - /*
199 - * The idea is to start with the list of common unique lines sorted by
200 - * the order in file1. For each of these pairs, the longest (partial)
201 - * sequence whose last element's line2 is smaller is determined.
202 - *
203 - * For efficiency, the sequences are kept in a list containing exactly one
204 - * item per sequence length: the sequence with the smallest last
205 - * element (in terms of line2).
206 - */
207 1 2 static struct entry *find_longest_common_sequence(struct hashmap *map)
208 - {
209 1 2 struct entry **sequence = xdl_malloc(map->nr * sizeof(struct entry *));
210 1 3 int longest = 0, i;
211 - struct entry *entry;
212 -
213 - /*
214 - * If not -1, this entry in sequence must never be overridden.
215 - * Therefore, overriding entries before this has no effect, so
216 - * do not do that either.
217 - */
218 1 3 int anchor_i = -1;
219 -
220 1 3 if (!sequence)
221 ##### 4 return NULL;
222 -
223 7 5,19,20 for (entry = map->first; entry; entry = entry->next) {
224 6 6,7 if (!entry->line2 || entry->line2 == NON_UNIQUE)
225 2 8 continue;
226 4 9 i = binary_search(sequence, longest, entry);
227 4 10-12 entry->previous = i < 0 ? NULL : sequence[i];
228 4 13 ++i;
229 4 13 if (i <= anchor_i)
230 ##### 14 continue;
231 4 15 sequence[i] = entry;
232 4 15 if (entry->anchor) {
233 ##### 16 anchor_i = i;
234 ##### 16 longest = anchor_i + 1;
235 4 17 } else if (i == longest) {
236 4 18 longest++;
237 - }
238 - }
239 -
240 - /* No common unique lines were found */
241 1 21 if (!longest) {
242 ##### 22 xdl_free(sequence);
243 ##### 23 return NULL;
244 - }
245 -
246 - /* Iterate starting at the last element, adjusting the "next" members */
247 1 24 entry = sequence[longest - 1];
248 1 24 entry->next = NULL;
249 4 24,26 while (entry->previous) {
250 3 25 entry->previous->next = entry;
251 3 25 entry = entry->previous;
252 - }
253 1 27 xdl_free(sequence);
254 1 28 return entry;
255 - }
256 -
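/*
 * Illustrative sketch (not part of xpatience.c): the patience-sorting
 * scheme behind binary_search() and find_longest_common_sequence()
 * above, applied to a plain int array standing in for the entries'
 * line2 values (already ordered by line1).  The sample values are made
 * up and the anchor handling is left out.
 */
#include <stdio.h>

int main(void)
{
	int line2[] = { 4, 1, 5, 2, 3, 6 };
	int n = 6, longest = 0, i;
	int tail[6];	/* per length: index of the smallest last element */
	int prev[6];	/* best predecessor of each element */

	for (i = 0; i < n; i++) {
		/* longest sequence whose last element is smaller than line2[i] */
		int left = -1, right = longest;
		while (left + 1 < right) {
			int middle = left + (right - left) / 2;
			if (line2[tail[middle]] > line2[i])
				right = middle;
			else
				left = middle;
		}
		prev[i] = left < 0 ? -1 : tail[left];
		tail[left + 1] = i;	/* i extends that sequence by one */
		if (left + 1 == longest)
			longest++;
	}

	/* walk the chain backwards, as the real code does via "previous" */
	printf("longest chain, last to first:");
	for (i = tail[longest - 1]; i >= 0; i = prev[i])
		printf(" %d", line2[i]);
	printf("\n");	/* prints: 6 3 2 1, i.e. the subsequence 1 2 3 6 */
	return 0;
}
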
257 2 2 static int match(struct hashmap *map, int line1, int line2)
258 - {
259 2 2 xrecord_t *record1 = map->env->xdf1.recs[line1 - 1];
260 2 2 xrecord_t *record2 = map->env->xdf2.recs[line2 - 1];
261 2 2 return xdl_recmatch(record1->ptr, record1->size,
262 2 2 record2->ptr, record2->size, map->xpp->flags);
263 - }
264 -
265 - static int patience_diff(mmfile_t *file1, mmfile_t *file2,
266 - xpparam_t const *xpp, xdfenv_t *env,
267 - int line1, int count1, int line2, int count2);
268 -
269 1 2 static int walk_common_sequence(struct hashmap *map, struct entry *first,
270 - int line1, int count1, int line2, int count2)
271 - {
272 1 2 int end1 = line1 + count1, end2 = line2 + count2;
273 - int next1, next2;
274 -
275 - for (;;) {
276 - /* Try to grow the line ranges of common lines */
277 5 3 if (first) {
278 4 4 next1 = first->line1;
279 4 4 next2 = first->line2;
280 6 4,6,7,9 while (next1 > line1 && next2 > line2 &&
281 2 8 match(map, next1 - 1, next2 - 1)) {
282 2 5 next1--;
283 2 5 next2--;
284 - }
285 - } else {
286 1 10 next1 = end1;
287 1 10 next2 = end2;
288 - }
289 5 11,13,14,16 while (line1 < next1 && line2 < next2 &&
290 ##### 15 match(map, line1, line2)) {
291 ##### 12 line1++;
292 ##### 12 line2++;
293 - }
294 -
295 - /* Recurse */
296 5 17,18 if (next1 > line1 || next2 > line2) {
297 - struct hashmap submap;
298 -
299 3 19 memset(&submap, 0, sizeof(submap));
300 3 19,20 if (patience_diff(map->file1, map->file2,
301 - map->xpp, map->env,
302 - line1, next1 - line1,
303 - line2, next2 - line2))
304 3 21,22 return -1;
305 - }
306 -
307 5 23 if (!first)
308 1 24 return 0;
309 -
310 4 25,27,28 while (first->next &&
311 3 28,29 first->next->line1 == first->line1 + 1 &&
312 ##### 29 first->next->line2 == first->line2 + 1)
313 ##### 26 first = first->next;
314 -
315 4 30 line1 = first->line1 + 1;
316 4 30 line2 = first->line2 + 1;
317 -
318 4 30 first = first->next;
319 4 30 }
320 - }
321 -
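/*
 * Illustrative sketch (not part of xpatience.c): the range-growing step
 * of walk_common_sequence() above, on made-up 1-based string arrays.
 * Around one common-unique pair ("x = 1;") the surrounding identical
 * lines are absorbed in both directions; whatever is left between the
 * grown ranges is what the recursive patience_diff() call would see.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* index 0 is a dummy so that a[k]/b[k] is line k, as in the real code */
	const char *a[] = { "", "int f(void)", "{", "old();",
			    "x = 1;", "return x;", "}" };
	const char *b[] = { "", "int f(void)", "{", "new();",
			    "x = 1;", "return x;", "}" };
	int line1 = 1, line2 = 1;	/* start of the current ranges */
	int next1 = 4, next2 = 4;	/* the common-unique pair "x = 1;" */

	/* grow backwards from the pair over identical lines */
	while (next1 > line1 && next2 > line2 &&
	       !strcmp(a[next1 - 1], b[next2 - 1])) {
		next1--;
		next2--;
	}
	/* grow forwards from the start of the ranges */
	while (line1 < next1 && line2 < next2 &&
	       !strcmp(a[line1], b[line2])) {
		line1++;
		line2++;
	}
	printf("recurse on file1 lines %d-%d and file2 lines %d-%d\n",
	       line1, next1 - 1, line2, next2 - 1);	/* 3-3 vs 3-3 */
	return 0;
}
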
322 ##### 2 static int fall_back_to_classic_diff(struct hashmap *map,
323 - int line1, int count1, int line2, int count2)
324 - {
325 - xpparam_t xpp;
326 ##### 2 xpp.flags = map->xpp->flags & ~XDF_DIFF_ALGORITHM_MASK;
327 -
328 ##### 2 return xdl_fall_back_diff(map->env, &xpp,
329 - line1, count1, line2, count2);
330 - }
331 -
332 - /*
333 - * Recursively find the longest common sequence of unique lines,
334 - * and if none was found, ask xdl_do_diff() to do the job.
335 - *
336 - * This function assumes that env was prepared with xdl_prepare_env().
337 - */
338 4 2 static int patience_diff(mmfile_t *file1, mmfile_t *file2,
339 - xpparam_t const *xpp, xdfenv_t *env,
340 - int line1, int count1, int line2, int count2)
341 - {
342 - struct hashmap map;
343 - struct entry *first;
344 4 2 int result = 0;
345 -
346 - /* trivial case: one side is empty */
347 4 2 if (!count1) {
348 2 3,5 while(count2--)
349 1 4 env->xdf2.rchg[line2++ - 1] = 1;
350 1 6 return 0;
351 3 7 } else if (!count2) {
352 6 8,10 while(count1--)
353 4 9 env->xdf1.rchg[line1++ - 1] = 1;
354 2 11 return 0;
355 - }
356 -
357 1 12 memset(&map, 0, sizeof(map));
358 1 12,13 if (fill_hashmap(file1, file2, xpp, env, &map,
359 - line1, count1, line2, count2))
360 ##### 14 return -1;
361 -
362 - /* are there any matching lines at all? */
363 1 15 if (!map.has_matches) {
364 ##### 16,18 while(count1--)
365 ##### 17 env->xdf1.rchg[line1++ - 1] = 1;
366 ##### 19,21 while(count2--)
367 ##### 20 env->xdf2.rchg[line2++ - 1] = 1;
368 ##### 22 xdl_free(map.entries);
369 ##### 23 return 0;
370 - }
371 -
372 1 24 first = find_longest_common_sequence(&map);
373 1 25 if (first)
374 1 26 result = walk_common_sequence(&map, first,
375 - line1, count1, line2, count2);
376 - else
377 ##### 27 result = fall_back_to_classic_diff(&map,
378 - line1, count1, line2, count2);
379 -
380 1 28 xdl_free(map.entries);
381 1 29 return result;
382 - }
383 -
384 1 2 int xdl_do_patience_diff(mmfile_t *file1, mmfile_t *file2,
385 - xpparam_t const *xpp, xdfenv_t *env)
386 - {
387 1 2,3 if (xdl_prepare_env(file1, file2, xpp, env) < 0)
388 ##### 4 return -1;
389 -
390 - /* environment is cleaned up in xdl_diff() */
391 1 5,5 return patience_diff(file1, file2, xpp, env,
392 1 5,5 1, env->xdf1.nrec, 1, env->xdf2.nrec);
393 - }
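
/*
 * Illustrative usage sketch (not part of xpatience.c): real callers
 * normally go through xdl_diff() with XDF_PATIENCE_DIFF set in
 * xpparam_t.flags, which routes to xdl_do_patience_diff() via
 * xdl_do_diff() and frees the environment afterwards.  The made-up
 * helper below drives the entry point above directly, just to show
 * that the result is a pair of per-line change maps.
 */
#include <stdio.h>
#include <string.h>
#include "xinclude.h"

static void show_changed(mmfile_t *file1, mmfile_t *file2)
{
	xpparam_t xpp;
	xdfenv_t env;
	long i;

	memset(&xpp, 0, sizeof(xpp));
	xpp.flags = XDF_PATIENCE_DIFF;

	if (xdl_do_patience_diff(file1, file2, &xpp, &env) < 0)
		return;

	/* rchg[i] is non-zero for lines that take part in the diff */
	for (i = 0; i < env.xdf1.nrec; i++)
		if (env.xdf1.rchg[i])
			printf("removed: file1 line %ld\n", i + 1);
	for (i = 0; i < env.xdf2.nrec; i++)
		if (env.xdf2.rchg[i])
			printf("added: file2 line %ld\n", i + 1);

	xdl_free_env(&env);	/* normally done by xdl_diff() */
}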