Line Flow Count Block(s) Source
1 - /*
2 - * LibXDiff by Davide Libenzi ( File Differential Library )
3 - * Copyright (C) 2003 Davide Libenzi
4 - *
5 - * This library is free software; you can redistribute it and/or
6 - * modify it under the terms of the GNU Lesser General Public
7 - * License as published by the Free Software Foundation; either
8 - * version 2.1 of the License, or (at your option) any later version.
9 - *
10 - * This library is distributed in the hope that it will be useful,
11 - * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 - * Lesser General Public License for more details.
14 - *
15 - * You should have received a copy of the GNU Lesser General Public
16 - * License along with this library; if not, see
17 - * <http://www.gnu.org/licenses/>.
18 - *
19 - * Davide Libenzi <davidel@xmailserver.org>
20 - *
21 - */
22 -
23 - #include "xinclude.h"
24 - #include "integer.h"
25 -
26 -
27 - #define XDL_MAX_COST_MIN 256
28 - #define XDL_HEUR_MIN_COST 256
29 - #define XDL_LINE_MAX (long)((1UL << (CHAR_BIT * sizeof(long) - 1)) - 1)
30 - #define XDL_SNAKE_CNT 20
31 - #define XDL_K_HEUR 4
32 -
33 - /** Declare a function as always inlined. */
34 - #if defined(_MSC_VER)
35 - # define XDL_INLINE(type) static __inline type
36 - #elif defined(__GNUC__)
37 - # define XDL_INLINE(type) static __inline__ type
38 - #else
39 - # define XDL_INLINE(type) static type
40 - #endif
41 -
42 - typedef struct s_xdpsplit {
43 - long i1, i2;
44 - int min_lo, min_hi;
45 - } xdpsplit_t;
46 -
47 -
48 -
49 -
50 - static long xdl_split(unsigned long const *ha1, long off1, long lim1,
51 - unsigned long const *ha2, long off2, long lim2,
52 - long *kvdf, long *kvdb, int need_min, xdpsplit_t *spl,
53 - xdalgoenv_t *xenv);
54 - static xdchange_t *xdl_add_change(xdchange_t *xscr, long i1, long i2, long chg1, long chg2);
55 -
56 -
57 -
58 -
59 -
60 - /*
61 - * See "An O(ND) Difference Algorithm and its Variations", by Eugene Myers.
62 - * Basically considers a "box" (off1, off2, lim1, lim2) and scan from both
63 - * the forward diagonal starting from (off1, off2) and the backward diagonal
64 - * starting from (lim1, lim2). If the K values on the same diagonal crosses
65 - * returns the furthest point of reach. We might end up having to expensive
66 - * cases using this algorithm is full, so a little bit of heuristic is needed
67 - * to cut the search and to return a suboptimal point.
68 - */
69 177 2 static long xdl_split(unsigned long const *ha1, long off1, long lim1,
70 - unsigned long const *ha2, long off2, long lim2,
71 - long *kvdf, long *kvdb, int need_min, xdpsplit_t *spl,
72 - xdalgoenv_t *xenv) {
73 177 2 long dmin = off1 - lim2, dmax = lim1 - off2;
74 177 2 long fmid = off1 - off2, bmid = lim1 - lim2;
75 177 2 long odd = (fmid - bmid) & 1;
76 177 2 long fmin = fmid, fmax = fmid;
77 177 2 long bmin = bmid, bmax = bmid;
78 - long ec, d, i1, i2, prev1, best, dd, v, k;
79 -
80 - /*
81 - * Set initial diagonal values for both forward and backward path.
82 - */
83 177 2 kvdf[fmid] = off1;
84 177 2 kvdb[bmid] = lim1;
85 -
86 377 2,112 for (ec = 1;; ec++) {
87 554 3 int got_snake = 0;
88 -
89 - /*
90 - * We need to extent the diagonal "domain" by one. If the next
91 - * values exits the box boundaries we need to change it in the
92 - * opposite direction because (max - min) must be a power of two.
93 - * Also we initialize the external K value to -1 so that we can
94 - * avoid extra conditions check inside the core loop.
95 - */
96 554 3 if (fmin > dmin)
97 547 4 kvdf[--fmin - 1] = -1;
98 - else
99 7 5 ++fmin;
100 554 6 if (fmax < dmax)
101 526 7 kvdf[++fmax + 1] = -1;
102 - else
103 28 8 --fmax;
104 -
105 2637 9,25,26 for (d = fmax; d >= fmin; d -= 2) {
106 2157 10 if (kvdf[d - 1] >= kvdf[d + 1])
107 565 11 i1 = kvdf[d - 1] + 1;
108 - else
109 1592 12 i1 = kvdf[d + 1];
110 2157 13 prev1 = i1;
111 2157 13 i2 = i1 - d;
112 2892 13-17 for (; i1 < lim1 && i2 < lim2 && ha1[i1] == ha2[i2]; i1++, i2++);
113 2157 18 if (i1 - prev1 > xenv->snake_cnt)
114 4 19 got_snake = 1;
115 2157 20 kvdf[d] = i1;
116 2157 20-23 if (odd && bmin <= d && d <= bmax && kvdb[d] <= i1) {
117 74 24 spl->i1 = i1;
118 74 24 spl->i2 = i2;
119 74 24 spl->min_lo = spl->min_hi = 1;
120 74 24 return ec;
121 - }
122 - }
123 -
124 - /*
125 - * We need to extent the diagonal "domain" by one. If the next
126 - * values exits the box boundaries we need to change it in the
127 - * opposite direction because (max - min) must be a power of two.
128 - * Also we initialize the external K value to -1 so that we can
129 - * avoid extra conditions check inside the core loop.
130 - */
131 480 27 if (bmin > dmin)
132 473 28 kvdb[--bmin - 1] = XDL_LINE_MAX;
133 - else
134 7 29 ++bmin;
135 480 30 if (bmax < dmax)
136 475 31 kvdb[++bmax + 1] = XDL_LINE_MAX;
137 - else
138 5 32 --bmax;
139 -
140 2155 33,49,50 for (d = bmax; d >= bmin; d -= 2) {
141 1778 34 if (kvdb[d - 1] < kvdb[d + 1])
142 1333 35 i1 = kvdb[d - 1];
143 - else
144 445 36 i1 = kvdb[d + 1] - 1;
145 1778 37 prev1 = i1;
146 1778 37 i2 = i1 - d;
147 2446 37-41 for (; i1 > off1 && i2 > off2 && ha1[i1 - 1] == ha2[i2 - 1]; i1--, i2--);
148 1778 42 if (prev1 - i1 > xenv->snake_cnt)
149 2 43 got_snake = 1;
150 1778 44 kvdb[d] = i1;
151 1778 44-47 if (!odd && fmin <= d && d <= fmax && i1 <= kvdf[d]) {
152 103 48 spl->i1 = i1;
153 103 48 spl->i2 = i2;
154 103 48 spl->min_lo = spl->min_hi = 1;
155 103 48 return ec;
156 - }
157 - }
158 -
159 377 51 if (need_min)
160 173 52 continue;
161 -
162 - /*
163 - * If the edit cost is above the heuristic trigger and if
164 - * we got a good snake, we sample current diagonals to see
165 - * if some of the, have reached an "interesting" path. Our
166 - * measure is a function of the distance from the diagonal
167 - * corner (i1 + i2) penalized with the distance from the
168 - * mid diagonal itself. If this value is above the current
169 - * edit cost times a magic factor (XDL_K_HEUR) we consider
170 - * it interesting.
171 - */
172 204 53,54 if (got_snake && ec > xenv->heur_min) {
173 ##### 55,70,71 for (best = 0, d = fmax; d >= fmin; d -= 2) {
174 ##### 56-58 dd = d > fmid ? d - fmid: fmid - d;
175 ##### 59 i1 = kvdf[d];
176 ##### 59 i2 = i1 - d;
177 ##### 59 v = (i1 - off1) + (i2 - off2) - dd;
178 -
179 ##### 59-61 if (v > XDL_K_HEUR * ec && v > best &&
180 ##### 61-63 off1 + xenv->snake_cnt <= i1 && i1 < lim1 &&
181 ##### 63,64 off2 + xenv->snake_cnt <= i2 && i2 < lim2) {
182 ##### 65,68,69 for (k = 1; ha1[i1 - k] == ha2[i2 - k]; k++)
183 ##### 66 if (k == xenv->snake_cnt) {
184 ##### 67 best = v;
185 ##### 67 spl->i1 = i1;
186 ##### 67 spl->i2 = i2;
187 ##### 67 break;
188 - }
189 - }
190 - }
191 ##### 72 if (best > 0) {
192 ##### 73 spl->min_lo = 1;
193 ##### 73 spl->min_hi = 0;
194 ##### 73 return ec;
195 - }
196 -
197 ##### 74,89,90 for (best = 0, d = bmax; d >= bmin; d -= 2) {
198 ##### 75-77 dd = d > bmid ? d - bmid: bmid - d;
199 ##### 78 i1 = kvdb[d];
200 ##### 78 i2 = i1 - d;
201 ##### 78 v = (lim1 - i1) + (lim2 - i2) - dd;
202 -
203 ##### 78-80 if (v > XDL_K_HEUR * ec && v > best &&
204 ##### 81,82 off1 < i1 && i1 <= lim1 - xenv->snake_cnt &&
205 ##### 83 off2 < i2 && i2 <= lim2 - xenv->snake_cnt) {
206 ##### 84,87,88 for (k = 0; ha1[i1 + k] == ha2[i2 + k]; k++)
207 ##### 85 if (k == xenv->snake_cnt - 1) {
208 ##### 86 best = v;
209 ##### 86 spl->i1 = i1;
210 ##### 86 spl->i2 = i2;
211 ##### 86 break;
212 - }
213 - }
214 - }
215 ##### 91 if (best > 0) {
216 ##### 92 spl->min_lo = 0;
217 ##### 92 spl->min_hi = 1;
218 ##### 92 return ec;
219 - }
220 - }
221 -
222 - /*
223 - * Enough is enough. We spent too much time here and now we collect
224 - * the furthest reaching path using the (i1 + i2) measure.
225 - */
226 204 93 if (ec >= xenv->mxcost) {
227 - long fbest, fbest1, bbest, bbest1;
228 -
229 ##### 94 fbest = fbest1 = -1;
230 ##### 94,99,100 for (d = fmax; d >= fmin; d -= 2) {
231 ##### 95 i1 = XDL_MIN(kvdf[d], lim1);
232 ##### 95 i2 = i1 - d;
233 ##### 95 if (lim2 < i2)
234 ##### 96 i1 = lim2 + d, i2 = lim2;
235 ##### 97 if (fbest < i1 + i2) {
236 ##### 98 fbest = i1 + i2;
237 ##### 98 fbest1 = i1;
238 - }
239 - }
240 -
241 ##### 101 bbest = bbest1 = XDL_LINE_MAX;
242 ##### 101,106,107 for (d = bmax; d >= bmin; d -= 2) {
243 ##### 102 i1 = XDL_MAX(off1, kvdb[d]);
244 ##### 102 i2 = i1 - d;
245 ##### 102 if (i2 < off2)
246 ##### 103 i1 = off2 + d, i2 = off2;
247 ##### 104 if (i1 + i2 < bbest) {
248 ##### 105 bbest = i1 + i2;
249 ##### 105 bbest1 = i1;
250 - }
251 - }
252 -
253 ##### 108 if ((lim1 + lim2) - bbest < fbest - (off1 + off2)) {
254 ##### 109 spl->i1 = fbest1;
255 ##### 109 spl->i2 = fbest - fbest1;
256 ##### 109 spl->min_lo = 1;
257 ##### 109 spl->min_hi = 0;
258 - } else {
259 ##### 110 spl->i1 = bbest1;
260 ##### 110 spl->i2 = bbest - bbest1;
261 ##### 110 spl->min_lo = 0;
262 ##### 110 spl->min_hi = 1;
263 - }
264 ##### 111 return ec;
265 - }
266 377 112 }
267 - }
268 -
269 -
270 - /*
271 - * Rule: "Divide et Impera". Recursively split the box in sub-boxes by calling
272 - * the box splitting function. Note that the real job (marking changed lines)
273 - * is done in the two boundary reaching checks.
274 - */
275 2476 2 int xdl_recs_cmp(diffdata_t *dd1, long off1, long lim1,
276 - diffdata_t *dd2, long off2, long lim2,
277 - long *kvdf, long *kvdb, int need_min, xdalgoenv_t *xenv) {
278 2476 2 unsigned long const *ha1 = dd1->ha, *ha2 = dd2->ha;
279 -
280 - /*
281 - * Shrink the box by walking through each diagonal snake (SW and NE).
282 - */
283 4413 2-6 for (; off1 < lim1 && off2 < lim2 && ha1[off1] == ha2[off2]; off1++, off2++);
284 3003 7-11 for (; off1 < lim1 && off2 < lim2 && ha1[lim1 - 1] == ha2[lim2 - 1]; lim1--, lim2--);
285 -
286 - /*
287 - * If one dimension is empty, then all records on the other one must
288 - * be obviously changed.
289 - */
290 2476 12 if (off1 == lim1) {
291 1882 13 char *rchg2 = dd2->rchg;
292 1882 13 long *rindex2 = dd2->rindex;
293 -
294 2485 13-16 for (; off2 < lim2; off2++)
295 603 14 rchg2[rindex2[off2]] = 1;
296 594 17 } else if (off2 == lim2) {
297 417 18 char *rchg1 = dd1->rchg;
298 417 18 long *rindex1 = dd1->rindex;
299 -
300 1082 18-21 for (; off1 < lim1; off1++)
301 665 19 rchg1[rindex1[off1]] = 1;
302 - } else {
303 - xdpsplit_t spl;
304 177 22 spl.i1 = spl.i2 = 0;
305 -
306 - /*
307 - * Divide ...
308 - */
309 177 22,23 if (xdl_split(ha1, off1, lim1, ha2, off2, lim2, kvdf, kvdb,
310 - need_min, &spl, xenv) < 0) {
311 -
312 ##### 24,31 return -1;
313 - }
314 -
315 - /*
316 - * ... et Impera.
317 - */
318 177 25,26 if (xdl_recs_cmp(dd1, off1, spl.i1, dd2, off2, spl.i2,
319 177 28 kvdf, kvdb, spl.min_lo, xenv) < 0 ||
320 177 27 xdl_recs_cmp(dd1, spl.i1, lim1, dd2, spl.i2, lim2,
321 - kvdf, kvdb, spl.min_hi, xenv) < 0) {
322 -
323 177 29,30 return -1;
324 - }
325 - }
326 -
327 2476 32 return 0;
328 - }
329 -
330 -
331 2123 2 int xdl_do_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
332 - xdfenv_t *xe) {
333 - size_t ndiags, allocsize;
334 - long *kvd, *kvdf, *kvdb;
335 - xdalgoenv_t xenv;
336 - diffdata_t dd1, dd2;
337 -
338 2123 2 if (XDF_DIFF_ALG(xpp->flags) == XDF_PATIENCE_DIFF)
339 1 3 return xdl_do_patience_diff(mf1, mf2, xpp, xe);
340 -
341 2122 4 if (XDF_DIFF_ALG(xpp->flags) == XDF_HISTOGRAM_DIFF)
342 ##### 5 return xdl_do_histogram_diff(mf1, mf2, xpp, xe);
343 -
344 2122 6,7 if (xdl_prepare_env(mf1, mf2, xpp, xe) < 0) {
345 -
346 ##### 8 return -1;
347 - }
348 -
349 - /*
350 - * Allocate and setup K vectors to be used by the differential algorithm.
351 - * One is to store the forward path and one to store the backward path.
352 - */
353 2122 9-15 GIT_ERROR_CHECK_ALLOC_ADD3(&ndiags, xe->xdf1.nreff, xe->xdf2.nreff, 3);
354 2122 16-22 GIT_ERROR_CHECK_ALLOC_MULTIPLY(&allocsize, ndiags, 2);
355 2122 23-29 GIT_ERROR_CHECK_ALLOC_ADD(&allocsize, allocsize, 2);
356 2122 30-36 GIT_ERROR_CHECK_ALLOC_MULTIPLY(&allocsize, allocsize, sizeof(long));
357 -
358 2122 37,38 if (!(kvd = (long *) xdl_malloc(allocsize))) {
359 ##### 39 xdl_free_env(xe);
360 ##### 40 return -1;
361 - }
362 2122 41 kvdf = kvd;
363 2122 41 kvdb = kvdf + ndiags;
364 2122 41 kvdf += xe->xdf2.nreff + 1;
365 2122 41 kvdb += xe->xdf2.nreff + 1;
366 -
367 2122 41 xenv.mxcost = xdl_bogosqrt(ndiags);
368 2122 42 if (xenv.mxcost < XDL_MAX_COST_MIN)
369 2122 43 xenv.mxcost = XDL_MAX_COST_MIN;
370 2122 44 xenv.snake_cnt = XDL_SNAKE_CNT;
371 2122 44 xenv.heur_min = XDL_HEUR_MIN_COST;
372 -
373 2122 44 dd1.nrec = xe->xdf1.nreff;
374 2122 44 dd1.ha = xe->xdf1.ha;
375 2122 44 dd1.rchg = xe->xdf1.rchg;
376 2122 44 dd1.rindex = xe->xdf1.rindex;
377 2122 44 dd2.nrec = xe->xdf2.nreff;
378 2122 44 dd2.ha = xe->xdf2.ha;
379 2122 44 dd2.rchg = xe->xdf2.rchg;
380 2122 44 dd2.rindex = xe->xdf2.rindex;
381 -
382 2122 44,45 if (xdl_recs_cmp(&dd1, 0, dd1.nrec, &dd2, 0, dd2.nrec,
383 2122 44 kvdf, kvdb, (xpp->flags & XDF_NEED_MINIMAL) != 0, &xenv) < 0) {
384 -
385 ##### 46 xdl_free(kvd);
386 ##### 47 xdl_free_env(xe);
387 ##### 48 return -1;
388 - }
389 -
390 2122 49 xdl_free(kvd);
391 -
392 2122 50 return 0;
393 - }
394 -
395 -
396 2570 2 static xdchange_t *xdl_add_change(xdchange_t *xscr, long i1, long i2, long chg1, long chg2) {
397 - xdchange_t *xch;
398 -
399 2570 2,3 if (!(xch = (xdchange_t *) xdl_malloc(sizeof(xdchange_t))))
400 ##### 4 return NULL;
401 -
402 2570 5 xch->next = xscr;
403 2570 5 xch->i1 = i1;
404 2570 5 xch->i2 = i2;
405 2570 5 xch->chg1 = chg1;
406 2570 5 xch->chg2 = chg2;
407 2570 5 xch->ignore = 0;
408 -
409 2570 5 return xch;
410 - }
411 -
412 -
413 8817 2 static int recs_match(xrecord_t *rec1, xrecord_t *rec2, long flags)
414 - {
415 8817 2,4 return (rec1->ha == rec2->ha &&
416 5678 3 xdl_recmatch(rec1->ptr, rec1->size,
417 - rec2->ptr, rec2->size,
418 - flags));
419 - }
420 -
421 - /*
422 - * If a line is indented more than this, get_indent() just returns this value.
423 - * This avoids having to do absurd amounts of work for data that are not
424 - * human-readable text, and also ensures that the output of get_indent fits within
425 - * an int.
426 - */
427 - #define MAX_INDENT 200
428 -
429 - /*
430 - * Return the amount of indentation of the specified line, treating TAB as 8
431 - * columns. Return -1 if line is empty or contains only whitespace. Clamp the
432 - * output value at MAX_INDENT.
433 - */
434 ##### 2 static int get_indent(xrecord_t *rec)
435 - {
436 - long i;
437 ##### 2 int ret = 0;
438 -
439 ##### 2,12,13 for (i = 0; i < rec->size; i++) {
440 ##### 3 char c = rec->ptr[i];
441 -
442 ##### 3,4 if (!XDL_ISSPACE(c))
443 ##### 5 return ret;
444 ##### 6 else if (c == ' ')
445 ##### 7 ret += 1;
446 ##### 8 else if (c == '\t')
447 ##### 9 ret += 8 - ret % 8;
448 - /* ignore other whitespace characters */
449 -
450 ##### 10 if (ret >= MAX_INDENT)
451 ##### 11 return MAX_INDENT;
452 - }
453 -
454 - /* The line contains only whitespace. */
455 ##### 14 return -1;
456 - }
457 -
458 - /*
459 - * If more than this number of consecutive blank rows are found, just return this
460 - * value. This avoids requiring O(N^2) work for pathological cases, and also
461 - * ensures that the output of score_split fits in an int.
462 - */
463 - #define MAX_BLANKS 20
464 -
465 - /* Characteristics measured about a hypothetical split position. */
466 - struct split_measurement {
467 - /*
468 - * Is the split at the end of the file (aside from any blank lines)?
469 - */
470 - int end_of_file;
471 -
472 - /*
473 - * How much is the line immediately following the split indented (or -1 if
474 - * the line is blank):
475 - */
476 - int indent;
477 -
478 - /*
479 - * How many consecutive lines above the split are blank?
480 - */
481 - int pre_blank;
482 -
483 - /*
484 - * How much is the nearest non-blank line above the split indented (or -1
485 - * if there is no such line)?
486 - */
487 - int pre_indent;
488 -
489 - /*
490 - * How many lines after the line following the split are blank?
491 - */
492 - int post_blank;
493 -
494 - /*
495 - * How much is the nearest non-blank line after the line following the
496 - * split indented (or -1 if there is no such line)?
497 - */
498 - int post_indent;
499 - };
500 -
501 - struct split_score {
502 - /* The effective indent of this split (smaller is preferred). */
503 - int effective_indent;
504 -
505 - /* Penalty for this split (smaller is preferred). */
506 - int penalty;
507 - };
508 -
509 - /*
510 - * Fill m with information about a hypothetical split of xdf above line split.
511 - */
512 ##### 2 static void measure_split(const xdfile_t *xdf, long split,
513 - struct split_measurement *m)
514 - {
515 - long i;
516 -
517 ##### 2 if (split >= xdf->nrec) {
518 ##### 3 m->end_of_file = 1;
519 ##### 3 m->indent = -1;
520 - } else {
521 ##### 4 m->end_of_file = 0;
522 ##### 4,5 m->indent = get_indent(xdf->recs[split]);
523 - }
524 -
525 ##### 6 m->pre_blank = 0;
526 ##### 6 m->pre_indent = -1;
527 ##### 6,12,13 for (i = split - 1; i >= 0; i--) {
528 ##### 7 m->pre_indent = get_indent(xdf->recs[i]);
529 ##### 8 if (m->pre_indent != -1)
530 ##### 9 break;
531 ##### 10 m->pre_blank += 1;
532 ##### 10 if (m->pre_blank == MAX_BLANKS) {
533 ##### 11 m->pre_indent = 0;
534 ##### 11 break;
535 - }
536 - }
537 -
538 ##### 14 m->post_blank = 0;
539 ##### 14 m->post_indent = -1;
540 ##### 14,20,21 for (i = split + 1; i < xdf->nrec; i++) {
541 ##### 15 m->post_indent = get_indent(xdf->recs[i]);
542 ##### 16 if (m->post_indent != -1)
543 ##### 17 break;
544 ##### 18 m->post_blank += 1;
545 ##### 18 if (m->post_blank == MAX_BLANKS) {
546 ##### 19 m->post_indent = 0;
547 ##### 19 break;
548 - }
549 - }
550 ##### 22 }
551 -
552 - /*
553 - * The empirically-determined weight factors used by score_split() below.
554 - * Larger values means that the position is a less favorable place to split.
555 - *
556 - * Note that scores are only ever compared against each other, so multiplying
557 - * all of these weight/penalty values by the same factor wouldn't change the
558 - * heuristic's behavior. Still, we need to set that arbitrary scale *somehow*.
559 - * In practice, these numbers are chosen to be large enough that they can be
560 - * adjusted relative to each other with sufficient precision despite using
561 - * integer math.
562 - */
563 -
564 - /* Penalty if there are no non-blank lines before the split */
565 - #define START_OF_FILE_PENALTY 1
566 -
567 - /* Penalty if there are no non-blank lines after the split */
568 - #define END_OF_FILE_PENALTY 21
569 -
570 - /* Multiplier for the number of blank lines around the split */
571 - #define TOTAL_BLANK_WEIGHT (-30)
572 -
573 - /* Multiplier for the number of blank lines after the split */
574 - #define POST_BLANK_WEIGHT 6
575 -
576 - /*
577 - * Penalties applied if the line is indented more than its predecessor
578 - */
579 - #define RELATIVE_INDENT_PENALTY (-4)
580 - #define RELATIVE_INDENT_WITH_BLANK_PENALTY 10
581 -
582 - /*
583 - * Penalties applied if the line is indented less than both its predecessor and
584 - * its successor
585 - */
586 - #define RELATIVE_OUTDENT_PENALTY 24
587 - #define RELATIVE_OUTDENT_WITH_BLANK_PENALTY 17
588 -
589 - /*
590 - * Penalties applied if the line is indented less than its predecessor but not
591 - * less than its successor
592 - */
593 - #define RELATIVE_DEDENT_PENALTY 23
594 - #define RELATIVE_DEDENT_WITH_BLANK_PENALTY 17
595 -
596 - /*
597 - * We only consider whether the sum of the effective indents for splits are
598 - * less than (-1), equal to (0), or greater than (+1) each other. The resulting
599 - * value is multiplied by the following weight and combined with the penalty to
600 - * determine the better of two scores.
601 - */
602 - #define INDENT_WEIGHT 60
603 -
604 - /*
605 - * Compute a badness score for the hypothetical split whose measurements are
606 - * stored in m. The weight factors were determined empirically using the tools and
607 - * corpus described in
608 - *
609 - * https://github.com/mhagger/diff-slider-tools
610 - *
611 - * Also see that project if you want to improve the weights based on, for example,
612 - * a larger or more diverse corpus.
613 - */
614 ##### 2 static void score_add_split(const struct split_measurement *m, struct split_score *s)
615 - {
616 - /*
617 - * A place to accumulate penalty factors (positive makes this index more
618 - * favored):
619 - */
620 - int post_blank, total_blank, indent, any_blanks;
621 -
622 ##### 2,3 if (m->pre_indent == -1 && m->pre_blank == 0)
623 ##### 4 s->penalty += START_OF_FILE_PENALTY;
624 -
625 ##### 5 if (m->end_of_file)
626 ##### 6 s->penalty += END_OF_FILE_PENALTY;
627 -
628 - /*
629 - * Set post_blank to the number of blank lines following the split,
630 - * including the line immediately after the split:
631 - */
632 ##### 7-9 post_blank = (m->indent == -1) ? 1 + m->post_blank : 0;
633 ##### 10 total_blank = m->pre_blank + post_blank;
634 -
635 - /* Penalties based on nearby blank lines: */
636 ##### 10 s->penalty += TOTAL_BLANK_WEIGHT * total_blank;
637 ##### 10 s->penalty += POST_BLANK_WEIGHT * post_blank;
638 -
639 ##### 10 if (m->indent != -1)
640 ##### 11 indent = m->indent;
641 - else
642 ##### 12 indent = m->post_indent;
643 -
644 ##### 13 any_blanks = (total_blank != 0);
645 -
646 - /* Note that the effective indent is -1 at the end of the file: */
647 ##### 13 s->effective_indent += indent;
648 -
649 ##### 13 if (indent == -1) {
650 - /* No additional adjustments needed. */
651 ##### 14 } else if (m->pre_indent == -1) {
652 - /* No additional adjustments needed. */
653 ##### 15 } else if (indent > m->pre_indent) {
654 - /*
655 - * The line is indented more than its predecessor.
656 - */
657 ##### 16,19 s->penalty += any_blanks ?
658 ##### 16-18 RELATIVE_INDENT_WITH_BLANK_PENALTY :
659 - RELATIVE_INDENT_PENALTY;
660 ##### 20 } else if (indent == m->pre_indent) {
661 - /*
662 - * The line has the same indentation level as its predecessor.
663 - * No additional adjustments needed.
664 - */
665 - } else {
666 - /*
667 - * The line is indented less than its predecessor. It could be
668 - * the block terminator of the previous block, but it could
669 - * also be the start of a new block (e.g., an "else" block, or
670 - * maybe the previous block didn't have a block terminator).
671 - * Try to distinguish those cases based on what comes next:
672 - */
673 ##### 21,22 if (m->post_indent != -1 && m->post_indent > indent) {
674 - /*
675 - * The following line is indented more. So it is likely
676 - * that this line is the start of a block.
677 - */
678 ##### 23,26 s->penalty += any_blanks ?
679 ##### 23-25 RELATIVE_OUTDENT_WITH_BLANK_PENALTY :
680 - RELATIVE_OUTDENT_PENALTY;
681 - } else {
682 - /*
683 - * That was probably the end of a block.
684 - */
685 ##### 27 s->penalty += any_blanks ?
686 ##### 27-29 RELATIVE_DEDENT_WITH_BLANK_PENALTY :
687 - RELATIVE_DEDENT_PENALTY;
688 - }
689 - }
690 ##### 31 }
691 -
692 ##### 2 static int score_cmp(struct split_score *s1, struct split_score *s2)
693 - {
694 - /* -1 if s1.effective_indent < s2->effective_indent, etc. */
695 ##### 2 int cmp_indents = ((s1->effective_indent > s2->effective_indent) -
696 ##### 2 (s1->effective_indent < s2->effective_indent));
697 -
698 ##### 2 return INDENT_WEIGHT * cmp_indents + (s1->penalty - s2->penalty);
699 - }
700 -
701 - /*
702 - * Represent a group of changed lines in an xdfile_t (i.e., a contiguous group
703 - * of lines that was inserted or deleted from the corresponding version of the
704 - * file). We consider there to be such a group at the beginning of the file, at
705 - * the end of the file, and between any two unchanged lines, though most such
706 - * groups will usually be empty.
707 - *
708 - * If the first line in a group is equal to the line following the group, then
709 - * the group can be slid down. Similarly, if the last line in a group is equal
710 - * to the line preceding the group, then the group can be slid up. See
711 - * group_slide_down() and group_slide_up().
712 - *
713 - * Note that loops that are testing for changed lines in xdf->rchg do not need
714 - * index bounding since the array is prepared with a zero at position -1 and N.
715 - */
716 - struct xdlgroup {
717 - /*
718 - * The index of the first changed line in the group, or the index of
719 - * the unchanged line above which the (empty) group is located.
720 - */
721 - long start;
722 -
723 - /*
724 - * The index of the first unchanged line after the group. For an empty
725 - * group, end is equal to start.
726 - */
727 - long end;
728 - };
729 -
730 - /*
731 - * Initialize g to point at the first group in xdf.
732 - */
733 8492 2 static void group_init(xdfile_t *xdf, struct xdlgroup *g)
734 - {
735 8492 2 g->start = g->end = 0;
736 23515 2,4 while (xdf->rchg[g->end])
737 15023 3 g->end++;
738 8492 5 }
739 -
740 - /*
741 - * Move g to describe the next (possibly empty) group in xdf and return 0. If g
742 - * is already at the end of the file, do nothing and return -1.
743 - */
744 332691 2 XDL_INLINE(int) group_next(xdfile_t *xdf, struct xdlgroup *g)
745 - {
746 332691 2 if (g->end == xdf->nrec)
747 8492 3 return -1;
748 -
749 324199 4 g->start = g->end + 1;
750 332648 4-6 for (g->end = g->start; xdf->rchg[g->end]; g->end++)
751 - ;
752 -
753 324199 7 return 0;
754 - }
755 -
756 - /*
757 - * Move g to describe the previous (possibly empty) group in xdf and return 0.
758 - * If g is already at the beginning of the file, do nothing and return -1.
759 - */
760 2823 2 XDL_INLINE(int) group_previous(xdfile_t *xdf, struct xdlgroup *g)
761 - {
762 2823 2 if (g->start == 0)
763 ##### 3 return -1;
764 -
765 2823 4 g->end = g->start - 1;
766 3201 4-6 for (g->start = g->end; xdf->rchg[g->start - 1]; g->start--)
767 - ;
768 -
769 2823 7 return 0;
770 - }
771 -
772 - /*
773 - * If g can be slid toward the end of the file, do so, and if it bumps into a
774 - * following group, expand this group to include it. Return 0 on success or -1
775 - * if g cannot be slid down.
776 - */
777 7000 2 static int group_slide_down(xdfile_t *xdf, struct xdlgroup *g, long flags)
778 - {
779 7000 2,4 if (g->end < xdf->nrec &&
780 4353 3 recs_match(xdf->recs[g->start], xdf->recs[g->end], flags)) {
781 2855 5 xdf->rchg[g->start++] = 0;
782 2855 5 xdf->rchg[g->end++] = 1;
783 -
784 2864 5,7 while (xdf->rchg[g->end])
785 9 6 g->end++;
786 -
787 2855 8 return 0;
788 - } else {
789 4145 9 return -1;
790 - }
791 - }
792 -
793 - /*
794 - * If g can be slid toward the beginning of the file, do so, and if it bumps
795 - * into a previous group, expand this group to include it. Return 0 on success
796 - * or -1 if g cannot be slid up.
797 - */
798 6968 2 static int group_slide_up(xdfile_t *xdf, struct xdlgroup *g, long flags)
799 - {
800 6968 2,4 if (g->start > 0 &&
801 4464 3 recs_match(xdf->recs[g->start - 1], xdf->recs[g->end - 1], flags)) {
802 2823 5 xdf->rchg[--g->start] = 1;
803 2823 5 xdf->rchg[--g->end] = 0;
804 -
805 2961 5,7 while (xdf->rchg[g->start - 1])
806 138 6 g->start--;
807 -
808 2823 8 return 0;
809 - } else {
810 4145 9 return -1;
811 - }
812 - }
813 -
814 ##### 2 static void xdl_bug(const char *msg)
815 - {
816 ##### 2 fprintf(stderr, "BUG: %s\n", msg);
817 ##### 3 exit(1);
818 - }
819 -
820 - /*
821 - * Move back and forward change groups for a consistent and pretty diff output.
822 - * This also helps in finding joinable change groups and reducing the diff
823 - * size.
824 - */
825 4246 2 int xdl_change_compact(xdfile_t *xdf, xdfile_t *xdfo, long flags) {
826 - struct xdlgroup g, go;
827 - long earliest_end, end_matching_other;
828 - long groupsize;
829 -
830 4246 2 group_init(xdf, &g);
831 4246 3 group_init(xdfo, &go);
832 -
833 - while (1) {
834 - /* If the group is empty in the to-be-compacted file, skip it: */
835 164918 4 if (g.end == g.start)
836 160804 5 goto next;
837 -
838 - /*
839 - * Now shift the change up and then down as far as possible in
840 - * each direction. If it bumps into any other changes, merge them.
841 - */
842 - do {
843 4145 6 groupsize = g.end - g.start;
844 -
845 - /*
846 - * Keep track of the last "end" index that causes this
847 - * group to align with a group of changed lines in the
848 - * other file. -1 indicates that we haven't found such
849 - * a match yet:
850 - */
851 4145 6 end_matching_other = -1;
852 -
853 - /* Shift the group backward as much as possible: */
854 5918 6,10,11 while (!group_slide_up(xdf, &g, flags))
855 1773 7,8 if (group_previous(xdfo, &go))
856 ##### 9 xdl_bug("group sync broken sliding up");
857 -
858 - /*
859 - * This is this highest that this group can be shifted.
860 - * Record its end index:
861 - */
862 4145 12 earliest_end = g.end;
863 -
864 4145 12 if (go.end > go.start)
865 2910 13 end_matching_other = g.end;
866 -
867 - /* Now shift the group forward as far as possible: */
868 - while (1) {
869 7000 14,15 if (group_slide_down(xdf, &g, flags))
870 4145 16 break;
871 2855 17,18 if (group_next(xdfo, &go))
872 ##### 19 xdl_bug("group sync broken sliding down");
873 -
874 2855 20 if (go.end > go.start)
875 232 21 end_matching_other = g.end;
876 2855 22 }
877 4145 23 } while (groupsize != g.end - g.start);
878 -
879 - /*
880 - * If the group can be shifted, then we can possibly use this
881 - * freedom to produce a more intuitive diff.
882 - *
883 - * The group is currently shifted as far down as possible, so the
884 - * heuristics below only have to handle upwards shifts.
885 - */
886 -
887 4114 24 if (g.end == earliest_end) {
888 - /* no shifting was possible */
889 606 25 } else if (end_matching_other != -1) {
890 - /*
891 - * Move the possibly merged group of changes back to line
892 - * up with the last group of changes from the other file
893 - * that it can align with.
894 - */
895 1363 26,33 while (go.end == go.start) {
896 1050 27,28 if (group_slide_up(xdf, &g, flags))
897 ##### 29 xdl_bug("match disappeared");
898 1050 30,31 if (group_previous(xdfo, &go))
899 ##### 32 xdl_bug("group sync broken sliding to match");
900 - }
901 293 34 } else if (flags & XDF_INDENT_HEURISTIC) {
902 - /*
903 - * Indent heuristic: a group of pure add/delete lines
904 - * implies two splits, one between the end of the "before"
905 - * context and the start of the group, and another between
906 - * the end of the group and the beginning of the "after"
907 - * context. Some splits are aesthetically better and some
908 - * are worse. We compute a badness "score" for each split,
909 - * and add the scores for the two splits to define a
910 - * "score" for each position that the group can be shifted
911 - * to. Then we pick the shift with the lowest score.
912 - */
913 ##### 35 long shift, best_shift = -1;
914 - struct split_score best_score;
915 -
916 ##### 35,44,45 for (shift = earliest_end; shift <= g.end; shift++) {
917 - struct split_measurement m;
918 ##### 36 struct split_score score = {0, 0};
919 -
920 ##### 36 measure_split(xdf, shift, &m);
921 ##### 37 score_add_split(&m, &score);
922 ##### 38 measure_split(xdf, shift - groupsize, &m);
923 ##### 39 score_add_split(&m, &score);
924 ##### 40,42 if (best_shift == -1 ||
925 ##### 41 score_cmp(&score, &best_score) <= 0) {
926 ##### 43 best_score.effective_indent = score.effective_indent;
927 ##### 43 best_score.penalty = score.penalty;
928 ##### 43 best_shift = shift;
929 - }
930 - }
931 -
932 ##### 46,53,54 while (g.end > best_shift) {
933 ##### 47,48 if (group_slide_up(xdf, &g, flags))
934 ##### 49 xdl_bug("best shift unreached");
935 ##### 50,51 if (group_previous(xdfo, &go))
936 ##### 52 xdl_bug("group sync broken sliding to blank line");
937 - }
938 - }
939 -
940 - next:
941 - /* Move past the just-processed group: */
942 164918 55,56 if (group_next(xdf, &g))
943 4246 57 break;
944 160672 58,59 if (group_next(xdfo, &go))
945 ##### 60 xdl_bug("group sync broken moving to next group");
946 160672 61 }
947 -
948 4246 62,63 if (!group_next(xdfo, &go))
949 ##### 64 xdl_bug("group sync broken at end of file");
950 -
951 4246 65 return 0;
952 - }
953 -
954 -
955 2123 2 int xdl_build_script(xdfenv_t *xe, xdchange_t **xscr) {
956 2123 2 xdchange_t *cscr = NULL, *xch;
957 2123 2 char *rchg1 = xe->xdf1.rchg, *rchg2 = xe->xdf2.rchg;
958 - long i1, i2, l1, l2;
959 -
960 - /*
961 - * Trivial. Collects "groups" of changes and creates an edit script.
962 - */
963 84598 2,16-18 for (i1 = xe->xdf1.nrec, i2 = xe->xdf2.nrec; i1 >= 0 || i2 >= 0; i1--, i2--)
964 82475 3,4 if (rchg1[i1 - 1] || rchg2[i2 - 1]) {
965 7938 5-7 for (l1 = i1; rchg1[i1 - 1]; i1--);
966 8784 8-10 for (l2 = i2; rchg2[i2 - 1]; i2--);
967 -
968 2570 11,12 if (!(xch = xdl_add_change(cscr, i1, i2, l1 - i1, l2 - i2))) {
969 ##### 13 xdl_free_script(cscr);
970 ##### 14 return -1;
971 - }
972 2570 15 cscr = xch;
973 - }
974 -
975 2123 19 *xscr = cscr;
976 -
977 2123 19 return 0;
978 - }
979 -
980 -
981 2120 2 void xdl_free_script(xdchange_t *xscr) {
982 - xdchange_t *xch;
983 -
984 4690 2,4 while ((xch = xscr) != NULL) {
985 2570 3 xscr = xscr->next;
986 2570 3 xdl_free(xch);
987 - }
988 2120 5 }
989 -
990 188 2 static int xdl_call_hunk_func(xdfenv_t *xe, xdchange_t *xscr, xdemitcb_t *ecb,
991 - xdemitconf_t const *xecfg)
992 - {
993 - xdchange_t *xch, *xche;
994 -
995 - (void)xe;
996 -
997 404 2,9,10 for (xch = xscr; xch; xch = xche->next) {
998 216 3 xche = xdl_get_hunk(&xch, xecfg);
999 216 4 if (!xch)
1000 ##### 5 break;
1001 216 6,6,6,7 if (xecfg->hunk_func(xch->i1, xche->i1 + xche->chg1 - xch->i1,
1002 216 6,6 xch->i2, xche->i2 + xche->chg2 - xch->i2,
1003 - ecb->priv) < 0)
1004 ##### 8 return -1;
1005 - }
1006 188 11 return 0;
1007 - }
1008 -
1009 ##### 2 static void xdl_mark_ignorable(xdchange_t *xscr, xdfenv_t *xe, long flags)
1010 - {
1011 - xdchange_t *xch;
1012 -
1013 ##### 2,13,14 for (xch = xscr; xch; xch = xch->next) {
1014 ##### 3 int ignore = 1;
1015 - xrecord_t **rec;
1016 - long i;
1017 -
1018 ##### 3 rec = &xe->xdf1.recs[xch->i1];
1019 ##### 3,5-7 for (i = 0; i < xch->chg1 && ignore; i++)
1020 ##### 4 ignore = xdl_blankline(rec[i]->ptr, rec[i]->size, flags);
1021 -
1022 ##### 8 rec = &xe->xdf2.recs[xch->i2];
1023 ##### 8,10-12 for (i = 0; i < xch->chg2 && ignore; i++)
1024 ##### 9 ignore = xdl_blankline(rec[i]->ptr, rec[i]->size, flags);
1025 -
1026 ##### 13 xch->ignore = ignore;
1027 - }
1028 ##### 15 }
1029 -
1030 1020 2 int xdl_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
1031 - xdemitconf_t const *xecfg, xdemitcb_t *ecb) {
1032 - xdchange_t *xscr;
1033 - xdfenv_t xe;
1034 1020 2-4 emit_func_t ef = xecfg->hunk_func ? xdl_call_hunk_func : xdl_emit_diff;
1035 -
1036 1020 5,6 if (xdl_do_diff(mf1, mf2, xpp, &xe) < 0) {
1037 -
1038 ##### 7 return -1;
1039 - }
1040 1020 8,9,11 if (xdl_change_compact(&xe.xdf1, &xe.xdf2, xpp->flags) < 0 ||
1041 1020 10,13 xdl_change_compact(&xe.xdf2, &xe.xdf1, xpp->flags) < 0 ||
1042 1020 12 xdl_build_script(&xe, &xscr) < 0) {
1043 -
1044 ##### 14 xdl_free_env(&xe);
1045 ##### 15 return -1;
1046 - }
1047 1020 16 if (xscr) {
1048 1017 17 if (xpp->flags & XDF_IGNORE_BLANK_LINES)
1049 ##### 18 xdl_mark_ignorable(xscr, &xe, xpp->flags);
1050 -
1051 1017 19,20 if (ef(&xe, xscr, ecb, xecfg) < 0) {
1052 -
1053 ##### 21 xdl_free_script(xscr);
1054 ##### 22 xdl_free_env(&xe);
1055 ##### 23 return -1;
1056 - }
1057 1017 24 xdl_free_script(xscr);
1058 - }
1059 1020 25 xdl_free_env(&xe);
1060 -
1061 1020 26 return 0;
1062 - }